The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 6ded703c56c21bfb259725d4f1831a5feb563e9b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167812503219822(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
6ded703c56c2 ("brd: check for REQ_NOWAIT and set correct page allocation mask")
db0ccc44a20b ("brd: return 0/-error from brd_insert_page()")
ba91fd01aad2 ("block/brd: Use the enum req_op type")
3e08773c3841 ("block: switch polling to be bio based")
6ce913fe3eee ("block: rename REQ_HIPRI to REQ_POLLED")
d729cf9acb93 ("io_uring: don't sleep when polling for I/O")
ef99b2d37666 ("block: replace the spin argument to blk_iopoll with a flags argument")
28a1ae6b9dab ("blk-mq: remove blk_qc_t_valid")
efbabbe121f9 ("blk-mq: remove blk_qc_t_to_tag and blk_qc_t_is_internal")
c6699d6fe0ff ("blk-mq: factor out a "classic" poll helper")
f70299f0d58e ("blk-mq: factor out a blk_qc_to_hctx helper")
71fc3f5e2c00 ("block: don't try to poll multi-bio I/Os in __blkdev_direct_IO")
0f38d7664615 ("blk-mq: cleanup blk_mq_submit_bio")
47c122e35d7e ("block: pre-allocate requests if plug is started and is a batch")
24b83deb29b7 ("block: move struct request to blk-mq.h")
badf7f643787 ("block: move a few merge helpers out of <linux/blkdev.h>")
2e9bc3465ac5 ("block: move elevator.h to block/")
9778ac77c202 ("block: remove the struct blk_queue_ctx forward declaration")
90138237a562 ("block: remove the unused blk_queue_state enum")
cc9c884dd7f4 ("block: call submit_bio_checks under q_usage_counter")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6ded703c56c21bfb259725d4f1831a5feb563e9b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Thu, 16 Feb 2023 08:01:08 -0700
Subject: [PATCH] brd: check for REQ_NOWAIT and set correct page allocation
mask
If REQ_NOWAIT is set, then do a non-blocking allocation if the operation
is a write and we need to insert a new page. Currently REQ_NOWAIT cannot
be set as the queue isn't marked as supporting nowait, this change is in
preparation for allowing that.
radix_tree_preload() warns on attempting to call it with an allocation
mask that doesn't allow blocking. While that warning could arguably
be removed, we need to handle radix insertion failures anyway as they
are more likely if we cannot block to get memory.
Remove legacy BUG_ON()'s and turn them into proper errors instead, one
for the allocation failure and one for finding a page that doesn't
match the correct index.
Cc: stable(a)vger.kernel.org # 5.10+
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 15a148d5aad9..00f3c5b51a01 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -80,26 +80,21 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
/*
* Insert a new page for a given sector, if one does not already exist.
*/
-static int brd_insert_page(struct brd_device *brd, sector_t sector)
+static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
{
pgoff_t idx;
struct page *page;
- gfp_t gfp_flags;
+ int ret = 0;
page = brd_lookup_page(brd, sector);
if (page)
return 0;
- /*
- * Must use NOIO because we don't want to recurse back into the
- * block or filesystem layers from page reclaim.
- */
- gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
- page = alloc_page(gfp_flags);
+ page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
if (!page)
return -ENOMEM;
- if (radix_tree_preload(GFP_NOIO)) {
+ if (gfpflags_allow_blocking(gfp) && radix_tree_preload(gfp)) {
__free_page(page);
return -ENOMEM;
}
@@ -110,15 +105,17 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector)
if (radix_tree_insert(&brd->brd_pages, idx, page)) {
__free_page(page);
page = radix_tree_lookup(&brd->brd_pages, idx);
- BUG_ON(!page);
- BUG_ON(page->index != idx);
+ if (!page)
+ ret = -ENOMEM;
+ else if (page->index != idx)
+ ret = -EIO;
} else {
brd->brd_nr_pages++;
}
spin_unlock(&brd->brd_lock);
radix_tree_preload_end();
- return 0;
+ return ret;
}
/*
@@ -167,19 +164,20 @@ static void brd_free_pages(struct brd_device *brd)
/*
* copy_to_brd_setup must be called before copy_to_brd. It may sleep.
*/
-static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
+static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
+ gfp_t gfp)
{
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
size_t copy;
int ret;
copy = min_t(size_t, n, PAGE_SIZE - offset);
- ret = brd_insert_page(brd, sector);
+ ret = brd_insert_page(brd, sector, gfp);
if (ret)
return ret;
if (copy < n) {
sector += copy >> SECTOR_SHIFT;
- ret = brd_insert_page(brd, sector);
+ ret = brd_insert_page(brd, sector, gfp);
}
return ret;
}
@@ -254,20 +252,26 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
* Process a single bvec of a bio.
*/
static int brd_do_bvec(struct brd_device *brd, struct page *page,
- unsigned int len, unsigned int off, enum req_op op,
+ unsigned int len, unsigned int off, blk_opf_t opf,
sector_t sector)
{
void *mem;
int err = 0;
- if (op_is_write(op)) {
- err = copy_to_brd_setup(brd, sector, len);
+ if (op_is_write(opf)) {
+ /*
+ * Must use NOIO because we don't want to recurse back into the
+ * block or filesystem layers from page reclaim.
+ */
+ gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
+
+ err = copy_to_brd_setup(brd, sector, len, gfp);
if (err)
goto out;
}
mem = kmap_atomic(page);
- if (!op_is_write(op)) {
+ if (!op_is_write(opf)) {
copy_from_brd(mem + off, brd, sector, len);
flush_dcache_page(page);
} else {
@@ -296,8 +300,12 @@ static void brd_submit_bio(struct bio *bio)
(len & (SECTOR_SIZE - 1)));
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
- bio_op(bio), sector);
+ bio->bi_opf, sector);
if (err) {
+ if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
+ bio_wouldblock_error(bio);
+ return;
+ }
bio_io_error(bio);
return;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 6ded703c56c21bfb259725d4f1831a5feb563e9b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678125031200144(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
6ded703c56c2 ("brd: check for REQ_NOWAIT and set correct page allocation mask")
db0ccc44a20b ("brd: return 0/-error from brd_insert_page()")
ba91fd01aad2 ("block/brd: Use the enum req_op type")
3e08773c3841 ("block: switch polling to be bio based")
6ce913fe3eee ("block: rename REQ_HIPRI to REQ_POLLED")
d729cf9acb93 ("io_uring: don't sleep when polling for I/O")
ef99b2d37666 ("block: replace the spin argument to blk_iopoll with a flags argument")
28a1ae6b9dab ("blk-mq: remove blk_qc_t_valid")
efbabbe121f9 ("blk-mq: remove blk_qc_t_to_tag and blk_qc_t_is_internal")
c6699d6fe0ff ("blk-mq: factor out a "classic" poll helper")
f70299f0d58e ("blk-mq: factor out a blk_qc_to_hctx helper")
71fc3f5e2c00 ("block: don't try to poll multi-bio I/Os in __blkdev_direct_IO")
0f38d7664615 ("blk-mq: cleanup blk_mq_submit_bio")
47c122e35d7e ("block: pre-allocate requests if plug is started and is a batch")
24b83deb29b7 ("block: move struct request to blk-mq.h")
badf7f643787 ("block: move a few merge helpers out of <linux/blkdev.h>")
2e9bc3465ac5 ("block: move elevator.h to block/")
9778ac77c202 ("block: remove the struct blk_queue_ctx forward declaration")
90138237a562 ("block: remove the unused blk_queue_state enum")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6ded703c56c21bfb259725d4f1831a5feb563e9b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Thu, 16 Feb 2023 08:01:08 -0700
Subject: [PATCH] brd: check for REQ_NOWAIT and set correct page allocation
mask
If REQ_NOWAIT is set, then do a non-blocking allocation if the operation
is a write and we need to insert a new page. Currently REQ_NOWAIT cannot
be set as the queue isn't marked as supporting nowait, this change is in
preparation for allowing that.
radix_tree_preload() warns on attempting to call it with an allocation
mask that doesn't allow blocking. While that warning could arguably
be removed, we need to handle radix insertion failures anyway as they
are more likely if we cannot block to get memory.
Remove legacy BUG_ON()'s and turn them into proper errors instead, one
for the allocation failure and one for finding a page that doesn't
match the correct index.
Cc: stable(a)vger.kernel.org # 5.10+
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 15a148d5aad9..00f3c5b51a01 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -80,26 +80,21 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
/*
* Insert a new page for a given sector, if one does not already exist.
*/
-static int brd_insert_page(struct brd_device *brd, sector_t sector)
+static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
{
pgoff_t idx;
struct page *page;
- gfp_t gfp_flags;
+ int ret = 0;
page = brd_lookup_page(brd, sector);
if (page)
return 0;
- /*
- * Must use NOIO because we don't want to recurse back into the
- * block or filesystem layers from page reclaim.
- */
- gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
- page = alloc_page(gfp_flags);
+ page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
if (!page)
return -ENOMEM;
- if (radix_tree_preload(GFP_NOIO)) {
+ if (gfpflags_allow_blocking(gfp) && radix_tree_preload(gfp)) {
__free_page(page);
return -ENOMEM;
}
@@ -110,15 +105,17 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector)
if (radix_tree_insert(&brd->brd_pages, idx, page)) {
__free_page(page);
page = radix_tree_lookup(&brd->brd_pages, idx);
- BUG_ON(!page);
- BUG_ON(page->index != idx);
+ if (!page)
+ ret = -ENOMEM;
+ else if (page->index != idx)
+ ret = -EIO;
} else {
brd->brd_nr_pages++;
}
spin_unlock(&brd->brd_lock);
radix_tree_preload_end();
- return 0;
+ return ret;
}
/*
@@ -167,19 +164,20 @@ static void brd_free_pages(struct brd_device *brd)
/*
* copy_to_brd_setup must be called before copy_to_brd. It may sleep.
*/
-static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
+static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
+ gfp_t gfp)
{
unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
size_t copy;
int ret;
copy = min_t(size_t, n, PAGE_SIZE - offset);
- ret = brd_insert_page(brd, sector);
+ ret = brd_insert_page(brd, sector, gfp);
if (ret)
return ret;
if (copy < n) {
sector += copy >> SECTOR_SHIFT;
- ret = brd_insert_page(brd, sector);
+ ret = brd_insert_page(brd, sector, gfp);
}
return ret;
}
@@ -254,20 +252,26 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
* Process a single bvec of a bio.
*/
static int brd_do_bvec(struct brd_device *brd, struct page *page,
- unsigned int len, unsigned int off, enum req_op op,
+ unsigned int len, unsigned int off, blk_opf_t opf,
sector_t sector)
{
void *mem;
int err = 0;
- if (op_is_write(op)) {
- err = copy_to_brd_setup(brd, sector, len);
+ if (op_is_write(opf)) {
+ /*
+ * Must use NOIO because we don't want to recurse back into the
+ * block or filesystem layers from page reclaim.
+ */
+ gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
+
+ err = copy_to_brd_setup(brd, sector, len, gfp);
if (err)
goto out;
}
mem = kmap_atomic(page);
- if (!op_is_write(op)) {
+ if (!op_is_write(opf)) {
copy_from_brd(mem + off, brd, sector, len);
flush_dcache_page(page);
} else {
@@ -296,8 +300,12 @@ static void brd_submit_bio(struct bio *bio)
(len & (SECTOR_SIZE - 1)));
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
- bio_op(bio), sector);
+ bio->bi_opf, sector);
if (err) {
+ if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
+ bio_wouldblock_error(bio);
+ return;
+ }
bio_io_error(bio);
return;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 67205f80be9910207481406c47f7d85e703fb2e9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167812500026148(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
67205f80be99 ("brd: mark as nowait compatible")
e1528830bd4e ("block/brd: add error handling support for add_disk()")
f7bf35862477 ("brd: reduce the brd_devices_mutex scope")
7f9b348cb5e9 ("brd: convert to blk_alloc_disk/blk_cleanup_disk")
f4be591f1436 ("brd: expose number of allocated pages in debugfs")
7cc178a6b994 ("brd: use __register_blkdev to allocate devices on demand")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 67205f80be9910207481406c47f7d85e703fb2e9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Wed, 15 Feb 2023 16:43:47 -0700
Subject: [PATCH] brd: mark as nowait compatible
By default, non-mq drivers do not support nowait. This causes io_uring
to use a slower path as the driver cannot be trust not to block. brd
can safely set the nowait flag, as worst case all it does is a NOIO
allocation.
For io_uring, this makes a substantial difference. Before:
submitter=0, tid=453, file=/dev/ram0, node=-1
polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=128
Engine=io_uring, sq_ring=128, cq_ring=128
IOPS=440.03K, BW=1718MiB/s, IOS/call=32/31
IOPS=428.96K, BW=1675MiB/s, IOS/call=32/32
IOPS=442.59K, BW=1728MiB/s, IOS/call=32/31
IOPS=419.65K, BW=1639MiB/s, IOS/call=32/32
IOPS=426.82K, BW=1667MiB/s, IOS/call=32/31
and after:
submitter=0, tid=354, file=/dev/ram0, node=-1
polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=128
Engine=io_uring, sq_ring=128, cq_ring=128
IOPS=3.37M, BW=13.15GiB/s, IOS/call=32/31
IOPS=3.45M, BW=13.46GiB/s, IOS/call=32/31
IOPS=3.43M, BW=13.42GiB/s, IOS/call=32/32
IOPS=3.43M, BW=13.39GiB/s, IOS/call=32/31
IOPS=3.43M, BW=13.38GiB/s, IOS/call=32/31
or about an 8x in difference. Now that brd is prepared to deal with
REQ_NOWAIT reads/writes, mark it as supporting that.
Cc: stable(a)vger.kernel.org # 5.10+
Link: https://lore.kernel.org/linux-block/20230203103005.31290-1-p.raghav@samsung…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 00f3c5b51a01..740631dcdd0e 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -418,6 +418,7 @@ static int brd_alloc(int i)
/* Tell the block layer that this is not a rotational device */
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+ blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
err = add_disk(disk);
if (err)
goto out_cleanup_disk;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 67205f80be9910207481406c47f7d85e703fb2e9
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167812499941176(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
67205f80be99 ("brd: mark as nowait compatible")
e1528830bd4e ("block/brd: add error handling support for add_disk()")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 67205f80be9910207481406c47f7d85e703fb2e9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Wed, 15 Feb 2023 16:43:47 -0700
Subject: [PATCH] brd: mark as nowait compatible
By default, non-mq drivers do not support nowait. This causes io_uring
to use a slower path as the driver cannot be trust not to block. brd
can safely set the nowait flag, as worst case all it does is a NOIO
allocation.
For io_uring, this makes a substantial difference. Before:
submitter=0, tid=453, file=/dev/ram0, node=-1
polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=128
Engine=io_uring, sq_ring=128, cq_ring=128
IOPS=440.03K, BW=1718MiB/s, IOS/call=32/31
IOPS=428.96K, BW=1675MiB/s, IOS/call=32/32
IOPS=442.59K, BW=1728MiB/s, IOS/call=32/31
IOPS=419.65K, BW=1639MiB/s, IOS/call=32/32
IOPS=426.82K, BW=1667MiB/s, IOS/call=32/31
and after:
submitter=0, tid=354, file=/dev/ram0, node=-1
polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=128
Engine=io_uring, sq_ring=128, cq_ring=128
IOPS=3.37M, BW=13.15GiB/s, IOS/call=32/31
IOPS=3.45M, BW=13.46GiB/s, IOS/call=32/31
IOPS=3.43M, BW=13.42GiB/s, IOS/call=32/32
IOPS=3.43M, BW=13.39GiB/s, IOS/call=32/31
IOPS=3.43M, BW=13.38GiB/s, IOS/call=32/31
or about an 8x in difference. Now that brd is prepared to deal with
REQ_NOWAIT reads/writes, mark it as supporting that.
Cc: stable(a)vger.kernel.org # 5.10+
Link: https://lore.kernel.org/linux-block/20230203103005.31290-1-p.raghav@samsung…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 00f3c5b51a01..740631dcdd0e 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -418,6 +418,7 @@ static int brd_alloc(int i)
/* Tell the block layer that this is not a rotational device */
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+ blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
err = add_disk(disk);
if (err)
goto out_cleanup_disk;
Hi -stable team,
please backport the commit f1aa2eb5ea05 ("sysctl: fix proc_dobool() usability")
to the stable kernels containing commit 83efeeeb3d04 ("tty: Allow TIOCSTI to be disabled").
(Which seems only to be the 6.2 branch only at the moment)
Without this backport the sysctl dev.net.legacy_tiocsti to enable
ioctl(TIOCSTI) is not functional. So on kernels that don't enable
CONFIG_LEGACY_TIOCSTI, ioctl(TIOCSTI) is not usable at all.
This ioctl is used for the copy-and-paste functionality of the
screenreader "fenrir".
( https://github.com/chrys87/fenrir )
Reported-by: Storm Dragon <stormdragon2976(a)gmail.com>
Link: https://lore.kernel.org/lkml/ZAOi9hDBTYqoAZuI@hotmail.com/
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 26044aff37a5455b19a91785086914fd33053ef4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781236386157(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
26044aff37a5 ("x86/crash: Disable virt in core NMI crash handler to avoid double shootdown")
ed72736183c4 ("x86/reboot: Force all cpus to exit VMX root if VMX is supported")
4d1d0977a215 ("x86: Fix a handful of typos")
22ca7ee933a3 ("x86/apic: Provide and use helper for send_IPI_allbutself()")
6a1cb5f5c641 ("x86/apic: Add static key to Control IPI shorthands")
bdda3b93e660 ("x86/apic: Move no_ipi_broadcast() out of 32bit")
9c92374b631d ("x86/cpu: Move arch_smt_update() to a neutral place")
c94f0718fb1c ("x86/apic: Consolidate the apic local headers")
ba77b2a02e00 ("x86/apic: Move apic_flat_64 header into apic directory")
8b542da37287 ("x86/apic: Move ipi header into apic directory")
521b82fee98c ("x86/apic: Cleanup the include maze")
cdc86c9d1f82 ("x86/apic: Move IPI inlines into ipi.c")
2591bc4e8d70 ("x86/kgbd: Use NMI_VECTOR not APIC_DM_NMI")
747d5a1bf293 ("x86/reboot: Always use NMI fallback when shutdown via reboot vector IPI fails")
7e300dabb7e7 ("treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 223")
fa4bff165070 ("Merge branch 'x86-mds-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 26044aff37a5455b19a91785086914fd33053ef4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:36:47 +0000
Subject: [PATCH] x86/crash: Disable virt in core NMI crash handler to avoid
double shootdown
Disable virtualization in crash_nmi_callback() and rework the
emergency_vmx_disable_all() path to do an NMI shootdown if and only if a
shootdown has not already occurred. NMI crash shootdown fundamentally
can't support multiple invocations as responding CPUs are deliberately
put into halt state without unblocking NMIs. But, the emergency reboot
path doesn't have any work of its own, it simply cares about disabling
virtualization, i.e. so long as a shootdown occurred, emergency reboot
doesn't care who initiated the shootdown, or when.
If "crash_kexec_post_notifiers" is specified on the kernel command line,
panic() will invoke crash_smp_send_stop() and result in a second call to
nmi_shootdown_cpus() during native_machine_emergency_restart().
Invoke the callback _before_ disabling virtualization, as the current
VMCS needs to be cleared before doing VMXOFF. Note, this results in a
subtle change in ordering between disabling virtualization and stopping
Intel PT on the responding CPUs. While VMX and Intel PT do interact,
VMXOFF and writes to MSR_IA32_RTIT_CTL do not induce faults between one
another, which is all that matters when panicking.
Harden nmi_shootdown_cpus() against multiple invocations to try and
capture any such kernel bugs via a WARN instead of hanging the system
during a crash/dump, e.g. prior to the recent hardening of
register_nmi_handler(), re-registering the NMI handler would trigger a
double list_add() and hang the system if CONFIG_BUG_ON_DATA_CORRUPTION=y.
list_add double add: new=ffffffff82220800, prev=ffffffff8221cfe8, next=ffffffff82220800.
WARNING: CPU: 2 PID: 1319 at lib/list_debug.c:29 __list_add_valid+0x67/0x70
Call Trace:
__register_nmi_handler+0xcf/0x130
nmi_shootdown_cpus+0x39/0x90
native_machine_emergency_restart+0x1c9/0x1d0
panic+0x237/0x29b
Extract the disabling logic to a common helper to deduplicate code, and
to prepare for doing the shootdown in the emergency reboot path if SVM
is supported.
Note, prior to commit ed72736183c4 ("x86/reboot: Force all cpus to exit
VMX root if VMX is supported"), nmi_shootdown_cpus() was subtly protected
against a second invocation by a cpu_vmx_enabled() check as the kdump
handler would disable VMX if it ran first.
Fixes: ed72736183c4 ("x86/reboot: Force all cpus to exit VMX root if VMX is supported")
Cc: stable(a)vger.kernel.org
Reported-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Link: https://lore.kernel.org/all/20220427224924.592546-2-gpiccoli@igalia.com
Tested-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Reviewed-by: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lore.kernel.org/r/20221130233650.1404148-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 04c17be9b5fd..bc5b4d788c08 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
+void cpu_emergency_disable_virtualization(void);
+
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_panic_self_stop(struct pt_regs *regs);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 305514431f26..cdd92ab43cda 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,7 +37,6 @@
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
#include <asm/cmdline.h>
@@ -81,15 +80,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-
/*
* Disable Intel PT to stop its logging
*/
@@ -148,12 +138,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c3636ea4aa71..374c14b3a9bf 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -528,10 +528,7 @@ static inline void kb_wait(void)
}
}
-static void vmxoff_nmi(int cpu, struct pt_regs *regs)
-{
- cpu_emergency_vmxoff();
-}
+static inline void nmi_shootdown_cpus_on_restart(void);
/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
static void emergency_vmx_disable_all(void)
@@ -554,7 +551,7 @@ static void emergency_vmx_disable_all(void)
__cpu_emergency_vmxoff();
/* Halt and exit VMX root operation on the other CPUs. */
- nmi_shootdown_cpus(vmxoff_nmi);
+ nmi_shootdown_cpus_on_restart();
}
}
@@ -795,6 +792,17 @@ void machine_crash_shutdown(struct pt_regs *regs)
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+}
+
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
@@ -817,7 +825,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
local_irq_disable();
- shootdown_callback(cpu, regs);
+ if (shootdown_callback)
+ shootdown_callback(cpu, regs);
+
+ /*
+ * Prepare the CPU for reboot _after_ invoking the callback so that the
+ * callback can safely use virtualization instructions, e.g. VMCLEAR.
+ */
+ cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -828,18 +843,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-/*
- * Halt all other CPUs, calling the specified function on each of them
+/**
+ * nmi_shootdown_cpus - Stop other CPUs via NMI
+ * @callback: Optional callback to be invoked from the NMI handler
+ *
+ * The NMI handler on the remote CPUs invokes @callback, if not
+ * NULL, first and then disables virtualization to ensure that
+ * INIT is recognized during reboot.
*
- * This function can be used to halt all other CPUs on crash
- * or emergency reboot time. The function passed as parameter
- * will be called inside a NMI handler on all CPUs.
+ * nmi_shootdown_cpus() can only be invoked once. After the first
+ * invocation all other CPUs are stuck in crash_nmi_callback() and
+ * cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
+
local_irq_disable();
+ /*
+ * Avoid certain doom if a shootdown already occurred; re-registering
+ * the NMI handler will cause list corruption, modifying the callback
+ * will do who knows what, etc...
+ */
+ if (WARN_ON_ONCE(crash_ipi_issued))
+ return;
+
/* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
@@ -867,7 +896,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
msecs--;
}
- /* Leave the nmi callback set */
+ /*
+ * Leave the nmi callback set, shootdown is a one-time thing. Clearing
+ * the callback could result in a NULL pointer dereference if a CPU
+ * (finally) responds after the timeout expires.
+ */
+}
+
+static inline void nmi_shootdown_cpus_on_restart(void)
+{
+ if (!crash_ipi_issued)
+ nmi_shootdown_cpus(NULL);
}
/*
@@ -897,6 +936,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* No other CPUs to shoot down */
}
+static inline void nmi_shootdown_cpus_on_restart(void) { }
+
void run_crash_ipi_callback(struct pt_regs *regs)
{
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 26044aff37a5455b19a91785086914fd33053ef4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678123636156122(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
26044aff37a5 ("x86/crash: Disable virt in core NMI crash handler to avoid double shootdown")
ed72736183c4 ("x86/reboot: Force all cpus to exit VMX root if VMX is supported")
4d1d0977a215 ("x86: Fix a handful of typos")
22ca7ee933a3 ("x86/apic: Provide and use helper for send_IPI_allbutself()")
6a1cb5f5c641 ("x86/apic: Add static key to Control IPI shorthands")
bdda3b93e660 ("x86/apic: Move no_ipi_broadcast() out of 32bit")
9c92374b631d ("x86/cpu: Move arch_smt_update() to a neutral place")
c94f0718fb1c ("x86/apic: Consolidate the apic local headers")
ba77b2a02e00 ("x86/apic: Move apic_flat_64 header into apic directory")
8b542da37287 ("x86/apic: Move ipi header into apic directory")
521b82fee98c ("x86/apic: Cleanup the include maze")
cdc86c9d1f82 ("x86/apic: Move IPI inlines into ipi.c")
2591bc4e8d70 ("x86/kgbd: Use NMI_VECTOR not APIC_DM_NMI")
747d5a1bf293 ("x86/reboot: Always use NMI fallback when shutdown via reboot vector IPI fails")
7e300dabb7e7 ("treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 223")
fa4bff165070 ("Merge branch 'x86-mds-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 26044aff37a5455b19a91785086914fd33053ef4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:36:47 +0000
Subject: [PATCH] x86/crash: Disable virt in core NMI crash handler to avoid
double shootdown
Disable virtualization in crash_nmi_callback() and rework the
emergency_vmx_disable_all() path to do an NMI shootdown if and only if a
shootdown has not already occurred. NMI crash shootdown fundamentally
can't support multiple invocations as responding CPUs are deliberately
put into halt state without unblocking NMIs. But, the emergency reboot
path doesn't have any work of its own, it simply cares about disabling
virtualization, i.e. so long as a shootdown occurred, emergency reboot
doesn't care who initiated the shootdown, or when.
If "crash_kexec_post_notifiers" is specified on the kernel command line,
panic() will invoke crash_smp_send_stop() and result in a second call to
nmi_shootdown_cpus() during native_machine_emergency_restart().
Invoke the callback _before_ disabling virtualization, as the current
VMCS needs to be cleared before doing VMXOFF. Note, this results in a
subtle change in ordering between disabling virtualization and stopping
Intel PT on the responding CPUs. While VMX and Intel PT do interact,
VMXOFF and writes to MSR_IA32_RTIT_CTL do not induce faults between one
another, which is all that matters when panicking.
Harden nmi_shootdown_cpus() against multiple invocations to try and
capture any such kernel bugs via a WARN instead of hanging the system
during a crash/dump, e.g. prior to the recent hardening of
register_nmi_handler(), re-registering the NMI handler would trigger a
double list_add() and hang the system if CONFIG_BUG_ON_DATA_CORRUPTION=y.
list_add double add: new=ffffffff82220800, prev=ffffffff8221cfe8, next=ffffffff82220800.
WARNING: CPU: 2 PID: 1319 at lib/list_debug.c:29 __list_add_valid+0x67/0x70
Call Trace:
__register_nmi_handler+0xcf/0x130
nmi_shootdown_cpus+0x39/0x90
native_machine_emergency_restart+0x1c9/0x1d0
panic+0x237/0x29b
Extract the disabling logic to a common helper to deduplicate code, and
to prepare for doing the shootdown in the emergency reboot path if SVM
is supported.
Note, prior to commit ed72736183c4 ("x86/reboot: Force all cpus to exit
VMX root if VMX is supported"), nmi_shootdown_cpus() was subtly protected
against a second invocation by a cpu_vmx_enabled() check as the kdump
handler would disable VMX if it ran first.
Fixes: ed72736183c4 ("x86/reboot: Force all cpus to exit VMX root if VMX is supported")
Cc: stable(a)vger.kernel.org
Reported-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Link: https://lore.kernel.org/all/20220427224924.592546-2-gpiccoli@igalia.com
Tested-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Reviewed-by: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lore.kernel.org/r/20221130233650.1404148-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 04c17be9b5fd..bc5b4d788c08 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
+void cpu_emergency_disable_virtualization(void);
+
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_panic_self_stop(struct pt_regs *regs);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 305514431f26..cdd92ab43cda 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,7 +37,6 @@
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
#include <asm/cmdline.h>
@@ -81,15 +80,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-
/*
* Disable Intel PT to stop its logging
*/
@@ -148,12 +138,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c3636ea4aa71..374c14b3a9bf 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -528,10 +528,7 @@ static inline void kb_wait(void)
}
}
-static void vmxoff_nmi(int cpu, struct pt_regs *regs)
-{
- cpu_emergency_vmxoff();
-}
+static inline void nmi_shootdown_cpus_on_restart(void);
/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
static void emergency_vmx_disable_all(void)
@@ -554,7 +551,7 @@ static void emergency_vmx_disable_all(void)
__cpu_emergency_vmxoff();
/* Halt and exit VMX root operation on the other CPUs. */
- nmi_shootdown_cpus(vmxoff_nmi);
+ nmi_shootdown_cpus_on_restart();
}
}
@@ -795,6 +792,17 @@ void machine_crash_shutdown(struct pt_regs *regs)
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+}
+
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
@@ -817,7 +825,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
local_irq_disable();
- shootdown_callback(cpu, regs);
+ if (shootdown_callback)
+ shootdown_callback(cpu, regs);
+
+ /*
+ * Prepare the CPU for reboot _after_ invoking the callback so that the
+ * callback can safely use virtualization instructions, e.g. VMCLEAR.
+ */
+ cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -828,18 +843,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-/*
- * Halt all other CPUs, calling the specified function on each of them
+/**
+ * nmi_shootdown_cpus - Stop other CPUs via NMI
+ * @callback: Optional callback to be invoked from the NMI handler
+ *
+ * The NMI handler on the remote CPUs invokes @callback, if not
+ * NULL, first and then disables virtualization to ensure that
+ * INIT is recognized during reboot.
*
- * This function can be used to halt all other CPUs on crash
- * or emergency reboot time. The function passed as parameter
- * will be called inside a NMI handler on all CPUs.
+ * nmi_shootdown_cpus() can only be invoked once. After the first
+ * invocation all other CPUs are stuck in crash_nmi_callback() and
+ * cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
+
local_irq_disable();
+ /*
+ * Avoid certain doom if a shootdown already occurred; re-registering
+ * the NMI handler will cause list corruption, modifying the callback
+ * will do who knows what, etc...
+ */
+ if (WARN_ON_ONCE(crash_ipi_issued))
+ return;
+
/* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
@@ -867,7 +896,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
msecs--;
}
- /* Leave the nmi callback set */
+ /*
+ * Leave the nmi callback set, shootdown is a one-time thing. Clearing
+ * the callback could result in a NULL pointer dereference if a CPU
+ * (finally) responds after the timeout expires.
+ */
+}
+
+static inline void nmi_shootdown_cpus_on_restart(void)
+{
+ if (!crash_ipi_issued)
+ nmi_shootdown_cpus(NULL);
}
/*
@@ -897,6 +936,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* No other CPUs to shoot down */
}
+static inline void nmi_shootdown_cpus_on_restart(void) { }
+
void run_crash_ipi_callback(struct pt_regs *regs)
{
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 26044aff37a5455b19a91785086914fd33053ef4
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678123634056(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
26044aff37a5 ("x86/crash: Disable virt in core NMI crash handler to avoid double shootdown")
ed72736183c4 ("x86/reboot: Force all cpus to exit VMX root if VMX is supported")
4d1d0977a215 ("x86: Fix a handful of typos")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 26044aff37a5455b19a91785086914fd33053ef4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:36:47 +0000
Subject: [PATCH] x86/crash: Disable virt in core NMI crash handler to avoid
double shootdown
Disable virtualization in crash_nmi_callback() and rework the
emergency_vmx_disable_all() path to do an NMI shootdown if and only if a
shootdown has not already occurred. NMI crash shootdown fundamentally
can't support multiple invocations as responding CPUs are deliberately
put into halt state without unblocking NMIs. But, the emergency reboot
path doesn't have any work of its own, it simply cares about disabling
virtualization, i.e. so long as a shootdown occurred, emergency reboot
doesn't care who initiated the shootdown, or when.
If "crash_kexec_post_notifiers" is specified on the kernel command line,
panic() will invoke crash_smp_send_stop() and result in a second call to
nmi_shootdown_cpus() during native_machine_emergency_restart().
Invoke the callback _before_ disabling virtualization, as the current
VMCS needs to be cleared before doing VMXOFF. Note, this results in a
subtle change in ordering between disabling virtualization and stopping
Intel PT on the responding CPUs. While VMX and Intel PT do interact,
VMXOFF and writes to MSR_IA32_RTIT_CTL do not induce faults between one
another, which is all that matters when panicking.
Harden nmi_shootdown_cpus() against multiple invocations to try and
capture any such kernel bugs via a WARN instead of hanging the system
during a crash/dump, e.g. prior to the recent hardening of
register_nmi_handler(), re-registering the NMI handler would trigger a
double list_add() and hang the system if CONFIG_BUG_ON_DATA_CORRUPTION=y.
list_add double add: new=ffffffff82220800, prev=ffffffff8221cfe8, next=ffffffff82220800.
WARNING: CPU: 2 PID: 1319 at lib/list_debug.c:29 __list_add_valid+0x67/0x70
Call Trace:
__register_nmi_handler+0xcf/0x130
nmi_shootdown_cpus+0x39/0x90
native_machine_emergency_restart+0x1c9/0x1d0
panic+0x237/0x29b
Extract the disabling logic to a common helper to deduplicate code, and
to prepare for doing the shootdown in the emergency reboot path if SVM
is supported.
Note, prior to commit ed72736183c4 ("x86/reboot: Force all cpus to exit
VMX root if VMX is supported"), nmi_shootdown_cpus() was subtly protected
against a second invocation by a cpu_vmx_enabled() check as the kdump
handler would disable VMX if it ran first.
Fixes: ed72736183c4 ("x86/reboot: Force all cpus to exit VMX root if VMX is supported")
Cc: stable(a)vger.kernel.org
Reported-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Cc: Vitaly Kuznetsov <vkuznets(a)redhat.com>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Link: https://lore.kernel.org/all/20220427224924.592546-2-gpiccoli@igalia.com
Tested-by: Guilherme G. Piccoli <gpiccoli(a)igalia.com>
Reviewed-by: Thomas Gleixner <tglx(a)linutronix.de>
Link: https://lore.kernel.org/r/20221130233650.1404148-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 04c17be9b5fd..bc5b4d788c08 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
+void cpu_emergency_disable_virtualization(void);
+
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_panic_self_stop(struct pt_regs *regs);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 305514431f26..cdd92ab43cda 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,7 +37,6 @@
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
#include <asm/cmdline.h>
@@ -81,15 +80,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-
/*
* Disable Intel PT to stop its logging
*/
@@ -148,12 +138,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c3636ea4aa71..374c14b3a9bf 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -528,10 +528,7 @@ static inline void kb_wait(void)
}
}
-static void vmxoff_nmi(int cpu, struct pt_regs *regs)
-{
- cpu_emergency_vmxoff();
-}
+static inline void nmi_shootdown_cpus_on_restart(void);
/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
static void emergency_vmx_disable_all(void)
@@ -554,7 +551,7 @@ static void emergency_vmx_disable_all(void)
__cpu_emergency_vmxoff();
/* Halt and exit VMX root operation on the other CPUs. */
- nmi_shootdown_cpus(vmxoff_nmi);
+ nmi_shootdown_cpus_on_restart();
}
}
@@ -795,6 +792,17 @@ void machine_crash_shutdown(struct pt_regs *regs)
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+}
+
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
@@ -817,7 +825,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
local_irq_disable();
- shootdown_callback(cpu, regs);
+ if (shootdown_callback)
+ shootdown_callback(cpu, regs);
+
+ /*
+ * Prepare the CPU for reboot _after_ invoking the callback so that the
+ * callback can safely use virtualization instructions, e.g. VMCLEAR.
+ */
+ cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -828,18 +843,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-/*
- * Halt all other CPUs, calling the specified function on each of them
+/**
+ * nmi_shootdown_cpus - Stop other CPUs via NMI
+ * @callback: Optional callback to be invoked from the NMI handler
+ *
+ * The NMI handler on the remote CPUs invokes @callback, if not
+ * NULL, first and then disables virtualization to ensure that
+ * INIT is recognized during reboot.
*
- * This function can be used to halt all other CPUs on crash
- * or emergency reboot time. The function passed as parameter
- * will be called inside a NMI handler on all CPUs.
+ * nmi_shootdown_cpus() can only be invoked once. After the first
+ * invocation all other CPUs are stuck in crash_nmi_callback() and
+ * cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
+
local_irq_disable();
+ /*
+ * Avoid certain doom if a shootdown already occurred; re-registering
+ * the NMI handler will cause list corruption, modifying the callback
+ * will do who knows what, etc...
+ */
+ if (WARN_ON_ONCE(crash_ipi_issued))
+ return;
+
/* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
@@ -867,7 +896,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
msecs--;
}
- /* Leave the nmi callback set */
+ /*
+ * Leave the nmi callback set, shootdown is a one-time thing. Clearing
+ * the callback could result in a NULL pointer dereference if a CPU
+ * (finally) responds after the timeout expires.
+ */
+}
+
+static inline void nmi_shootdown_cpus_on_restart(void)
+{
+ if (!crash_ipi_issued)
+ nmi_shootdown_cpus(NULL);
}
/*
@@ -897,6 +936,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* No other CPUs to shoot down */
}
+static inline void nmi_shootdown_cpus_on_restart(void) { }
+
void run_crash_ipi_callback(struct pt_regs *regs)
{
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x f2d3155e2a6bac44d16f04415a321e8707d895c6
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167810000636131(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
f2d3155e2a6b ("KVM: s390: disable migration mode when dirty tracking is disabled")
ec5c86976674 ("KVM: s390: Skip gfn/size sanity checks on memslot DELETE or FLAGS_ONLY")
cf5b486922dc ("KVM: s390: Use "new" memslot instead of userspace memory region")
106ee47dc633 ("docs: kvm: Convert api.txt to ReST format")
6c972ba685d5 ("docs: kvm: convert devices/vm.txt to ReST")
aff7aeea5483 ("docs: kvm: convert devices/vfio.txt to ReST")
e777a5bd98c6 ("docs: kvm: convert devices/vcpu.txt to ReST")
e94474300361 ("docs: kvm: convert devices/s390_flic.txt to ReST")
05c47036c62e ("docs: kvm: convert devices/mpic.txt to ReST")
c0d1c8a0af59 ("docs: kvm: devices/arm-vgit-v3.txt to ReST")
d371c011fc5e ("docs: kvm: devices/arm-vgic-its.txt to ReST format")
7bd460fc1dfa ("docs: kvm: add arm/pvtime.rst to index.rst")
290a6bb06de9 ("arm64: KVM: Add UAPI notes for swapped registers")
22945688acd4 ("KVM: PPC: Book3S HV: Support reset of secure guest")
008e359c76d8 ("KVM: PPC: Book3S HV: Radix changes for secure guest")
60f0a643aa44 ("KVM: PPC: Book3S HV: Shared pages support for secure guests")
ca9f4942670c ("KVM: PPC: Book3S HV: Support for running secure guests")
a4b28f5c6798 ("Merge remote-tracking branch 'kvmarm/kvm-arm64/stolen-time' into kvmarm-master/next")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f2d3155e2a6bac44d16f04415a321e8707d895c6 Mon Sep 17 00:00:00 2001
From: Nico Boehr <nrb(a)linux.ibm.com>
Date: Fri, 27 Jan 2023 15:05:32 +0100
Subject: [PATCH] KVM: s390: disable migration mode when dirty tracking is
disabled
Migration mode is a VM attribute which enables tracking of changes in
storage attributes (PGSTE). It assumes dirty tracking is enabled on all
memslots to keep a dirty bitmap of pages with changed storage attributes.
When enabling migration mode, we currently check that dirty tracking is
enabled for all memslots. However, userspace can disable dirty tracking
without disabling migration mode.
Since migration mode is pointless with dirty tracking disabled, disable
migration mode whenever userspace disables dirty tracking on any slot.
Also update the documentation to clarify that dirty tracking must be
enabled when enabling migration mode, which is already enforced by the
code in kvm_s390_vm_start_migration().
Also highlight in the documentation for KVM_S390_GET_CMMA_BITS that it
can now fail with -EINVAL when dirty tracking is disabled while
migration mode is on. Move all the error codes to a table so this stays
readable.
To disable migration mode, slots_lock should be held, which is taken
in kvm_set_memory_region() and thus held in
kvm_arch_prepare_memory_region().
Restructure the prepare code a bit so all the sanity checking is done
before disabling migration mode. This ensures migration mode isn't
disabled when some sanity check fails.
Cc: stable(a)vger.kernel.org
Fixes: 190df4a212a7 ("KVM: s390: CMMA tracking, ESSA emulation, migration mode")
Signed-off-by: Nico Boehr <nrb(a)linux.ibm.com>
Reviewed-by: Janosch Frank <frankja(a)linux.ibm.com>
Reviewed-by: Claudio Imbrenda <imbrenda(a)linux.ibm.com>
Link: https://lore.kernel.org/r/20230127140532.230651-2-nrb@linux.ibm.com
Message-Id: <20230127140532.230651-2-nrb(a)linux.ibm.com>
[frankja(a)linux.ibm.com: fixed commit message typo, moved api.rst error table upwards]
Signed-off-by: Janosch Frank <frankja(a)linux.ibm.com>
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index deb494f759ed..8cd7fd05d53b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4449,6 +4449,18 @@ not holding a previously reported uncorrected error).
:Parameters: struct kvm_s390_cmma_log (in, out)
:Returns: 0 on success, a negative value on error
+Errors:
+
+ ====== =============================================================
+ ENOMEM not enough memory can be allocated to complete the task
+ ENXIO if CMMA is not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but migration mode was not enabled
+ EINVAL if KVM_S390_CMMA_PEEK is not set but dirty tracking has been
+ disabled (and thus migration mode was automatically disabled)
+ EFAULT if the userspace address is invalid or if no page table is
+ present for the addresses (e.g. when using hugepages).
+ ====== =============================================================
+
This ioctl is used to get the values of the CMMA bits on the s390
architecture. It is meant to be used in two scenarios:
@@ -4529,12 +4541,6 @@ mask is unused.
values points to the userspace buffer where the result will be stored.
-This ioctl can fail with -ENOMEM if not enough memory can be allocated to
-complete the task, with -ENXIO if CMMA is not enabled, with -EINVAL if
-KVM_S390_CMMA_PEEK is not set but migration mode was not enabled, with
--EFAULT if the userspace address is invalid or if no page table is
-present for the addresses (e.g. when using hugepages).
-
4.108 KVM_S390_SET_CMMA_BITS
----------------------------
diff --git a/Documentation/virt/kvm/devices/vm.rst b/Documentation/virt/kvm/devices/vm.rst
index 60acc39e0e93..147efec626e5 100644
--- a/Documentation/virt/kvm/devices/vm.rst
+++ b/Documentation/virt/kvm/devices/vm.rst
@@ -302,6 +302,10 @@ Allows userspace to start migration mode, needed for PGSTE migration.
Setting this attribute when migration mode is already active will have
no effects.
+Dirty tracking must be enabled on all memslots, else -EINVAL is returned. When
+dirty tracking is disabled on any memslot, migration mode is automatically
+stopped.
+
:Parameters: none
:Returns: -ENOMEM if there is not enough free memory to start migration mode;
-EINVAL if the state of the VM is invalid (e.g. no memory defined);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e4890e04b210..cb72f9a09fb3 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5633,23 +5633,40 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (kvm_s390_pv_get_handle(kvm))
return -EINVAL;
- if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY)
- return 0;
+ if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) {
+ /*
+ * A few sanity checks. We can have memory slots which have to be
+ * located/ended at a segment boundary (1MB). The memory in userland is
+ * ok to be fragmented into various different vmas. It is okay to mmap()
+ * and munmap() stuff in this slot after doing this call at any time
+ */
- /* A few sanity checks. We can have memory slots which have to be
- located/ended at a segment boundary (1MB). The memory in userland is
- ok to be fragmented into various different vmas. It is okay to mmap()
- and munmap() stuff in this slot after doing this call at any time */
+ if (new->userspace_addr & 0xffffful)
+ return -EINVAL;
- if (new->userspace_addr & 0xffffful)
- return -EINVAL;
+ size = new->npages * PAGE_SIZE;
+ if (size & 0xffffful)
+ return -EINVAL;
- size = new->npages * PAGE_SIZE;
- if (size & 0xffffful)
- return -EINVAL;
+ if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+ return -EINVAL;
+ }
- if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
- return -EINVAL;
+ if (!kvm->arch.migration_mode)
+ return 0;
+
+ /*
+ * Turn off migration mode when:
+ * - userspace creates a new memslot with dirty logging off,
+ * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and
+ * dirty logging is turned off.
+ * Migration mode expects dirty page logging being enabled to store
+ * its dirty bitmap.
+ */
+ if (change != KVM_MR_DELETE &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ WARN(kvm_s390_vm_stop_migration(kvm),
+ "Failed to stop migration mode");
return 0;
}