Removing a drive with drive_del while it is being used to run an I/O
intensive workload can cause QEMU to crash.
An AIO flush can yield at some point:
blk_aio_flush_entry()
blk_co_flush(blk)
bdrv_co_flush(blk->root->bs)
...
qemu_coroutine_yield()
and let the HMP command to run, free blk->root and give control
back to the AIO flush:
hmp_drive_del()
blk_remove_bs()
bdrv_root_unref_child(blk->root)
child_bs = blk->root->bs
bdrv_detach_child(blk->root)
bdrv_replace_child(blk->root, NULL)
blk->root->bs = NULL
g_free(blk->root) <============== blk->root becomes stale
bdrv_unref(child_bs)
bdrv_delete(child_bs)
bdrv_close()
bdrv_drained_begin()
bdrv_do_drained_begin()
bdrv_drain_recurse()
aio_poll()
...
qemu_coroutine_switch()
and the AIO flush completion ends up dereferencing blk->root:
blk_aio_complete()
scsi_aio_complete()
blk_get_aio_context(blk)
bs = blk_bs(blk)
ie, bs = blk->root ? blk->root->bs : NULL
^^^^^
stale
The solution to this user-after-free situation is is to clear
blk->root before calling bdrv_unref() in bdrv_detach_child(),
and let blk_get_aio_context() fall back to the main loop context
since the BDS has been removed.
Signed-off-by: Greg Kurz <groug(a)kaod.org>
---
The use-after-free condition is easy to reproduce with a stress-ng
run in the guest:
-device virtio-scsi-pci,id=scsi1 \
-drive file=/home/greg/images/scratch.qcow2,format=qcow2,if=none,id=drive1 \
-device scsi-hd,bus=scsi1.0,drive=drive1,id=scsi-hd1
# stress-ng --hdd 0 --aggressive
and doing drive_del from the QEMU monitor while stress-ng is still running:
(qemu) drive_del drive1
The crash is less easy to hit though, as it depends on the bs field
of the stale blk->root to have a non-NULL value that eventually breaks
something when it gets dereferenced. The following patch simulates
that, and allows to validate the fix:
--- a/block.c
+++ b/block.c
@@ -2127,6 +2127,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
static void bdrv_detach_child(BdrvChild *child)
{
+ BlockDriverState *bs = child->bs;
+
if (child->next.le_prev) {
QLIST_REMOVE(child, next);
child->next.le_prev = NULL;
@@ -2135,7 +2137,15 @@ static void bdrv_detach_child(BdrvChild *child)
bdrv_replace_child(child, NULL);
g_free(child->name);
- g_free(child);
+ /* Poison the BdrvChild instead of freeing it, in order to break blk_bs()
+ * if the blk still has a pointer to this BdrvChild in blk->root.
+ */
+ if (atomic_read(&bs->in_flight)) {
+ child->bs = (BlockDriverState *) -1;
+ fprintf(stderr, "\nPoisonned BdrvChild %p\n", child);
+ } else {
+ g_free(child);
+ }
}
void bdrv_root_unref_child(BdrvChild *child)
---
block/block-backend.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/block/block-backend.c b/block/block-backend.c
index 681b240b1268..ed9434e236b9 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -756,6 +756,7 @@ void blk_remove_bs(BlockBackend *blk)
{
ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
BlockDriverState *bs;
+ BdrvChild *root;
notifier_list_notify(&blk->remove_bs_notifiers, blk);
if (tgm->throttle_state) {
@@ -768,8 +769,9 @@ void blk_remove_bs(BlockBackend *blk)
blk_update_root_state(blk);
- bdrv_root_unref_child(blk->root);
+ root = blk->root;
blk->root = NULL;
+ bdrv_root_unref_child(root);
}
/*
From: "Steven Rostedt (VMware)" <rostedt(a)goodmis.org>
The trigger code is picky in how it can be disabled as there may be
dependencies between different events and synthetic events. Change the order
on how triggers are reset.
1) Reset triggers of all synthetic events first
2) Remove triggers with actions attached to them
3) Remove all other triggers
If this order isn't followed, then some triggers will not be reset, and an
error may happen because a trigger is busy.
Cc: stable(a)vger.kernel.org
Fixes: cfa0963dc474f ("kselftests/ftrace : Add event trigger testcases")
Acked-by: Masami Hiramatsu <mhiramat(a)kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt(a)goodmis.org>
---
.../testing/selftests/ftrace/test.d/functions | 21 ++++++++++++++++---
1 file changed, 18 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index 2a4f16fc9819..8393b1c06027 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -15,14 +15,29 @@ reset_tracer() { # reset the current tracer
echo nop > current_tracer
}
-reset_trigger() { # reset all current setting triggers
- grep -v ^# events/*/*/trigger |
+reset_trigger_file() {
+ # remove action triggers first
+ grep -H ':on[^:]*(' $@ |
+ while read line; do
+ cmd=`echo $line | cut -f2- -d: | cut -f1 -d" "`
+ file=`echo $line | cut -f1 -d:`
+ echo "!$cmd" >> $file
+ done
+ grep -Hv ^# $@ |
while read line; do
cmd=`echo $line | cut -f2- -d: | cut -f1 -d" "`
- echo "!$cmd" > `echo $line | cut -f1 -d:`
+ file=`echo $line | cut -f1 -d:`
+ echo "!$cmd" > $file
done
}
+reset_trigger() { # reset all current setting triggers
+ if [ -d events/synthetic ]; then
+ reset_trigger_file events/synthetic/*/trigger
+ fi
+ reset_trigger_file events/*/*/trigger
+}
+
reset_events_filter() { # reset all current setting filters
grep -v ^none events/*/*/filter |
while read line; do
--
2.17.0
Update support for the UV kernel to accommodate Intel BIOS changes in
NVDIMM alignment, which caused UV BIOS to align the memory boundaries
on different blocks than the previous UV standard of 2GB.
--