The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6050fa4c84cc93ae509f5105f585a429dffc5633 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel(a)i-love.sakura.ne.jp>
Date: Wed, 24 Nov 2021 19:47:40 +0900
Subject: [PATCH] loop: don't hold lo_mutex during __loop_clr_fd()
syzbot is reporting circular locking problem at __loop_clr_fd() [1], for
commit 87579e9b7d8dc36e ("loop: use worker per cgroup instead of kworker")
is calling destroy_workqueue() with lo->lo_mutex held.
Since all functions where lo->lo_state matters are already checking
lo->lo_state with lo->lo_mutex held (in order to avoid racing with e.g.
ioctl(LOOP_CTL_REMOVE)), and __loop_clr_fd() can be called from either
ioctl(LOOP_CLR_FD) xor close(), lo->lo_state == Lo_rundown is considered
as an exclusive lock for __loop_clr_fd(). Therefore, hold lo->lo_mutex
inside __loop_clr_fd() only when asserting/updating lo->lo_state.
Since ioctl(LOOP_CLR_FD) depends on lo->lo_state == Lo_bound, a valid
lo->lo_backing_file must have been assigned by ioctl(LOOP_SET_FD) or
ioctl(LOOP_CONFIGURE). Thus, we can remove lo->lo_backing_file test,
and convert __loop_clr_fd() into a void function.
Link: https://syzkaller.appspot.com/bug?extid=63614029dfb79abd4383 [1]
Reported-by: syzbot <syzbot+63614029dfb79abd4383(a)syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Link: https://lore.kernel.org/r/8ebe3b2e-8975-7f26-0620-7144a3b8b8cd@i-love.sakur…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0954ea8cf9e3..ba76319b5544 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1082,13 +1082,10 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
return error;
}
-static int __loop_clr_fd(struct loop_device *lo, bool release)
+static void __loop_clr_fd(struct loop_device *lo, bool release)
{
- struct file *filp = NULL;
+ struct file *filp;
gfp_t gfp = lo->old_gfp_mask;
- int err = 0;
- bool partscan = false;
- int lo_number;
struct loop_worker *pos, *worker;
/*
@@ -1103,17 +1100,14 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
* became visible.
*/
+ /*
+ * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close()
+ * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if
+ * lo->lo_state has changed while waiting for lo->lo_mutex.
+ */
mutex_lock(&lo->lo_mutex);
- if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
- err = -ENXIO;
- goto out_unlock;
- }
-
- filp = lo->lo_backing_file;
- if (filp == NULL) {
- err = -EINVAL;
- goto out_unlock;
- }
+ BUG_ON(lo->lo_state != Lo_rundown);
+ mutex_unlock(&lo->lo_mutex);
if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
blk_queue_write_cache(lo->lo_queue, false, false);
@@ -1134,6 +1128,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
del_timer_sync(&lo->timer);
spin_lock_irq(&lo->lo_lock);
+ filp = lo->lo_backing_file;
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
@@ -1153,12 +1148,11 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
module_put(THIS_MODULE);
blk_mq_unfreeze_queue(lo->lo_queue);
- partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
- lo_number = lo->lo_number;
disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
-out_unlock:
- mutex_unlock(&lo->lo_mutex);
- if (partscan) {
+
+ if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
+ int err;
+
/*
* open_mutex has been held already in release path, so don't
* acquire it if this function is called in such case.
@@ -1174,24 +1168,20 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
mutex_unlock(&lo->lo_disk->open_mutex);
if (err)
pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
- __func__, lo_number, err);
+ __func__, lo->lo_number, err);
/* Device is gone, no point in returning error */
- err = 0;
}
/*
* lo->lo_state is set to Lo_unbound here after above partscan has
- * finished.
- *
- * There cannot be anybody else entering __loop_clr_fd() as
- * lo->lo_backing_file is already cleared and Lo_rundown state
- * protects us from all the other places trying to change the 'lo'
- * device.
+ * finished. There cannot be anybody else entering __loop_clr_fd() as
+ * Lo_rundown state protects us from all the other places trying to
+ * change the 'lo' device.
*/
- mutex_lock(&lo->lo_mutex);
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART;
+ mutex_lock(&lo->lo_mutex);
lo->lo_state = Lo_unbound;
mutex_unlock(&lo->lo_mutex);
@@ -1200,9 +1190,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
* lo_mutex triggers a circular lock dependency possibility warning as
* fput can take open_mutex which is usually taken before lo_mutex.
*/
- if (filp)
- fput(filp);
- return err;
+ fput(filp);
}
static int loop_clr_fd(struct loop_device *lo)
@@ -1234,7 +1222,8 @@ static int loop_clr_fd(struct loop_device *lo)
lo->lo_state = Lo_rundown;
mutex_unlock(&lo->lo_mutex);
- return __loop_clr_fd(lo, false);
+ __loop_clr_fd(lo, false);
+ return 0;
}
static int
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6050fa4c84cc93ae509f5105f585a429dffc5633 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel(a)i-love.sakura.ne.jp>
Date: Wed, 24 Nov 2021 19:47:40 +0900
Subject: [PATCH] loop: don't hold lo_mutex during __loop_clr_fd()
syzbot is reporting circular locking problem at __loop_clr_fd() [1], for
commit 87579e9b7d8dc36e ("loop: use worker per cgroup instead of kworker")
is calling destroy_workqueue() with lo->lo_mutex held.
Since all functions where lo->lo_state matters are already checking
lo->lo_state with lo->lo_mutex held (in order to avoid racing with e.g.
ioctl(LOOP_CTL_REMOVE)), and __loop_clr_fd() can be called from either
ioctl(LOOP_CLR_FD) xor close(), lo->lo_state == Lo_rundown is considered
as an exclusive lock for __loop_clr_fd(). Therefore, hold lo->lo_mutex
inside __loop_clr_fd() only when asserting/updating lo->lo_state.
Since ioctl(LOOP_CLR_FD) depends on lo->lo_state == Lo_bound, a valid
lo->lo_backing_file must have been assigned by ioctl(LOOP_SET_FD) or
ioctl(LOOP_CONFIGURE). Thus, we can remove lo->lo_backing_file test,
and convert __loop_clr_fd() into a void function.
Link: https://syzkaller.appspot.com/bug?extid=63614029dfb79abd4383 [1]
Reported-by: syzbot <syzbot+63614029dfb79abd4383(a)syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel(a)I-love.SAKURA.ne.jp>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Link: https://lore.kernel.org/r/8ebe3b2e-8975-7f26-0620-7144a3b8b8cd@i-love.sakur…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0954ea8cf9e3..ba76319b5544 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1082,13 +1082,10 @@ static int loop_configure(struct loop_device *lo, fmode_t mode,
return error;
}
-static int __loop_clr_fd(struct loop_device *lo, bool release)
+static void __loop_clr_fd(struct loop_device *lo, bool release)
{
- struct file *filp = NULL;
+ struct file *filp;
gfp_t gfp = lo->old_gfp_mask;
- int err = 0;
- bool partscan = false;
- int lo_number;
struct loop_worker *pos, *worker;
/*
@@ -1103,17 +1100,14 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
* became visible.
*/
+ /*
+ * Since this function is called upon "ioctl(LOOP_CLR_FD)" xor "close()
+ * after ioctl(LOOP_CLR_FD)", it is a sign of something going wrong if
+ * lo->lo_state has changed while waiting for lo->lo_mutex.
+ */
mutex_lock(&lo->lo_mutex);
- if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
- err = -ENXIO;
- goto out_unlock;
- }
-
- filp = lo->lo_backing_file;
- if (filp == NULL) {
- err = -EINVAL;
- goto out_unlock;
- }
+ BUG_ON(lo->lo_state != Lo_rundown);
+ mutex_unlock(&lo->lo_mutex);
if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
blk_queue_write_cache(lo->lo_queue, false, false);
@@ -1134,6 +1128,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
del_timer_sync(&lo->timer);
spin_lock_irq(&lo->lo_lock);
+ filp = lo->lo_backing_file;
lo->lo_backing_file = NULL;
spin_unlock_irq(&lo->lo_lock);
@@ -1153,12 +1148,11 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
module_put(THIS_MODULE);
blk_mq_unfreeze_queue(lo->lo_queue);
- partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
- lo_number = lo->lo_number;
disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
-out_unlock:
- mutex_unlock(&lo->lo_mutex);
- if (partscan) {
+
+ if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
+ int err;
+
/*
* open_mutex has been held already in release path, so don't
* acquire it if this function is called in such case.
@@ -1174,24 +1168,20 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
mutex_unlock(&lo->lo_disk->open_mutex);
if (err)
pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
- __func__, lo_number, err);
+ __func__, lo->lo_number, err);
/* Device is gone, no point in returning error */
- err = 0;
}
/*
* lo->lo_state is set to Lo_unbound here after above partscan has
- * finished.
- *
- * There cannot be anybody else entering __loop_clr_fd() as
- * lo->lo_backing_file is already cleared and Lo_rundown state
- * protects us from all the other places trying to change the 'lo'
- * device.
+ * finished. There cannot be anybody else entering __loop_clr_fd() as
+ * Lo_rundown state protects us from all the other places trying to
+ * change the 'lo' device.
*/
- mutex_lock(&lo->lo_mutex);
lo->lo_flags = 0;
if (!part_shift)
lo->lo_disk->flags |= GENHD_FL_NO_PART;
+ mutex_lock(&lo->lo_mutex);
lo->lo_state = Lo_unbound;
mutex_unlock(&lo->lo_mutex);
@@ -1200,9 +1190,7 @@ static int __loop_clr_fd(struct loop_device *lo, bool release)
* lo_mutex triggers a circular lock dependency possibility warning as
* fput can take open_mutex which is usually taken before lo_mutex.
*/
- if (filp)
- fput(filp);
- return err;
+ fput(filp);
}
static int loop_clr_fd(struct loop_device *lo)
@@ -1234,7 +1222,8 @@ static int loop_clr_fd(struct loop_device *lo)
lo->lo_state = Lo_rundown;
mutex_unlock(&lo->lo_mutex);
- return __loop_clr_fd(lo, false);
+ __loop_clr_fd(lo, false);
+ return 0;
}
static int
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 791f3465c4afde02d7f16cf7424ca87070b69396 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 14 Jan 2022 11:59:10 +0000
Subject: [PATCH] io_uring: fix UAF due to missing POLLFREE handling
Fixes a problem described in 50252e4b5e989
("aio: fix use-after-free due to missing POLLFREE handling")
and copies the approach used there.
In short, we have to forcibly eject a poll entry when we meet POLLFREE.
We can't rely on io_poll_get_ownership() as can't wait for potentially
running tw handlers, so we use the fact that wqs are RCU freed. See
Eric's patch and comments for more details.
Reported-by: Eric Biggers <ebiggers(a)google.com>
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Reported-and-tested-by: syzbot+5426c7ed6868c705ca14(a)syzkaller.appspotmail.com
Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.16421612…
[axboe: drop non-functional change from patch]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fa3277844d2e..422d6de48688 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5462,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
- struct wait_queue_head *head = poll->head;
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
+ if (head) {
+ spin_lock_irq(&head->lock);
+ list_del_init(&poll->wait.entry);
+ poll->head = NULL;
+ spin_unlock_irq(&head->lock);
+ }
}
static void io_poll_remove_entries(struct io_kiocb *req)
@@ -5475,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req)
struct io_poll_iocb *poll = io_poll_get_single(req);
struct io_poll_iocb *poll_double = io_poll_get_double(req);
- if (poll->head)
- io_poll_remove_entry(poll);
- if (poll_double && poll_double->head)
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ io_poll_remove_entry(poll);
+ if (poll_double)
io_poll_remove_entry(poll_double);
+ rcu_read_unlock();
}
/*
@@ -5618,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
wait);
__poll_t mask = key_to_poll(key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0);
+
+ /*
+ * If the waitqueue is being freed early but someone is already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
+
/* for instances that support it check for an event match first */
if (mask && !(mask & poll->events))
return 0;
The patch below does not apply to the 5.16-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 791f3465c4afde02d7f16cf7424ca87070b69396 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 14 Jan 2022 11:59:10 +0000
Subject: [PATCH] io_uring: fix UAF due to missing POLLFREE handling
Fixes a problem described in 50252e4b5e989
("aio: fix use-after-free due to missing POLLFREE handling")
and copies the approach used there.
In short, we have to forcibly eject a poll entry when we meet POLLFREE.
We can't rely on io_poll_get_ownership() as can't wait for potentially
running tw handlers, so we use the fact that wqs are RCU freed. See
Eric's patch and comments for more details.
Reported-by: Eric Biggers <ebiggers(a)google.com>
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Reported-and-tested-by: syzbot+5426c7ed6868c705ca14(a)syzkaller.appspotmail.com
Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.16421612…
[axboe: drop non-functional change from patch]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fa3277844d2e..422d6de48688 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5462,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
- struct wait_queue_head *head = poll->head;
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
+ if (head) {
+ spin_lock_irq(&head->lock);
+ list_del_init(&poll->wait.entry);
+ poll->head = NULL;
+ spin_unlock_irq(&head->lock);
+ }
}
static void io_poll_remove_entries(struct io_kiocb *req)
@@ -5475,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req)
struct io_poll_iocb *poll = io_poll_get_single(req);
struct io_poll_iocb *poll_double = io_poll_get_double(req);
- if (poll->head)
- io_poll_remove_entry(poll);
- if (poll_double && poll_double->head)
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ io_poll_remove_entry(poll);
+ if (poll_double)
io_poll_remove_entry(poll_double);
+ rcu_read_unlock();
}
/*
@@ -5618,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
wait);
__poll_t mask = key_to_poll(key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0);
+
+ /*
+ * If the waitqueue is being freed early but someone is already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
+
/* for instances that support it check for an event match first */
if (mask && !(mask & poll->events))
return 0;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 791f3465c4afde02d7f16cf7424ca87070b69396 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 14 Jan 2022 11:59:10 +0000
Subject: [PATCH] io_uring: fix UAF due to missing POLLFREE handling
Fixes a problem described in 50252e4b5e989
("aio: fix use-after-free due to missing POLLFREE handling")
and copies the approach used there.
In short, we have to forcibly eject a poll entry when we meet POLLFREE.
We can't rely on io_poll_get_ownership() as can't wait for potentially
running tw handlers, so we use the fact that wqs are RCU freed. See
Eric's patch and comments for more details.
Reported-by: Eric Biggers <ebiggers(a)google.com>
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Reported-and-tested-by: syzbot+5426c7ed6868c705ca14(a)syzkaller.appspotmail.com
Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.16421612…
[axboe: drop non-functional change from patch]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fa3277844d2e..422d6de48688 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5462,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
- struct wait_queue_head *head = poll->head;
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
+ if (head) {
+ spin_lock_irq(&head->lock);
+ list_del_init(&poll->wait.entry);
+ poll->head = NULL;
+ spin_unlock_irq(&head->lock);
+ }
}
static void io_poll_remove_entries(struct io_kiocb *req)
@@ -5475,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req)
struct io_poll_iocb *poll = io_poll_get_single(req);
struct io_poll_iocb *poll_double = io_poll_get_double(req);
- if (poll->head)
- io_poll_remove_entry(poll);
- if (poll_double && poll_double->head)
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ io_poll_remove_entry(poll);
+ if (poll_double)
io_poll_remove_entry(poll_double);
+ rcu_read_unlock();
}
/*
@@ -5618,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
wait);
__poll_t mask = key_to_poll(key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0);
+
+ /*
+ * If the waitqueue is being freed early but someone is already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
+
/* for instances that support it check for an event match first */
if (mask && !(mask & poll->events))
return 0;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 791f3465c4afde02d7f16cf7424ca87070b69396 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence(a)gmail.com>
Date: Fri, 14 Jan 2022 11:59:10 +0000
Subject: [PATCH] io_uring: fix UAF due to missing POLLFREE handling
Fixes a problem described in 50252e4b5e989
("aio: fix use-after-free due to missing POLLFREE handling")
and copies the approach used there.
In short, we have to forcibly eject a poll entry when we meet POLLFREE.
We can't rely on io_poll_get_ownership() as can't wait for potentially
running tw handlers, so we use the fact that wqs are RCU freed. See
Eric's patch and comments for more details.
Reported-by: Eric Biggers <ebiggers(a)google.com>
Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org
Reported-and-tested-by: syzbot+5426c7ed6868c705ca14(a)syzkaller.appspotmail.com
Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL")
Signed-off-by: Pavel Begunkov <asml.silence(a)gmail.com>
Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.16421612…
[axboe: drop non-functional change from patch]
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fa3277844d2e..422d6de48688 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5462,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
{
- struct wait_queue_head *head = poll->head;
+ struct wait_queue_head *head = smp_load_acquire(&poll->head);
- spin_lock_irq(&head->lock);
- list_del_init(&poll->wait.entry);
- poll->head = NULL;
- spin_unlock_irq(&head->lock);
+ if (head) {
+ spin_lock_irq(&head->lock);
+ list_del_init(&poll->wait.entry);
+ poll->head = NULL;
+ spin_unlock_irq(&head->lock);
+ }
}
static void io_poll_remove_entries(struct io_kiocb *req)
@@ -5475,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req)
struct io_poll_iocb *poll = io_poll_get_single(req);
struct io_poll_iocb *poll_double = io_poll_get_double(req);
- if (poll->head)
- io_poll_remove_entry(poll);
- if (poll_double && poll_double->head)
+ /*
+ * While we hold the waitqueue lock and the waitqueue is nonempty,
+ * wake_up_pollfree() will wait for us. However, taking the waitqueue
+ * lock in the first place can race with the waitqueue being freed.
+ *
+ * We solve this as eventpoll does: by taking advantage of the fact that
+ * all users of wake_up_pollfree() will RCU-delay the actual free. If
+ * we enter rcu_read_lock() and see that the pointer to the queue is
+ * non-NULL, we can then lock it without the memory being freed out from
+ * under us.
+ *
+ * Keep holding rcu_read_lock() as long as we hold the queue lock, in
+ * case the caller deletes the entry from the queue, leaving it empty.
+ * In that case, only RCU prevents the queue memory from being freed.
+ */
+ rcu_read_lock();
+ io_poll_remove_entry(poll);
+ if (poll_double)
io_poll_remove_entry(poll_double);
+ rcu_read_unlock();
}
/*
@@ -5618,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
wait);
__poll_t mask = key_to_poll(key);
+ if (unlikely(mask & POLLFREE)) {
+ io_poll_mark_cancelled(req);
+ /* we have to kick tw in case it's not already */
+ io_poll_execute(req, 0);
+
+ /*
+ * If the waitqueue is being freed early but someone is already
+ * holds ownership over it, we have to tear down the request as
+ * best we can. That means immediately removing the request from
+ * its waitqueue and preventing all further accesses to the
+ * waitqueue via the request.
+ */
+ list_del_init(&poll->wait.entry);
+
+ /*
+ * Careful: this *must* be the last step, since as soon
+ * as req->head is NULL'ed out, the request can be
+ * completed and freed, since aio_poll_complete_work()
+ * will no longer need to take the waitqueue lock.
+ */
+ smp_store_release(&poll->head, NULL);
+ return 1;
+ }
+
/* for instances that support it check for an event match first */
if (mask && !(mask & poll->events))
return 0;
This series is the essential subset of "[PATCH v3 0/5] KVM: nVMX: Fix
Windows 11 + WSL2 + Enlightened VMCS" which got merged upstream recently.
It fixes Windows 11 guests with enabled Hyper-V role on KVM when eVMCS is
in use.
Vitaly Kuznetsov (3):
KVM: nVMX: Rename vmcs_to_field_offset{,_table}
KVM: nVMX: Implement evmcs_field_offset() suitable for handle_vmread()
KVM: nVMX: Allow VMREAD when Enlightened VMCS is in use
arch/x86/kvm/vmx/evmcs.c | 3 +-
arch/x86/kvm/vmx/evmcs.h | 44 +++++++++++++++++++++++------
arch/x86/kvm/vmx/nested.c | 59 +++++++++++++++++++++++++++------------
arch/x86/kvm/vmx/vmcs12.c | 4 +--
arch/x86/kvm/vmx/vmcs12.h | 6 ++--
5 files changed, 83 insertions(+), 33 deletions(-)
--
2.34.1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From a37d9a17f099072fe4d3a9048b0321978707a918 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il(a)gmail.com>
Date: Thu, 20 Jan 2022 23:53:04 +0200
Subject: [PATCH] fsnotify: invalidate dcache before IN_DELETE event
Apparently, there are some applications that use IN_DELETE event as an
invalidation mechanism and expect that if they try to open a file with
the name reported with the delete event, that it should not contain the
content of the deleted file.
Commit 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of
d_delete()") moved the fsnotify delete hook before d_delete() so fsnotify
will have access to a positive dentry.
This allowed a race where opening the deleted file via cached dentry
is now possible after receiving the IN_DELETE event.
To fix the regression, create a new hook fsnotify_delete() that takes
the unlinked inode as an argument and use a helper d_delete_notify() to
pin the inode, so we can pass it to fsnotify_delete() after d_delete().
Backporting hint: this regression is from v5.3. Although patch will
apply with only trivial conflicts to v5.4 and v5.10, it won't build,
because fsnotify_delete() implementation is different in each of those
versions (see fsnotify_link()).
A follow up patch will fix the fsnotify_unlink/rmdir() calls in pseudo
filesystem that do not need to call d_delete().
Link: https://lore.kernel.org/r/20220120215305.282577-1-amir73il@gmail.com
Reported-by: Ivan Delalande <colona(a)arista.com>
Link: https://lore.kernel.org/linux-fsdevel/YeNyzoDM5hP5LtGW@visor/
Fixes: 49246466a989 ("fsnotify: move fsnotify_nameremove() hook out of d_delete()")
Cc: stable(a)vger.kernel.org # v5.3+
Signed-off-by: Amir Goldstein <amir73il(a)gmail.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a5bd6926f7ff..7807b28b7892 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3086,10 +3086,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
btrfs_inode_lock(inode, 0);
err = btrfs_delete_subvolume(dir, dentry);
btrfs_inode_unlock(inode, 0);
- if (!err) {
- fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
- }
+ if (!err)
+ d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
diff --git a/fs/namei.c b/fs/namei.c
index d81f04f8d818..4ed0e41feab7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3974,13 +3974,12 @@ int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir,
dentry->d_inode->i_flags |= S_DEAD;
dont_mount(dentry);
detach_mounts(dentry);
- fsnotify_rmdir(dir, dentry);
out:
inode_unlock(dentry->d_inode);
dput(dentry);
if (!error)
- d_delete(dentry);
+ d_delete_notify(dir, dentry);
return error;
}
EXPORT_SYMBOL(vfs_rmdir);
@@ -4102,7 +4101,6 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
if (!error) {
dont_mount(dentry);
detach_mounts(dentry);
- fsnotify_unlink(dir, dentry);
}
}
}
@@ -4110,9 +4108,11 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir,
inode_unlock(target);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
- if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
+ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
+ fsnotify_unlink(dir, dentry);
+ } else if (!error) {
fsnotify_link_count(target);
- d_delete(dentry);
+ d_delete_notify(dir, dentry);
}
return error;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 3a2d7dc3c607..bb8467cd11ae 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -224,6 +224,43 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode,
dir, &new_dentry->d_name, 0);
}
+/*
+ * fsnotify_delete - @dentry was unlinked and unhashed
+ *
+ * Caller must make sure that dentry->d_name is stable.
+ *
+ * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode
+ * as this may be called after d_delete() and old_dentry may be negative.
+ */
+static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
+ struct dentry *dentry)
+{
+ __u32 mask = FS_DELETE;
+
+ if (S_ISDIR(inode->i_mode))
+ mask |= FS_ISDIR;
+
+ fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name,
+ 0);
+}
+
+/**
+ * d_delete_notify - delete a dentry and call fsnotify_delete()
+ * @dentry: The dentry to delete
+ *
+ * This helper is used to guaranty that the unlinked inode cannot be found
+ * by lookup of this name after fsnotify_delete() event has been delivered.
+ */
+static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+
+ ihold(inode);
+ d_delete(dentry);
+ fsnotify_delete(dir, inode, dentry);
+ iput(inode);
+}
+
/*
* fsnotify_unlink - 'name' was unlinked
*
@@ -231,10 +268,10 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode,
*/
static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
{
- /* Expected to be called before d_delete() */
- WARN_ON_ONCE(d_is_negative(dentry));
+ if (WARN_ON_ONCE(d_is_negative(dentry)))
+ return;
- fsnotify_dirent(dir, dentry, FS_DELETE);
+ fsnotify_delete(dir, d_inode(dentry), dentry);
}
/*
@@ -258,10 +295,10 @@ static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
*/
static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
{
- /* Expected to be called before d_delete() */
- WARN_ON_ONCE(d_is_negative(dentry));
+ if (WARN_ON_ONCE(d_is_negative(dentry)))
+ return;
- fsnotify_dirent(dir, dentry, FS_DELETE | FS_ISDIR);
+ fsnotify_delete(dir, d_inode(dentry), dentry);
}
/*