Calling kiocb_set_cancel_fn() without knowing whether the caller submitted a struct kiocb or a struct aio_kiocb is unsafe. Fix this by introducing the cancel_kiocb() method in struct file_operations. The following call trace illustrates that without this patch an out-of-bounds write happens if I/O is submitted by io_uring (from a phone with an ARM CPU and kernel 6.1):
WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
Call trace:
 kiocb_set_cancel_fn+0x9c/0xa8
 ffs_epfile_read_iter+0x144/0x1d0
 io_read+0x19c/0x498
 io_issue_sqe+0x118/0x27c
 io_submit_sqes+0x25c/0x5fc
 __arm64_sys_io_uring_enter+0x104/0xab0
 invoke_syscall+0x58/0x11c
 el0_svc_common+0xb4/0xf4
 do_el0_svc+0x2c/0xb0
 el0_svc+0x2c/0xa4
 el0t_64_sync_handler+0x68/0xb4
 el0t_64_sync+0x1a4/0x1a8
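For context, a minimal sketch (not part of the patch; struct layout abbreviated) of why that call is an out-of-bounds write: kiocb_set_cancel_fn() assumes its kiocb argument is embedded in a struct aio_kiocb, which only holds for I/O submitted through fs/aio.c. io_uring allocates a bare struct kiocb, so the container_of() below reaches outside the actual allocation:

struct aio_kiocb {
	struct kiocb		rw;	/* io_uring allocates only this part */
	struct kioctx		*ki_ctx;
	kiocb_cancel_fn		*ki_cancel;
	struct list_head	ki_list;
};

void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
	/* Invalid if @iocb is not embedded in a struct aio_kiocb: */
	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);

	req->ki_cancel = cancel;	/* out-of-bounds write for io_uring */
}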
Several ideas in this patch come from the following patch: Christoph Hellwig, "[PATCH 08/32] aio: replace kiocb_set_cancel_fn with a cancel_kiocb file operation", May 2018 (https://lore.kernel.org/all/20180515194833.6906-9-hch@lst.de/).
Cc: Christoph Hellwig <hch@lst.de>
Cc: Avi Kivity <avi@scylladb.com>
Cc: Sandeep Dhavale <dhavale@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: stable@vger.kernel.org
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 drivers/usb/gadget/function/f_fs.c |  19 +----
 drivers/usb/gadget/legacy/inode.c  |  12 +--
 fs/aio.c                           | 129 +++++++++++++++++++----------
 include/linux/aio.h                |   7 --
 include/linux/fs.h                 |   1 +
 5 files changed, 90 insertions(+), 78 deletions(-)
Changes compared to v1:
- Fixed a race between request completion and addition to the list of
  active requests.
- Changed the return type of .cancel_kiocb() from int into void.
- Simplified the .cancel_kiocb() implementations.
- Introduced the ki_opcode member in struct aio_kiocb.
- aio_cancel_and_del() checks .ki_opcode before accessing union members.
- Left out the include/include/mm changes.
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 6bff6cb93789..4837e3071263 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -31,7 +31,6 @@
 #include <linux/usb/composite.h>
 #include <linux/usb/functionfs.h>
-#include <linux/aio.h>
 #include <linux/kthread.h>
 #include <linux/poll.h>
 #include <linux/eventfd.h>
@@ -1157,23 +1156,16 @@ ffs_epfile_open(struct inode *inode, struct file *file)
 	return stream_open(inode, file);
 }
 
-static int ffs_aio_cancel(struct kiocb *kiocb)
+static void ffs_epfile_cancel_kiocb(struct kiocb *kiocb)
 {
 	struct ffs_io_data *io_data = kiocb->private;
 	struct ffs_epfile *epfile = kiocb->ki_filp->private_data;
 	unsigned long flags;
-	int value;
 
 	spin_lock_irqsave(&epfile->ffs->eps_lock, flags);
-	if (io_data && io_data->ep && io_data->req)
-		value = usb_ep_dequeue(io_data->ep, io_data->req);
-	else
-		value = -EINVAL;
-
+	usb_ep_dequeue(io_data->ep, io_data->req);
 	spin_unlock_irqrestore(&epfile->ffs->eps_lock, flags);
-
-	return value;
 }
 
 static ssize_t ffs_epfile_write_iter(struct kiocb *kiocb, struct iov_iter *from)
@@ -1198,9 +1190,6 @@ static ssize_t ffs_epfile_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 
 	kiocb->private = p;
 
-	if (p->aio)
-		kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
-
 	res = ffs_epfile_io(kiocb->ki_filp, p);
 	if (res == -EIOCBQUEUED)
 		return res;
@@ -1242,9 +1231,6 @@ static ssize_t ffs_epfile_read_iter(struct kiocb *kiocb, struct iov_iter *to)
 
 	kiocb->private = p;
 
-	if (p->aio)
-		kiocb_set_cancel_fn(kiocb, ffs_aio_cancel);
-
 	res = ffs_epfile_io(kiocb->ki_filp, p);
 	if (res == -EIOCBQUEUED)
 		return res;
@@ -1356,6 +1342,7 @@ static const struct file_operations ffs_epfile_operations = {
 	.release =	ffs_epfile_release,
 	.unlocked_ioctl = ffs_epfile_ioctl,
 	.compat_ioctl = compat_ptr_ioctl,
+	.cancel_kiocb = ffs_epfile_cancel_kiocb,
 };
 
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index 03179b1880fd..c2cf7fca6937 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -22,7 +22,6 @@
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/kthread.h>
-#include <linux/aio.h>
 #include <linux/uio.h>
 #include <linux/refcount.h>
 #include <linux/delay.h>
@@ -446,23 +445,18 @@ struct kiocb_priv {
 	unsigned		actual;
 };
 
-static int ep_aio_cancel(struct kiocb *iocb)
+static void ep_cancel_kiocb(struct kiocb *iocb)
 {
 	struct kiocb_priv	*priv = iocb->private;
 	struct ep_data		*epdata;
-	int			value;
 
 	local_irq_disable();
 	epdata = priv->epdata;
 	// spin_lock(&epdata->dev->lock);
 	if (likely(epdata && epdata->ep && priv->req))
-		value = usb_ep_dequeue (epdata->ep, priv->req);
-	else
-		value = -EINVAL;
+		usb_ep_dequeue(epdata->ep, priv->req);
 	// spin_unlock(&epdata->dev->lock);
 	local_irq_enable();
-
-	return value;
 }
 
 static void ep_user_copy_worker(struct work_struct *work)
@@ -537,7 +531,6 @@ static ssize_t ep_aio(struct kiocb *iocb,
 	iocb->private = priv;
 	priv->iocb = iocb;
 
-	kiocb_set_cancel_fn(iocb, ep_aio_cancel);
 	get_ep(epdata);
 	priv->epdata = epdata;
 	priv->actual = 0;
@@ -709,6 +702,7 @@ static const struct file_operations ep_io_operations = {
 	.unlocked_ioctl = ep_ioctl,
 	.read_iter =	ep_read_iter,
 	.write_iter =	ep_write_iter,
+	.cancel_kiocb = ep_cancel_kiocb,
 };
 
 /* ENDPOINT INITIALIZATION
diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..9dc0be703aa6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -203,7 +203,6 @@ struct aio_kiocb {
 	};
 
 	struct kioctx		*ki_ctx;
-	kiocb_cancel_fn		*ki_cancel;
 
 	struct io_event		ki_res;
 
@@ -211,6 +210,8 @@ struct aio_kiocb {
 					 * for cancellation */
 	refcount_t		ki_refcnt;
 
+	u16			ki_opcode;	/* IOCB_CMD_* */
+
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
 	 * this is the underlying eventfd context to deliver events to.
@@ -587,22 +588,6 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
-{
-	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
-	struct kioctx *ctx = req->ki_ctx;
-	unsigned long flags;
-
-	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
-		return;
-
-	spin_lock_irqsave(&ctx->ctx_lock, flags);
-	list_add_tail(&req->ki_list, &ctx->active_reqs);
-	req->ki_cancel = cancel;
-	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
-}
-EXPORT_SYMBOL(kiocb_set_cancel_fn);
-
 /*
  * free_ioctx() should be RCU delayed to synchronize against the RCU
  * protected lookup_ioctx() and also needs process context to call
@@ -634,6 +619,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 	queue_rcu_work(system_wq, &ctx->free_rwork);
 }
 
+static void aio_cancel_and_del(struct aio_kiocb *req);
+
 /*
  * When this function runs, the kioctx has been removed from the "hash table"
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
@@ -649,8 +636,7 @@ static void free_ioctx_users(struct percpu_ref *ref)
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
 				       struct aio_kiocb, ki_list);
-		req->ki_cancel(&req->rw);
-		list_del_init(&req->ki_list);
+		aio_cancel_and_del(req);
 	}
 
 	spin_unlock_irq(&ctx->ctx_lock);
@@ -1552,6 +1538,24 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
 	return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
 }
 
+static void aio_add_rw_to_active_reqs(struct kiocb *req)
+{
+	struct aio_kiocb *aio = container_of(req, struct aio_kiocb, rw);
+	struct kioctx *ctx = aio->ki_ctx;
+	unsigned long flags;
+
+	/*
+	 * If the .cancel_kiocb() callback has been set, add the request
+	 * to the list of active requests.
+	 */
+	if (!req->ki_filp->f_op->cancel_kiocb)
+		return;
+
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+	list_add_tail(&aio->ki_list, &ctx->active_reqs);
+	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
+
 static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
 {
 	switch (ret) {
@@ -1593,8 +1597,10 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
 	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
-	if (!ret)
+	if (!ret) {
+		aio_add_rw_to_active_reqs(req);
 		aio_rw_done(req, call_read_iter(file, req, &iter));
+	}
 	kfree(iovec);
 	return ret;
 }
@@ -1622,6 +1628,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
+		aio_add_rw_to_active_reqs(req);
 		if (S_ISREG(file_inode(file)->i_mode))
 			kiocb_start_write(req);
 		req->ki_flags |= IOCB_WRITE;
@@ -1715,6 +1722,54 @@ static void poll_iocb_unlock_wq(struct poll_iocb *req)
 	rcu_read_unlock();
 }
 
+/* Must be called only for IOCB_CMD_POLL requests. */
+static void aio_poll_cancel(struct aio_kiocb *aiocb)
+{
+	struct poll_iocb *req = &aiocb->poll;
+	struct kioctx *ctx = aiocb->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	if (!poll_iocb_lock_wq(req))
+		return;
+
+	WRITE_ONCE(req->cancelled, true);
+	if (!req->work_scheduled) {
+		schedule_work(&aiocb->poll.work);
+		req->work_scheduled = true;
+	}
+	poll_iocb_unlock_wq(req);
+}
+
+static void aio_cancel_and_del(struct aio_kiocb *req)
+{
+	void (*cancel_kiocb)(struct kiocb *) =
+		req->rw.ki_filp->f_op->cancel_kiocb;
+	struct kioctx *ctx = req->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	switch (req->ki_opcode) {
+	case IOCB_CMD_PREAD:
+	case IOCB_CMD_PWRITE:
+	case IOCB_CMD_PREADV:
+	case IOCB_CMD_PWRITEV:
+		if (cancel_kiocb)
+			cancel_kiocb(&req->rw);
+		break;
+	case IOCB_CMD_FSYNC:
+	case IOCB_CMD_FDSYNC:
+		break;
+	case IOCB_CMD_POLL:
+		aio_poll_cancel(req);
+		break;
+	default:
+		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
+	}
+
+	list_del_init(&req->ki_list);
+}
+
 static void aio_poll_complete_work(struct work_struct *work)
 {
 	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1727,11 +1782,11 @@ static void aio_poll_complete_work(struct work_struct *work)
 	mask = vfs_poll(req->file, &pt) & req->events;
 
 	/*
-	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
-	 * calling ->ki_cancel. We need the ctx_lock roundtrip here to
-	 * synchronize with them. In the cancellation case the list_del_init
-	 * itself is not actually needed, but harmless so we keep it in to
-	 * avoid further branches in the fast path.
+	 * aio_cancel_and_del() deletes the iocb from the active_reqs list. We
+	 * need the ctx_lock here to synchronize with aio_cancel_and_del(). In
+	 * the cancellation case the list_del_init itself is not actually
+	 * needed, but harmless so we keep it in to avoid further branches in
+	 * the fast path.
 	 */
 	spin_lock_irq(&ctx->ctx_lock);
 	if (poll_iocb_lock_wq(req)) {
@@ -1760,24 +1815,6 @@ static void aio_poll_complete_work(struct work_struct *work)
 	iocb_put(iocb);
 }
 
-/* assumes we are called with irqs disabled */
-static int aio_poll_cancel(struct kiocb *iocb)
-{
-	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
-	struct poll_iocb *req = &aiocb->poll;
-
-	if (poll_iocb_lock_wq(req)) {
-		WRITE_ONCE(req->cancelled, true);
-		if (!req->work_scheduled) {
-			schedule_work(&aiocb->poll.work);
-			req->work_scheduled = true;
-		}
-		poll_iocb_unlock_wq(req);
-	} /* else, the request was force-cancelled by POLLFREE already */
-
-	return 0;
-}
-
 static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		void *key)
 {
@@ -1945,7 +1982,6 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 		 * active_reqs so that it can be cancelled if needed.
 		 */
 		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
-		aiocb->ki_cancel = aio_poll_cancel;
 	}
 	if (on_queue)
 		poll_iocb_unlock_wq(req);
@@ -1993,6 +2029,8 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 	req->ki_res.res = 0;
 	req->ki_res.res2 = 0;
 
+	req->ki_opcode = iocb->aio_lio_opcode;
+
 	switch (iocb->aio_lio_opcode) {
 	case IOCB_CMD_PREAD:
 		return aio_read(&req->rw, iocb, false, compat);
@@ -2189,8 +2227,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	/* TODO: use a hash or array, this sucks. */
 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
 		if (kiocb->ki_res.obj == obj) {
-			ret = kiocb->ki_cancel(&kiocb->rw);
-			list_del_init(&kiocb->ki_list);
+			aio_cancel_and_del(kiocb);
 			break;
 		}
 	}
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 86892a4fe7c8..2aa6d0be3171 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -4,20 +4,13 @@
 
 #include <linux/aio_abi.h>
 
-struct kioctx;
-struct kiocb;
 struct mm_struct;
 
-typedef int (kiocb_cancel_fn)(struct kiocb *);
-
 /* prototypes */
 #ifdef CONFIG_AIO
 extern void exit_aio(struct mm_struct *mm);
-void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
 #else
 static inline void exit_aio(struct mm_struct *mm) { }
-static inline void kiocb_set_cancel_fn(struct kiocb *req,
-				       kiocb_cancel_fn *cancel) { }
 #endif /* CONFIG_AIO */
 
 #endif /* __LINUX__AIO_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..7ec714878637 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2021,6 +2021,7 @@ struct file_operations {
 	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 	int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
 				unsigned int poll_flags);
+	void (*cancel_kiocb)(struct kiocb *);
 } __randomize_layout;
 
 /* Wrap a directory iterator that needs exclusive inode access */
On 2/8/24 2:55 PM, Bart Van Assche wrote:
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 6bff6cb93789..4837e3071263 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -31,7 +31,6 @@
 #include <linux/usb/composite.h>
 #include <linux/usb/functionfs.h>
-#include <linux/aio.h>
 #include <linux/kthread.h>
 #include <linux/poll.h>
 #include <linux/eventfd.h>
@@ -1157,23 +1156,16 @@ ffs_epfile_open(struct inode *inode, struct file *file)
 	return stream_open(inode, file);
 }
 
-static int ffs_aio_cancel(struct kiocb *kiocb)
+static void ffs_epfile_cancel_kiocb(struct kiocb *kiocb)
 {
 	struct ffs_io_data *io_data = kiocb->private;
 	struct ffs_epfile *epfile = kiocb->ki_filp->private_data;
 	unsigned long flags;
-	int value;
 
 	spin_lock_irqsave(&epfile->ffs->eps_lock, flags);
-	if (io_data && io_data->ep && io_data->req)
-		value = usb_ep_dequeue(io_data->ep, io_data->req);
-	else
-		value = -EINVAL;
-
+	usb_ep_dequeue(io_data->ep, io_data->req);
 	spin_unlock_irqrestore(&epfile->ffs->eps_lock, flags);
-
-	return value;
 }
I'm assuming the NULL checks can go because it's just the async parts now?
@@ -634,6 +619,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
 	queue_rcu_work(system_wq, &ctx->free_rwork);
 }
 
+static void aio_cancel_and_del(struct aio_kiocb *req);
Just move the function higher up? It doesn't have any dependencies.
@@ -1552,6 +1538,24 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
 	return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
 }
 
+static void aio_add_rw_to_active_reqs(struct kiocb *req)
+{
+	struct aio_kiocb *aio = container_of(req, struct aio_kiocb, rw);
+	struct kioctx *ctx = aio->ki_ctx;
+	unsigned long flags;
+
+	/*
+	 * If the .cancel_kiocb() callback has been set, add the request
+	 * to the list of active requests.
+	 */
+	if (!req->ki_filp->f_op->cancel_kiocb)
+		return;
+
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+	list_add_tail(&aio->ki_list, &ctx->active_reqs);
+	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
This can use spin_lock_irq(), always called from process context.
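A sketch of what that simplification could look like, assuming submission really always runs in process context:

static void aio_add_rw_to_active_reqs(struct kiocb *req)
{
	struct aio_kiocb *aio = container_of(req, struct aio_kiocb, rw);
	struct kioctx *ctx = aio->ki_ctx;

	if (!req->ki_filp->f_op->cancel_kiocb)
		return;

	/* Process context only, so the irqsave/irqrestore pair is not needed. */
	spin_lock_irq(&ctx->ctx_lock);
	list_add_tail(&aio->ki_list, &ctx->active_reqs);
	spin_unlock_irq(&ctx->ctx_lock);
}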
+/* Must be called only for IOCB_CMD_POLL requests. */
+static void aio_poll_cancel(struct aio_kiocb *aiocb)
+{
+	struct poll_iocb *req = &aiocb->poll;
+	struct kioctx *ctx = aiocb->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	if (!poll_iocb_lock_wq(req))
+		return;
+
+	WRITE_ONCE(req->cancelled, true);
+	if (!req->work_scheduled) {
+		schedule_work(&aiocb->poll.work);
+		req->work_scheduled = true;
+	}
+	poll_iocb_unlock_wq(req);
+}
Not your code, it's just moved, but this looks racy. Might not matter, and obviously beyond the scope of this change.
+static void aio_cancel_and_del(struct aio_kiocb *req)
+{
+	void (*cancel_kiocb)(struct kiocb *) =
+		req->rw.ki_filp->f_op->cancel_kiocb;
+	struct kioctx *ctx = req->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	switch (req->ki_opcode) {
+	case IOCB_CMD_PREAD:
+	case IOCB_CMD_PWRITE:
+	case IOCB_CMD_PREADV:
+	case IOCB_CMD_PWRITEV:
+		if (cancel_kiocb)
+			cancel_kiocb(&req->rw);
+		break;
+	case IOCB_CMD_FSYNC:
+	case IOCB_CMD_FDSYNC:
+		break;
+	case IOCB_CMD_POLL:
+		aio_poll_cancel(req);
+		break;
+	default:
+		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
+	}
+
+	list_del_init(&req->ki_list);
+}
Why don't you just keep ki_cancel() and just change it to a void return that takes an aio_kiocb? Then you don't need this odd switch, or adding an opcode field just for this. That seems cleaner.
Outside of these little nits, looks alright. I'd still love to kill the silly cancel code just for the gadget bits, but that's for another day. And since the gadget and aio code basically never changes, a cleaned up variant of this patch should be trivial enough to backport to stable, so I don't think we need to worry about doing a fixup first.
On 2/8/24 14:14, Jens Axboe wrote:
On 2/8/24 2:55 PM, Bart Van Assche wrote:
-static int ffs_aio_cancel(struct kiocb *kiocb)
+static void ffs_epfile_cancel_kiocb(struct kiocb *kiocb)
 {
 	struct ffs_io_data *io_data = kiocb->private;
 	struct ffs_epfile *epfile = kiocb->ki_filp->private_data;
 	unsigned long flags;
-	int value;
 
 	spin_lock_irqsave(&epfile->ffs->eps_lock, flags);
-	if (io_data && io_data->ep && io_data->req)
-		value = usb_ep_dequeue(io_data->ep, io_data->req);
-	else
-		value = -EINVAL;
-
+	usb_ep_dequeue(io_data->ep, io_data->req);
 	spin_unlock_irqrestore(&epfile->ffs->eps_lock, flags);
-
-	return value;
 }
I'm assuming the NULL checks can go because it's just the async parts now?
Probably. I will look into this.
+static void aio_cancel_and_del(struct aio_kiocb *req);
Just move the function higher up? It doesn't have any dependencies.
aio_cancel_and_del() calls aio_poll_cancel(). aio_poll_cancel() calls poll_iocb_lock_wq(). poll_iocb_lock_wq() is defined below the first call of aio_cancel_and_del(). It's probably possible to get rid of that function declaration but a nontrivial amount of code would have to be moved.
@@ -1552,6 +1538,24 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
 	return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
 }
 
+static void aio_add_rw_to_active_reqs(struct kiocb *req)
+{
+	struct aio_kiocb *aio = container_of(req, struct aio_kiocb, rw);
+	struct kioctx *ctx = aio->ki_ctx;
+	unsigned long flags;
+
+	/*
+	 * If the .cancel_kiocb() callback has been set, add the request
+	 * to the list of active requests.
+	 */
+	if (!req->ki_filp->f_op->cancel_kiocb)
+		return;
+
+	spin_lock_irqsave(&ctx->ctx_lock, flags);
+	list_add_tail(&aio->ki_list, &ctx->active_reqs);
+	spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+}
This can use spin_lock_irq(), always called from process context.
I will make this change.
+static void aio_cancel_and_del(struct aio_kiocb *req)
+{
+	void (*cancel_kiocb)(struct kiocb *) =
+		req->rw.ki_filp->f_op->cancel_kiocb;
+	struct kioctx *ctx = req->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	switch (req->ki_opcode) {
+	case IOCB_CMD_PREAD:
+	case IOCB_CMD_PWRITE:
+	case IOCB_CMD_PREADV:
+	case IOCB_CMD_PWRITEV:
+		if (cancel_kiocb)
+			cancel_kiocb(&req->rw);
+		break;
+	case IOCB_CMD_FSYNC:
+	case IOCB_CMD_FDSYNC:
+		break;
+	case IOCB_CMD_POLL:
+		aio_poll_cancel(req);
+		break;
+	default:
+		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
+	}
+
+	list_del_init(&req->ki_list);
+}
Why don't you just keep ki_cancel() and just change it to a void return that takes an aio_kiocb? Then you don't need this odd switch, or adding an opcode field just for this. That seems cleaner.
Keeping .ki_cancel() means that it must be set before I/O starts and only if the I/O is submitted by libaio. That would require an approach to recognize whether or not a struct kiocb is embedded in a struct aio_kiocb, e.g. the patch that you posted as a reply to version one of this patch. Does anyone else want to comment on this?
Thanks,
Bart.
On 2/8/24 3:41 PM, Bart Van Assche wrote:
Just move the function higher up? It doesn't have any dependencies.
aio_cancel_and_del() calls aio_poll_cancel(). aio_poll_cancel() calls poll_iocb_lock_wq(). poll_iocb_lock_wq() is defined below the first call of aio_cancel_and_del(). It's probably possible to get rid of that function declaration but a nontrivial amount of code would have to be moved.
Ah yes, I mixed it up with the cancel add helper. Forward decl is fine then, keeps the patch smaller for backporting too.
+static void aio_cancel_and_del(struct aio_kiocb *req)
+{
+	void (*cancel_kiocb)(struct kiocb *) =
+		req->rw.ki_filp->f_op->cancel_kiocb;
+	struct kioctx *ctx = req->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	switch (req->ki_opcode) {
+	case IOCB_CMD_PREAD:
+	case IOCB_CMD_PWRITE:
+	case IOCB_CMD_PREADV:
+	case IOCB_CMD_PWRITEV:
+		if (cancel_kiocb)
+			cancel_kiocb(&req->rw);
+		break;
+	case IOCB_CMD_FSYNC:
+	case IOCB_CMD_FDSYNC:
+		break;
+	case IOCB_CMD_POLL:
+		aio_poll_cancel(req);
+		break;
+	default:
+		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
+	}
+
+	list_del_init(&req->ki_list);
+}
Why don't you just keep ki_cancel() and just change it to a void return that takes an aio_kiocb? Then you don't need this odd switch, or adding an opcode field just for this. That seems cleaner.
Keeping .ki_cancel() means that it must be set before I/O starts and only if the I/O is submitted by libaio. That would require an approach to recognize whether or not a struct kiocb is embedded in struct aio_kiocb, e.g. the patch that you posted as a reply on version one of this patch. Does anyone else want to comment on this?
Maybe I wasn't clear, but this is in aio_req. You already add an opcode in there, only to then add a switch here based on that opcode. Just have a cancel callback which takes aio_req as an argument. For POLL, this can be aio_poll_cancel(). Add a wrapper for read/write which then calls req->rw.ki_filp->f_op->cancel_kiocb(&req->rw); Then the above can become:
aio_rw_cancel(req)
{
	void (*cancel_kiocb)(struct kiocb *) =
		req->rw.ki_filp->f_op->cancel_kiocb;

	cancel_kiocb(&req->rw);
}
aio_read()
{
	...
	req->cancel = aio_rw_cancel;
	...
}
static void aio_cancel_and_del(struct aio_kiocb *req)
{
	struct kioctx *ctx = req->ki_ctx;

	lockdep_assert_held(&ctx->ctx_lock);

	if (req->cancel)
		req->cancel(req);

	list_del_init(&req->ki_list);
}
or something like that. fsync/fdsync clears ->cancel() to NULL, poll sets it to aio_poll_cancel(), and read/write like the above.
On 2/8/24 4:05 PM, Jens Axboe wrote:
Totally untested incremental. I think this is cleaner, and it's less code too.
diff --git a/fs/aio.c b/fs/aio.c
index 9dc0be703aa6..a7770f59269f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -202,6 +202,8 @@ struct aio_kiocb {
 		struct poll_iocb	poll;
 	};
 
+	void (*ki_cancel)(struct aio_kiocb *);
+
 	struct kioctx		*ki_ctx;
 
 	struct io_event		ki_res;
 
@@ -210,8 +212,6 @@ struct aio_kiocb {
 					 * for cancellation */
 	refcount_t		ki_refcnt;
 
-	u16			ki_opcode;	/* IOCB_CMD_* */
-
 	/*
 	 * If the aio_resfd field of the userspace iocb is not zero,
 	 * this is the underlying eventfd context to deliver events to.
@@ -1576,6 +1576,11 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
 	}
 }
 
+static void aio_rw_cancel(struct aio_kiocb *req)
+{
+	req->rw.ki_filp->f_op->cancel_kiocb(&req->rw);
+}
+
 static int aio_read(struct kiocb *req, const struct iocb *iocb,
 			bool vectored, bool compat)
 {
@@ -1722,50 +1727,14 @@ static void poll_iocb_unlock_wq(struct poll_iocb *req)
 	rcu_read_unlock();
 }
 
-/* Must be called only for IOCB_CMD_POLL requests. */
-static void aio_poll_cancel(struct aio_kiocb *aiocb)
-{
-	struct poll_iocb *req = &aiocb->poll;
-	struct kioctx *ctx = aiocb->ki_ctx;
-
-	lockdep_assert_held(&ctx->ctx_lock);
-
-	if (!poll_iocb_lock_wq(req))
-		return;
-
-	WRITE_ONCE(req->cancelled, true);
-	if (!req->work_scheduled) {
-		schedule_work(&aiocb->poll.work);
-		req->work_scheduled = true;
-	}
-	poll_iocb_unlock_wq(req);
-}
-
 static void aio_cancel_and_del(struct aio_kiocb *req)
 {
-	void (*cancel_kiocb)(struct kiocb *) =
-		req->rw.ki_filp->f_op->cancel_kiocb;
 	struct kioctx *ctx = req->ki_ctx;
 
 	lockdep_assert_held(&ctx->ctx_lock);
 
-	switch (req->ki_opcode) {
-	case IOCB_CMD_PREAD:
-	case IOCB_CMD_PWRITE:
-	case IOCB_CMD_PREADV:
-	case IOCB_CMD_PWRITEV:
-		if (cancel_kiocb)
-			cancel_kiocb(&req->rw);
-		break;
-	case IOCB_CMD_FSYNC:
-	case IOCB_CMD_FDSYNC:
-		break;
-	case IOCB_CMD_POLL:
-		aio_poll_cancel(req);
-		break;
-	default:
-		WARN_ONCE(true, "invalid aio operation %d\n", req->ki_opcode);
-	}
+	if (req->ki_cancel)
+		req->ki_cancel(req);
 
 	list_del_init(&req->ki_list);
 }
@@ -1922,6 +1891,25 @@ aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 	add_wait_queue(head, &pt->iocb->poll.wait);
 }
 
+/* Must be called only for IOCB_CMD_POLL requests. */
+static void aio_poll_cancel(struct aio_kiocb *aiocb)
+{
+	struct poll_iocb *req = &aiocb->poll;
+	struct kioctx *ctx = aiocb->ki_ctx;
+
+	lockdep_assert_held(&ctx->ctx_lock);
+
+	if (!poll_iocb_lock_wq(req))
+		return;
+
+	WRITE_ONCE(req->cancelled, true);
+	if (!req->work_scheduled) {
+		schedule_work(&aiocb->poll.work);
+		req->work_scheduled = true;
+	}
+	poll_iocb_unlock_wq(req);
+}
+
 static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
 {
 	struct kioctx *ctx = aiocb->ki_ctx;
@@ -2028,23 +2016,27 @@ static int __io_submit_one(struct kioctx *ctx, const struct iocb *iocb,
 	req->ki_res.data = iocb->aio_data;
 	req->ki_res.res = 0;
 	req->ki_res.res2 = 0;
-
-	req->ki_opcode = iocb->aio_lio_opcode;
+	req->ki_cancel = NULL;
 
 	switch (iocb->aio_lio_opcode) {
 	case IOCB_CMD_PREAD:
+		req->ki_cancel = aio_rw_cancel;
 		return aio_read(&req->rw, iocb, false, compat);
 	case IOCB_CMD_PWRITE:
+		req->ki_cancel = aio_rw_cancel;
 		return aio_write(&req->rw, iocb, false, compat);
 	case IOCB_CMD_PREADV:
+		req->ki_cancel = aio_rw_cancel;
 		return aio_read(&req->rw, iocb, true, compat);
 	case IOCB_CMD_PWRITEV:
+		req->ki_cancel = aio_rw_cancel;
 		return aio_write(&req->rw, iocb, true, compat);
 	case IOCB_CMD_FSYNC:
 		return aio_fsync(&req->fsync, iocb, false);
 	case IOCB_CMD_FDSYNC:
 		return aio_fsync(&req->fsync, iocb, true);
 	case IOCB_CMD_POLL:
+		req->ki_cancel = aio_poll_cancel;
 		return aio_poll(req, iocb);
 	default:
 		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
On Thu, Feb 08, 2024 at 03:14:43PM -0700, Jens Axboe wrote:
Why don't you just keep ki_cancel() and just change it to a void return that takes an aio_kiocb? Then you don't need this odd switch, or adding an opcode field just for this. That seems cleaner.
Outside of these little nits, looks alright. I'd still love to kill the silly cancel code just for the gadget bits, but that's for another day.
Well, I'd prefer to kill it if we can asap. Because then we can lose that annoying file_operations addition. That really rubs me the wrong way.
And since the gadget and aio code basically never changes, a cleaned up variant of the this patch should be trivial enough to backport to stable, so I don't think we need to worry about doing a fixup first.
On 2/9/24 2:34 AM, Christian Brauner wrote:
Why don't you just keep ki_cancel() and just change it to a void return that takes an aio_kiocb? Then you don't need this odd switch, or adding an opcode field just for this. That seems cleaner.
Outside of these little nits, looks alright. I'd still love to kill the silly cancel code just for the gadget bits, but that's for another day.
Well, I'd prefer to kill it if we can asap. Because then we can lose that annoying file_operations addition. That really rubs me the wrong way.
Greg, can you elaborate on how useful cancel is for gadgets? Is it one of those things that was wired up "just because", or does it have actually useful cases?
Because cancel, internally in aio, makes sense on eg a poll request. But we don't need extra support for that, that's all internal to aio. It doesn't make sense for O_DIRECT IO on a regular file, as there's no way to cancel that anyway.
Reason I'm asking is that we have this broken cancel infrastructure that we can either attempt to make work, at a cost of adding an operation to the file_operations struct, or we can just get rid of it.
On 2/9/24 10:12, Jens Axboe wrote:
Greg, can you elaborate on how useful cancel is for gadgets? Is it one of those things that was wired up "just because", or does it have actually useful cases?
I found two use cases in the Android Open Source Project and have submitted CLs that request to remove the io_cancel() calls from that code. Although I think I understand why these calls were added, the race conditions that these io_cancel() calls try to address cannot be addressed completely by calling io_cancel().
Bart.
On 2/12/24 11:28, Bart Van Assche wrote:
On 2/9/24 10:12, Jens Axboe wrote:
Greg, can you elaborate on how useful cancel is for gadgets? Is it one of those things that was wired up "just because", or does it have actually useful cases?
I found two use cases in the Android Open Source Project and have submitted CLs that request to remove the io_cancel() calls from that code. Although I think I understand why these calls were added, the race conditions that these io_cancel() calls try to address cannot be addressed completely by calling io_cancel().
(replying to my own e-mail) The adb daemon (adbd) maintainers asked me to preserve the I/O cancellation code in adbd because it was introduced recently in that code to fix an important bug. Does everyone agree with the approach of the untested patches below?
Thanks,
Bart.
-----------------------------------------------------------------------------
[PATCH 1/2] fs/aio: Restrict kiocb_set_cancel_fn() to I/O submitted by libaio
If kiocb_set_cancel_fn() is called for I/O submitted by io_uring, the following kernel warning appears:
WARNING: CPU: 3 PID: 368 at fs/aio.c:598 kiocb_set_cancel_fn+0x9c/0xa8
Call trace:
 kiocb_set_cancel_fn+0x9c/0xa8
 ffs_epfile_read_iter+0x144/0x1d0
 io_read+0x19c/0x498
 io_issue_sqe+0x118/0x27c
 io_submit_sqes+0x25c/0x5fc
 __arm64_sys_io_uring_enter+0x104/0xab0
 invoke_syscall+0x58/0x11c
 el0_svc_common+0xb4/0xf4
 do_el0_svc+0x2c/0xb0
 el0_svc+0x2c/0xa4
 el0t_64_sync_handler+0x68/0xb4
 el0t_64_sync+0x1a4/0x1a8
Fix this by setting the IOCB_AIO_RW flag for read and write I/O that is submitted by libaio.
Suggested-by: Jens Axboe <axboe@kernel.dk>
---
 fs/aio.c           | 9 ++++++++-
 include/linux/fs.h | 2 ++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/aio.c b/fs/aio.c
index bb2ff48991f3..da18dbcfcb22 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -593,6 +593,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 	struct kioctx *ctx = req->ki_ctx;
 	unsigned long flags;
 
+	/*
+	 * kiocb didn't come from aio or is neither a read nor a write, hence
+	 * ignore it.
+	 */
+	if (!(iocb->ki_flags & IOCB_AIO_RW))
+		return;
+
 	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
 		return;
 
@@ -1509,7 +1516,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	req->ki_complete = aio_complete_rw;
 	req->private = NULL;
 	req->ki_pos = iocb->aio_offset;
-	req->ki_flags = req->ki_filp->f_iocb_flags;
+	req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
 	if (iocb->aio_flags & IOCB_FLAG_RESFD)
 		req->ki_flags |= IOCB_EVENTFD;
 	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..c2dcc98cb4c8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -352,6 +352,8 @@ enum rw_hint {
  * unrelated IO (like cache flushing, new IO generation, etc).
  */
 #define IOCB_DIO_CALLER_COMP	(1 << 22)
+/* kiocb is a read or write operation submitted by fs/aio.c. */
+#define IOCB_AIO_RW		(1 << 23)
 
 /* for use in trace events */
 #define TRACE_IOCB_STRINGS \
-----------------------------------------------------------------------------
[PATCH 2/2] fs/aio: Make io_cancel() generate completions again
The following patch accidentally removed the code for delivering completions for cancelled reads and writes to user space: "[PATCH 04/33] aio: remove retry-based AIO" (https://lore.kernel.org/all/1363883754-27966-5-git-send-email-koverstreet@go...) From that patch:
-	if (kiocbIsCancelled(iocb)) {
-		ret = -EINTR;
-		aio_complete(iocb, ret, 0);
-		/* must not access the iocb after this */
-		goto out;
-	}
This leads to a leak in user space of a struct iocb. Hence this patch, which restores the code that reports to user space that a read or write has been cancelled successfully.
---
 fs/aio.c | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index da18dbcfcb22..28223f511931 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2165,14 +2165,11 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
 #endif
 
 /* sys_io_cancel:
- *	Attempts to cancel an iocb previously passed to io_submit. If
- *	the operation is successfully cancelled, the resulting event is
- *	copied into the memory pointed to by result without being placed
- *	into the completion queue and 0 is returned. May fail with
- *	-EFAULT if any of the data structures pointed to are invalid.
- *	May fail with -EINVAL if aio_context specified by ctx_id is
- *	invalid. May fail with -EAGAIN if the iocb specified was not
- *	cancelled. Will fail with -ENOSYS if not implemented.
+ *	Attempts to cancel an iocb previously passed to io_submit(). If the
+ *	operation is successfully cancelled 0 is returned. May fail with
+ *	-EFAULT if any of the data structures pointed to are invalid. May
+ *	fail with -EINVAL if aio_context specified by ctx_id is invalid. Will
+ *	fail with -ENOSYS if not implemented.
  */
 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		struct io_event __user *, result)
@@ -2203,14 +2200,12 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	if (!ret) {
-		/*
-		 * The result argument is no longer used - the io_event is
-		 * always delivered via the ring buffer. -EINPROGRESS indicates
-		 * cancellation is progress:
-		 */
-		ret = -EINPROGRESS;
-	}
+	/*
+	 * The result argument is no longer used - the io_event is always
+	 * delivered via the ring buffer.
+	 */
+	if (ret == 0 && kiocb->rw.ki_flags & IOCB_AIO_RW)
+		aio_complete_rw(&kiocb->rw, -EINTR);
 
 	percpu_ref_put(&ctx->users);
-----------------------------------------------------------------------------
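For what it's worth, a sketch of how user space would observe the restored semantics, assuming both patches above are applied and libaio is used (the function name and the minimal error handling are illustrative only):

#include <libaio.h>
#include <assert.h>
#include <errno.h>

/* Cancel a pending read/write and reap its completion from the ring. */
static void cancel_and_reap(io_context_t ctx, struct iocb *iocb)
{
	struct io_event ev;

	if (io_cancel(ctx, iocb, &ev) == 0) {
		/* The event is delivered via the ring, not via 'ev'. */
		int n = io_getevents(ctx, 1, 1, &ev, NULL);

		/* The cancelled request completes with -EINTR. */
		assert(n == 1 && ev.res == -EINTR);
	}
}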
On Tue, Feb 13, 2024 at 01:01:51PM -0800, Bart Van Assche wrote:
On 2/12/24 11:28, Bart Van Assche wrote:
On 2/9/24 10:12, Jens Axboe wrote:
Greg, can you elaborate on how useful cancel is for gadgets? Is it one of those things that was wired up "just because", or does it have actually useful cases?
I found two use cases in the Android Open Source Project and have submitted CLs that request to remove the io_cancel() calls from that code. Although I think I understand why these calls were added, the race conditions that these io_cancel() calls try to address cannot be addressed completely by calling io_cancel().
(replying to my own e-mail) The adb daemon (adbd) maintainers asked me to preserve the I/O cancellation code in adbd because it was introduced recently in that code to fix an important bug. Does everyone agree with the approach of the untested patches below?
But I mean, the cancellation code has seemingly been broken since forever according to your patch description. So their fix which relies on it actually fixes nothing? And they seemingly didn't notice until you told them that it's broken. Can we get a link to that stuff, please?
Btw, you should probably provide that context in the patch series you sent that Christoph and I responded to. Because I just stumbled upon this here and it provided context I was missing.