We need to pass more information to buffer registration than we can fit into a single struct iovec. This patch allows users to optionally pass struct io_uring_regbuf_desc. Apart from having more space for future use cases, it also introduces registration types.
Currently, the type can be either IO_REGBUF_TYPE_UADDR, which mirrors the iovec path, or IO_REGBUF_TYPE_EMPTY, which leaves a buffer table slot empty. The next patch introduces a dmabuf-backed type, and the type field can also be useful for other extensions such as splicing a list of user addresses (i.e. iovec[]), interoperability with zcrx, or kernel-allocated memory, as was brought up by Christoph. Note that the type only represents a registration option, which is distinct from how io_uring internally stores the buffer.
The flags field is not used yet, but it is always useful to have; e.g. we can use it to encode read-only / write-only restrictions.
Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- include/uapi/linux/io_uring.h | 27 +++++++++++++- io_uring/rsrc.c | 69 ++++++++++++++++++++++------------- 2 files changed, 69 insertions(+), 27 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 17ac1b785440..05c3fd078767 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -790,13 +790,38 @@ struct io_uring_rsrc_update {
struct io_uring_rsrc_update2 { __u32 offset; - __u32 resv; + __u32 flags; __aligned_u64 data; __aligned_u64 tags; __u32 nr; __u32 resv2; };
+/* struct io_uring_rsrc_update2::flags */ +enum io_uring_rsrc_reg_flags { + /* + * Use the extended descriptor format for buffer updates, + * see struct io_uring_regbuf_desc + */ + IORING_RSRC_UPDATE_EXTENDED = 1U << 1, +}; + +/* Buffer registration type, passed in struct io_uring_regbuf_desc::type */ +enum io_uring_regbuf_type { + IO_REGBUF_TYPE_EMPTY, + IO_REGBUF_TYPE_UADDR, + + __IO_REGBUF_TYPE_MAX, +}; + +struct io_uring_regbuf_desc { + __u32 type; /* enum io_uring_regbuf_type */ + __u32 flags; + __u64 size; + __u64 uaddr; + __u64 __resv[7]; +}; + /* Skip updating fd indexes set to this value in the fd table */ #define IORING_REGISTER_FILES_SKIP (-2)
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index ba00238941ed..f8696b01cb54 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -27,11 +27,6 @@ struct io_rsrc_update { u32 offset; };
-struct io_uring_regbuf_desc { - __u64 uaddr; - __u64 size; -}; - static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, struct io_uring_regbuf_desc *desc, struct page **last_hpage); @@ -46,9 +41,12 @@ static void io_iov_to_regbuf_desc(const struct iovec *iov, struct io_uring_regbuf_desc *desc) { *desc = (struct io_uring_regbuf_desc) { + .type = IO_REGBUF_TYPE_UADDR, .uaddr = (u64)iov->iov_base, .size = iov->iov_len, }; + if (!desc->uaddr) + desc->type = IO_REGBUF_TYPE_EMPTY; }
int __io_account_mem(struct user_struct *user, unsigned long nr_pages) @@ -236,6 +234,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return -ENXIO; if (up->offset + nr_args > ctx->file_table.data.nr) return -EINVAL; + if (up->flags) + return -EINVAL;
for (done = 0; done < nr_args; done++) { u64 tag = 0; @@ -292,10 +292,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned int nr_args) { + bool extended = up->flags & IORING_RSRC_UPDATE_EXTENDED; u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec fast_iov, *iov; struct page *last_hpage = NULL; - struct iovec __user *uvec; u64 user_data = up->data; __u32 done; int i, err; @@ -304,29 +303,49 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, return -ENXIO; if (up->offset + nr_args > ctx->buf_table.nr) return -EINVAL; + if (up->flags & ~IORING_RSRC_UPDATE_EXTENDED) + return -EINVAL;
for (done = 0; done < nr_args; done++) { struct io_uring_regbuf_desc desc; struct io_rsrc_node *node; u64 tag = 0;
- uvec = u64_to_user_ptr(user_data); - iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); - if (IS_ERR(iov)) { - err = PTR_ERR(iov); - break; - } if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { err = -EFAULT; break; }
- io_iov_to_regbuf_desc(iov, &desc); + if (extended) { + if (copy_from_user(&desc, u64_to_user_ptr(user_data), + sizeof(desc))) { + err = -EFAULT; + break; + } + user_data += sizeof(desc); + } else { + struct iovec __user *uvec = u64_to_user_ptr(user_data); + struct iovec fast_iov, *iov; + + if (io_is_compat(ctx)) + user_data += sizeof(struct compat_iovec); + else + user_data += sizeof(struct iovec); + + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); + if (IS_ERR(iov)) { + err = PTR_ERR(iov); + break; + } + io_iov_to_regbuf_desc(iov, &desc); + } + node = io_sqe_buffer_register(ctx, &desc, &last_hpage); if (IS_ERR(node)) { err = PTR_ERR(node); break; } + if (tag) { if (!node) { err = -EINVAL; @@ -337,10 +356,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(up->offset + done, ctx->buf_table.nr); io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; - if (io_is_compat(ctx)) - user_data += sizeof(struct compat_iovec); - else - user_data += sizeof(struct iovec); } return done ? done : err; } @@ -375,7 +390,7 @@ int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, memset(&up, 0, sizeof(up)); if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) return -EFAULT; - if (up.resv || up.resv2) + if (up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); } @@ -389,7 +404,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (copy_from_user(&up, arg, sizeof(up))) return -EFAULT; - if (!up.nr || up.resv || up.resv2) + if (!up.nr || up.resv2) return -EINVAL; return __io_register_rsrc_update(ctx, type, &up, up.nr); } @@ -489,12 +504,9 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) struct io_uring_rsrc_update2 up2; int ret;
+ memset(&up2, 0, sizeof(up2)); up2.offset = up->offset; up2.data = up->arg; - up2.nr = 0; - up2.tags = 0; - up2.resv = 0; - up2.resv2 = 0;
if (up->offset == IORING_FILE_INDEX_ALLOC) { ret = io_files_update_with_index_alloc(req, issue_flags); @@ -791,8 +803,13 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, struct io_imu_folio_data data; bool coalesced = false;
- if (!uaddr) { - if (size) + if (desc->type >= __IO_REGBUF_TYPE_MAX) + return ERR_PTR(-EINVAL); + if (!mem_is_zero(&desc->__resv, sizeof(desc->__resv))) + return ERR_PTR(-EINVAL); + + if (desc->type == IO_REGBUF_TYPE_EMPTY) { + if (uaddr || size) return ERR_PTR(-EFAULT); /* remove the buffer without installing a new one */ return NULL;