Implement dmabuf-backed registered buffers. To register them, the user should specify IO_REGBUF_TYPE_DMABUF for the registration and pass the desired dmabuf fd and a file for which it should be registered.
From there, it can be used with io_uring read/write requests (IORING_OP_{READ,WRITE}_FIXED) as normal. The requests should be issued against the file specified during registration, and otherwise they'll be failed. The user should also be prepared to handle spurious -EAGAIN by reissuing the request.
Internally, dmabuf registered buffers are an opt-in feature for io_uring request opcodes, and opcodes should pass a special flag on import to use them.
Suggested-by: David Wei dw@davidwei.uk Suggested-by: Vishal Verma vishal1.verma@intel.com Suggested-by: Tushar Gohad tushar.gohad@intel.com Signed-off-by: Pavel Begunkov asml.silence@gmail.com --- include/linux/io_uring_types.h | 5 + include/uapi/linux/io_uring.h | 6 +- io_uring/io_uring.c | 3 +- io_uring/rsrc.c | 163 +++++++++++++++++++++++++++++++-- io_uring/rsrc.h | 30 +++++- io_uring/rw.c | 4 +- 6 files changed, 200 insertions(+), 11 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7aee83e5ea0e..f9a33099421a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -10,6 +10,7 @@
struct iou_loop_params; struct io_uring_bpf_ops; +struct io_dmabuf_map;
enum { /* @@ -567,6 +568,7 @@ enum { REQ_F_IMPORT_BUFFER_BIT, REQ_F_SQE_COPIED_BIT, REQ_F_IOPOLL_BIT, + REQ_F_DROP_DMABUF_BIT,
/* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -662,6 +664,8 @@ enum { REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), /* request must be iopolled to completion (set in ->issue()) */ REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT), + /* there is a dma map attached to request that needs to be dropped */ + REQ_F_DROP_DMABUF = IO_REQ_FLAG(REQ_F_DROP_DMABUF_BIT), };
struct io_tw_req { @@ -786,6 +790,7 @@ struct io_kiocb { /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; struct io_wq_work work; + struct io_dmabuf_map *dmabuf_map;
struct io_big_cqe { u64 extra1; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 05c3fd078767..3cd6ce28f9f5 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -810,6 +810,7 @@ enum io_uring_rsrc_reg_flags { enum io_uring_regbuf_type { IO_REGBUF_TYPE_EMPTY, IO_REGBUF_TYPE_UADDR, + IO_REGBUF_TYPE_DMABUF,
__IO_REGBUF_TYPE_MAX, }; @@ -819,7 +820,10 @@ struct io_uring_regbuf_desc { __u32 flags; __u64 size; __u64 uaddr; - __u64 __resv[7]; + + __s32 dmabuf_fd; + __s32 target_fd; + __u64 __resv[6]; };
/* Skip updating fd indexes set to this value in the fd table */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6068448a5aaa..e8a8eef45c3f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -108,7 +108,7 @@
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ REQ_F_REISSUE | REQ_F_POLLED | \ - IO_REQ_CLEAN_FLAGS) + IO_REQ_CLEAN_FLAGS | REQ_F_DROP_DMABUF)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -1115,6 +1115,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, io_queue_next(req); if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); + io_req_drop_dmabuf(req); } io_put_file(req); io_req_put_rsrc_nodes(req); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f8696b01cb54..bb61de308543 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -10,6 +10,7 @@ #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/io_uring/cmd.h> +#include <linux/io_dmabuf_token.h>
#include <uapi/linux/io_uring.h>
@@ -789,6 +790,93 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, return true; }
+struct io_regbuf_dma { + struct io_dmabuf_token token; + struct file *target_file; +}; + +static void io_release_reg_dmabuf(void *priv) +{ + struct io_regbuf_dma *db = priv; + + fput(db->target_file); + io_dmabuf_token_release(&db->token); +} + +static struct io_rsrc_node *io_register_dmabuf(struct io_ring_ctx *ctx, + struct io_uring_regbuf_desc *desc) +{ + struct io_rsrc_node *node = NULL; + struct io_mapped_ubuf *imu = NULL; + struct io_regbuf_dma *regbuf = NULL; + struct file *target_file = NULL; + struct dma_buf *dmabuf = NULL; + int ret; + + if (!IS_ENABLED(CONFIG_DMABUF_TOKEN)) + return ERR_PTR(-EOPNOTSUPP); + if (desc->uaddr || desc->size) + return ERR_PTR(-EINVAL); + + ret = -ENOMEM; + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!node) + return ERR_PTR(-ENOMEM); + imu = io_alloc_imu(ctx, 0); + if (!imu) + goto err; + regbuf = kzalloc(sizeof(*regbuf), GFP_KERNEL); + if (!regbuf) + goto err; + + ret = -EBADF; + target_file = fget(desc->target_fd); + if (!target_file) + goto err; + + dmabuf = dma_buf_get(desc->dmabuf_fd); + if (IS_ERR(dmabuf)) { + ret = PTR_ERR(dmabuf); + dmabuf = NULL; + goto err; + } + if (dmabuf->size > SZ_1G) { + ret = -EINVAL; + goto err; + } + + ret = io_dmabuf_token_create(target_file, ®buf->token, dmabuf, + DMA_BIDIRECTIONAL); + if (ret) + goto err; + + regbuf->target_file = target_file; + imu->nr_bvecs = 1; + imu->ubuf = 0; + imu->len = dmabuf->size; + imu->folio_shift = 0; + imu->release = io_release_reg_dmabuf; + imu->priv = regbuf; + imu->flags = IO_REGBUF_F_DMABUF; + imu->dir = IO_BUF_DEST | IO_BUF_SOURCE; + refcount_set(&imu->refs, 1); + node->buf = imu; + dma_buf_put(dmabuf); + return node; +err: + kfree(regbuf); + if (imu) + io_free_imu(ctx, imu); + if (node) + io_cache_free(&ctx->node_cache, node); + if (target_file) + fput(target_file); + if (dmabuf) + dma_buf_put(dmabuf); + return ERR_PTR(ret); +} + + static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, struct io_uring_regbuf_desc *desc, 
struct page **last_hpage) @@ -808,6 +896,12 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (!mem_is_zero(&desc->__resv, sizeof(desc->__resv))) return ERR_PTR(-EINVAL);
+ if (desc->type == IO_REGBUF_TYPE_DMABUF) + return io_register_dmabuf(ctx, desc); + + if (desc->dmabuf_fd || desc->target_fd) + return ERR_PTR(-EINVAL); + if (desc->type == IO_REGBUF_TYPE_EMPTY) { if (uaddr || size) return ERR_PTR(-EFAULT); @@ -1134,9 +1228,57 @@ static int io_import_kbuf(int ddir, struct iov_iter *iter, return 0; }
-static int io_import_fixed(int ddir, struct iov_iter *iter, +void io_drop_dmabuf_node(struct io_kiocb *req) +{ + struct io_mapped_ubuf *imu; + + if (!IS_ENABLED(CONFIG_DMABUF_TOKEN)) + return; + if (WARN_ON_ONCE(req->buf_node->type != IORING_RSRC_BUFFER)) + return; + imu = req->buf_node->buf; + if (WARN_ON_ONCE(!(imu->flags & IO_REGBUF_F_DMABUF))) + return; + io_dmabuf_map_drop(req->dmabuf_map); +} + +static int io_import_dmabuf(struct io_kiocb *req, + int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) + size_t len, size_t offset, + unsigned issue_flags) +{ + struct io_regbuf_dma *db = imu->priv; + struct io_dmabuf_map *map; + + if (!IS_ENABLED(CONFIG_DMABUF_TOKEN)) + return -EOPNOTSUPP; + if (!len) + return -EFAULT; + if (req->file != db->target_file) + return -EBADF; + + map = io_dmabuf_get_map(&db->token); + if (unlikely(!map)) { + if (!(issue_flags & IO_URING_F_UNLOCKED)) + return -EAGAIN; + map = io_dmabuf_create_map(&db->token); + if (IS_ERR(map)) + return PTR_ERR(map); + } + + req->dmabuf_map = map; + req->flags |= REQ_F_DROP_DMABUF; + iov_iter_dmabuf_map(iter, ddir, map, offset, len); + return 0; +} + +static int io_import_fixed(struct io_kiocb *req, + int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len, + unsigned issue_flags, + unsigned import_flags) { const struct bio_vec *bvec; size_t folio_mask; @@ -1156,6 +1298,12 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
offset = buf_addr - imu->ubuf;
+ if (imu->flags & IO_REGBUF_F_DMABUF) { + if (!(import_flags & IO_REGBUF_IMPORT_ALLOW_DMABUF)) + return -EFAULT; + return io_import_dmabuf(req, ddir, iter, imu, len, offset, + issue_flags); + } if (imu->flags & IO_REGBUF_F_KBUF) return io_import_kbuf(ddir, iter, imu, len, offset);
@@ -1209,16 +1357,17 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, return NULL; }
-int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, +int __io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, u64 buf_addr, size_t len, int ddir, - unsigned issue_flags) + unsigned issue_flags, unsigned import_flags) { struct io_rsrc_node *node;
node = io_find_buf_node(req, issue_flags); if (!node) return -EFAULT; - return io_import_fixed(ddir, iter, node->buf, buf_addr, len); + return io_import_fixed(req, ddir, iter, node->buf, buf_addr, len, + issue_flags, import_flags); }
/* Lock two rings at once. The rings must be different! */ @@ -1577,7 +1726,9 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, iovec_off = vec->nr - nr_iovs; iov = vec->iovec + iovec_off;
- if (imu->flags & IO_REGBUF_F_KBUF) { + if (imu->flags & IO_REGBUF_F_DMABUF) { + return -EOPNOTSUPP; + } else if (imu->flags & IO_REGBUF_F_KBUF) { int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs);
if (unlikely(ret)) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8d48195faf9d..005a273ba107 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -25,6 +25,11 @@ struct io_rsrc_node {
enum { IO_REGBUF_F_KBUF = 1, + IO_REGBUF_F_DMABUF = 2, +}; + +enum { + IO_REGBUF_IMPORT_ALLOW_DMABUF = 1, };
struct io_mapped_ubuf { @@ -60,9 +65,19 @@ int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, unsigned issue_flags); +int __io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags, unsigned import_flags); + +static inline int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, u64 buf_addr, size_t len, int ddir, - unsigned issue_flags); + unsigned issue_flags) +{ + return __io_import_reg_buf(req, iter, buf_addr, len, ddir, + issue_flags, 0); +} + int io_import_reg_vec(int ddir, struct iov_iter *iter, struct io_kiocb *req, struct iou_vec *vec, unsigned nr_iovs, unsigned issue_flags); @@ -147,4 +162,17 @@ static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv) io_vec_free(iv); }
+void io_drop_dmabuf_node(struct io_kiocb *req); + +static inline void io_req_drop_dmabuf(struct io_kiocb *req) +{ + if (!IS_ENABLED(CONFIG_DMABUF_TOKEN)) + return; + if (!(req->flags & REQ_F_DROP_DMABUF)) + return; + if (WARN_ON_ONCE(!(req->flags & REQ_F_BUF_NODE))) + return; + io_drop_dmabuf_node(req); +} + #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index 20654deff84d..d50da5fa8bb9 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -380,8 +380,8 @@ static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags, if (io->bytes_done) return 0;
- ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, - issue_flags); + ret = __io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, + issue_flags, IO_REGBUF_IMPORT_ALLOW_DMABUF); iov_iter_save_state(&io->iter, &io->iter_state); return ret; }