On Wed Apr 29, 2026 at 5:25 PM CEST, Pavel Begunkov wrote:
> Enable BIO_DMABUF_MAP backed requests. It creates a prp list for the
> dmabuf when it's mapped, which is then used to initialise requests.
>
> Suggested-by: Keith Busch <kbusch@kernel.org>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  drivers/nvme/host/pci.c | 282 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 282 insertions(+)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index db5fc9bf6627..d2629853a972 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -27,6 +27,8 @@
>  #include <linux/io-64-nonatomic-lo-hi.h>
>  #include <linux/io-64-nonatomic-hi-lo.h>
>  #include <linux/sed-opal.h>
> +#include <linux/io_dmabuf_token.h>
> +#include <linux/dma-resv.h>
>
>  #include "trace.h"
>  #include "nvme.h"
>
> @@ -393,6 +395,17 @@ struct nvme_queue {
>  	struct completion delete_done;
>  };
>
> +struct nvme_dmabuf_token {
> +	struct dma_buf_attachment *attach;
> +};
> +
> +struct nvme_dmabuf_map {
> +	struct io_dmabuf_map base;
> +	dma_addr_t *dma_list;
> +	struct sg_table *sgt;
> +	unsigned nr_entries;
> +};
> +
>  /* bits for iod->flags */
>  enum nvme_iod_flags {
>  	/* this command has been aborted by the timeout handler */
>
> @@ -854,6 +867,134 @@ static void nvme_free_descriptors(struct request *req)
>  	}
>  }
>
> +static void nvme_dmabuf_map_sync(struct nvme_dev *nvme_dev, struct request *req,
> +				 bool for_cpu)
> +{
> +	int length = blk_rq_payload_bytes(req);
> +	struct device *dev = nvme_dev->dev;
> +	enum dma_data_direction dma_dir;
> +	struct bio *bio = req->bio;
> +	struct nvme_dmabuf_map *map;
> +	dma_addr_t *dma_list;
> +	int offset, map_idx;
> +
> +	dma_dir = rq_data_dir(req) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
> +	map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
> +	dma_list = map->dma_list;
> +	offset = bio->bi_iter.bi_bvec_done;
> +	map_idx = offset / NVME_CTRL_PAGE_SIZE;
> +	length += offset & (NVME_CTRL_PAGE_SIZE - 1);
> +
> +	while (length > 0) {
> +		u64 dma_addr = dma_list[map_idx++];
> +
> +		if (for_cpu)
> +			__dma_sync_single_for_cpu(dev, dma_addr,
> +						  NVME_CTRL_PAGE_SIZE, dma_dir);
> +		else
> +			__dma_sync_single_for_device(dev, dma_addr,
> +						     NVME_CTRL_PAGE_SIZE, dma_dir);
> +		length -= NVME_CTRL_PAGE_SIZE;
> +	}
> +}
> +
> +static void nvme_rq_clean_dmabuf_map(struct nvme_dev *dev,
> +				     struct request *req)
> +{
> +	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> +	nvme_dmabuf_map_sync(dev, req, true);
> +	if (!(iod->flags & IOD_SINGLE_SEGMENT))
> +		nvme_free_descriptors(req);
> +}
> +
> +static blk_status_t nvme_rq_setup_dmabuf_map(struct request *req,
> +					     struct nvme_queue *nvmeq)
> +{
> +	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +	int length = blk_rq_payload_bytes(req);
> +	u64 dma_addr, prp1_dma, prp2_dma;
> +	struct bio *bio = req->bio;
> +	struct nvme_dmabuf_map *map;
> +	dma_addr_t *dma_list;
> +	dma_addr_t prp_dma;
> +	__le64 *prp_list;
> +	int i, map_idx;
> +	int offset;
> +
> +	nvme_dmabuf_map_sync(nvmeq->dev, req, false);
> +
> +	map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
> +	dma_list = map->dma_list;
> +	offset = bio->bi_iter.bi_bvec_done;
> +	map_idx = offset / NVME_CTRL_PAGE_SIZE;
> +	offset &= (NVME_CTRL_PAGE_SIZE - 1);
> +
> +	prp1_dma = dma_list[map_idx++] + offset;
> +
> +	length -= (NVME_CTRL_PAGE_SIZE - offset);
> +	if (length <= 0) {
> +		prp2_dma = 0;
> +		goto done;
> +	}
> +
> +	if (length <= NVME_CTRL_PAGE_SIZE) {
> +		prp2_dma = dma_list[map_idx];
> +		goto done;
> +	}
> +
> +	if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
> +	    NVME_SMALL_POOL_SIZE / sizeof(__le64))
> +		iod->flags |= IOD_SMALL_DESCRIPTOR;
> +
> +	prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
> +				  &prp_dma);
> +	if (!prp_list)
> +		return BLK_STS_RESOURCE;
> +	iod->descriptors[iod->nr_descriptors++] = prp_list;
> +	prp2_dma = prp_dma;
> +	i = 0;
> +	for (;;) {
> +		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
> +			__le64 *old_prp_list = prp_list;
> +
> +			prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
> +						  GFP_ATOMIC, &prp_dma);
> +			if (!prp_list)
> +				goto free_prps;
> +			iod->descriptors[iod->nr_descriptors++] = prp_list;
> +			prp_list[0] = old_prp_list[i - 1];
> +			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
> +			i = 1;
> +		}
> +
> +		dma_addr = dma_list[map_idx++];
> +		prp_list[i++] = cpu_to_le64(dma_addr);
> +		length -= NVME_CTRL_PAGE_SIZE;
> +		if (length <= 0)
> +			break;
> +	}
> +done:
> +	iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
> +	iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
> +	return BLK_STS_OK;
> +free_prps:
> +	nvme_free_descriptors(req);
> +	return BLK_STS_RESOURCE;
> +}
> +
> +static inline bool nvme_rq_is_dmabuf_attached(struct request *req)
> +{
> +	if (!IS_ENABLED(CONFIG_DMABUF_TOKEN))
> +		return false;
> +	return req->bio && bio_flagged(req->bio, BIO_DMABUF_MAP);
> +}
> +
>  static void nvme_free_prps(struct request *req, unsigned int attrs)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>
> @@ -932,6 +1073,11 @@ static void nvme_unmap_data(struct request *req)
>  	struct device *dma_dev = nvmeq->dev->dev;
>  	unsigned int attrs = 0;
>
> +	if (nvme_rq_is_dmabuf_attached(req)) {
> +		nvme_rq_clean_dmabuf_map(nvmeq->dev, req);
> +		return;
> +	}
> +
>  	if (iod->flags & IOD_SINGLE_SEGMENT) {
>  		static_assert(offsetof(union nvme_data_ptr, prp1) ==
>  				offsetof(union nvme_data_ptr, sgl.addr));
>
> @@ -1222,6 +1368,9 @@ static blk_status_t nvme_map_data(struct request *req)
>  	struct blk_dma_iter iter;
>  	blk_status_t ret;
>
> +	if (nvme_rq_is_dmabuf_attached(req))
> +		return nvme_rq_setup_dmabuf_map(req, nvmeq);
> +
>  	/*
>  	 * Try to skip the DMA iterator for single segment requests, as that
>  	 * significantly improves performances for small I/O sizes.
>
> @@ -2238,6 +2387,134 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
>  	return result;
>  }
>
> +#ifdef CONFIG_DMABUF_TOKEN
> +static void nvme_dmabuf_invalidate_mappings(struct dma_buf_attachment *attach)
> +{
> +	struct io_dmabuf_token *token = attach->importer_priv;
> +
> +	io_dmabuf_token_invalidate_mappings(token);
> +}
> +
> +const struct dma_buf_attach_ops nvme_dmabuf_importer_ops = {
> +	.invalidate_mappings = nvme_dmabuf_invalidate_mappings,
> +	.allow_peer2peer = true,
> +};
> +
> +static struct io_dmabuf_map *nvme_dmabuf_token_map(struct io_dmabuf_token *token)
> +{
> +	struct nvme_dmabuf_token *data = token->dev_priv;
> +	struct dma_buf_attachment *attach = data->attach;
> +	dma_addr_t *dma_list = NULL;
> +	unsigned long tmp, i = 0;
> +	struct nvme_dmabuf_map *map;
> +	struct scatterlist *sg;
> +	struct sg_table *sgt;
> +	unsigned nr_entries;
> +	int ret;
> +
> +	dma_resv_assert_held(token->dmabuf->resv);
> +
> +	map = kmalloc(sizeof(*map), GFP_KERNEL);
> +	if (!map)
> +		return ERR_PTR(-ENOMEM);
> +
> +	nr_entries = token->dmabuf->size / NVME_CTRL_PAGE_SIZE;
> +	dma_list = kmalloc_array(nr_entries, sizeof(dma_list[0]), GFP_KERNEL);
> +	if (!dma_list) {
> +		ret = -ENOMEM;
> +		goto err;
> +	}
> +
> +	sgt = dma_buf_map_attachment(attach, token->dir);
> +	if (IS_ERR(sgt)) {
> +		ret = PTR_ERR(sgt);
> +		sgt = NULL;
> +		goto err;
> +	}
> +
> +	for_each_sgtable_dma_sg(sgt, sg, tmp) {
> +		dma_addr_t dma_addr = sg_dma_address(sg);
> +		unsigned long sg_len = sg_dma_len(sg);
> +
> +		if (sg_len % NVME_CTRL_PAGE_SIZE) {
> +			ret = -EINVAL;
> +			goto err;
> +		}
> +		while (sg_len) {
> +			dma_list[i++] = dma_addr;
> +			dma_addr += NVME_CTRL_PAGE_SIZE;
> +			sg_len -= NVME_CTRL_PAGE_SIZE;
> +		}
> +	}
> +
> +	ret = io_dmabuf_init_map(token, &map->base);
> +	if (ret)
> +		goto err;
> +
> +	map->nr_entries = nr_entries;
> +	map->dma_list = dma_list;
> +	map->sgt = sgt;
> +	return &map->base;
> +err:
> +	if (sgt)
> +		dma_buf_unmap_attachment(attach, sgt, token->dir);
> +	kfree(map);
> +	kfree(dma_list);
> +	return ERR_PTR(ret);
> +}
> +
> +static void nvme_dmabuf_token_unmap(struct io_dmabuf_token *token,
> +				    struct io_dmabuf_map *map_base)
> +{
> +	struct nvme_dmabuf_token *data = token->dev_priv;
> +	struct nvme_dmabuf_map *map = container_of(map_base,
> +					struct nvme_dmabuf_map, base);
> +
> +	dma_resv_assert_held(token->dmabuf->resv);
> +
> +	dma_buf_unmap_attachment(data->attach, map->sgt, token->dir);
> +	kfree(map->dma_list);
> +}
> +
> +static void nvme_dmabuf_token_release(struct io_dmabuf_token *token)
> +{
> +	struct nvme_dmabuf_token *data = token->dev_priv;
> +
> +	dma_buf_detach(token->dmabuf, data->attach);
> +	kfree(data);
> +}
> +
> +const struct io_dmabuf_token_dev_ops nvme_dma_token_ops = {
> +	.map = nvme_dmabuf_token_map,
> +	.unmap = nvme_dmabuf_token_unmap,
> +	.release = nvme_dmabuf_token_release,
> +};
> +
> +static int nvme_create_dmabuf_token(struct request_queue *q,
> +				    struct io_dmabuf_token *token)
> +{
> +	struct nvme_dmabuf_token *data;
> +	struct dma_buf_attachment *attach;
> +	struct nvme_ns *ns = q->queuedata;
> +	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
> +	struct dma_buf *dmabuf = token->dmabuf;
> +
> +	data = kzalloc(sizeof(data), GFP_KERNEL);
> +	if (!data)
> +		return -ENOMEM;
Shouldn't this be kzalloc(sizeof(*data), ...)? As written it allocates
the size of a pointer rather than the size of the struct.

Also, checkpatch generates a warning here: kzalloc_obj() should be
preferred over kzalloc() for this kind of memory allocation.
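Untested, but the minimal fix I have in mind:

	/* allocate the struct itself, not the size of a pointer to it */
	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

(or the kzalloc_obj() equivalent, which would also silence checkpatch).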
> +	token->dev_priv = data;
> +	token->dev_ops = &nvme_dma_token_ops;
> +
> +	attach = dma_buf_dynamic_attach(dmabuf, dev->dev,
> +					&nvme_dmabuf_importer_ops, token);
> +	if (IS_ERR(attach))
> +		return PTR_ERR(attach);
Supposing dma_buf_dynamic_attach() returns an error, won't the 'data' pointer be leaked?
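Untested sketch of what I mean, freeing 'data' on the error path:

	if (IS_ERR(attach)) {
		kfree(data);
		return PTR_ERR(attach);
	}

Since token->dev_priv is assigned before the attach, it may also be
worth resetting it here so it doesn't point at freed memory.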
Maurizio