Enable BIO_DMABUF_MAP backed requests. A PRP list is created for the dmabuf when it's mapped, and is then used to initialise requests.
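
Roughly, and only as an illustrative sketch of the scheme below (names taken from the patch, not a verbatim excerpt): the dmabuf is premapped into a flat array of NVME_CTRL_PAGE_SIZE-sized DMA addresses, and each request derives its PRPs by indexing that array from the bio offset:

	offset  = bio->bi_iter.bi_bvec_done;
	map_idx = offset / NVME_CTRL_PAGE_SIZE;
	prp1    = dma_list[map_idx] + (offset & (NVME_CTRL_PAGE_SIZE - 1));
	/* remaining PRP entries are dma_list[map_idx + 1], [map_idx + 2], ... */

This avoids walking bvecs and the DMA iterator in the hot path; only PRP descriptor pages still need to be allocated per request when the transfer spans more than two controller pages.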
Suggested-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 drivers/nvme/host/pci.c | 282 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 282 insertions(+)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index db5fc9bf6627..d2629853a972 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -27,6 +27,8 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/io-64-nonatomic-hi-lo.h>
 #include <linux/sed-opal.h>
+#include <linux/io_dmabuf_token.h>
+#include <linux/dma-resv.h>
#include "trace.h" #include "nvme.h" @@ -393,6 +395,17 @@ struct nvme_queue { struct completion delete_done; };
+struct nvme_dmabuf_token {
+	struct dma_buf_attachment *attach;
+};
+
+struct nvme_dmabuf_map {
+	struct io_dmabuf_map base;
+	dma_addr_t *dma_list;
+	struct sg_table *sgt;
+	unsigned nr_entries;
+};
+
 /* bits for iod->flags */
 enum nvme_iod_flags {
 	/* this command has been aborted by the timeout handler */
@@ -854,6 +867,134 @@ static void nvme_free_descriptors(struct request *req)
 	}
 }
 
+static void nvme_dmabuf_map_sync(struct nvme_dev *nvme_dev, struct request *req,
+				 bool for_cpu)
+{
+	int length = blk_rq_payload_bytes(req);
+	struct device *dev = nvme_dev->dev;
+	enum dma_data_direction dma_dir;
+	struct bio *bio = req->bio;
+	struct nvme_dmabuf_map *map;
+	dma_addr_t *dma_list;
+	int offset, map_idx;
+
+	dma_dir = rq_data_dir(req) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
+	dma_list = map->dma_list;
+
+	offset = bio->bi_iter.bi_bvec_done;
+	map_idx = offset / NVME_CTRL_PAGE_SIZE;
+	length += offset & (NVME_CTRL_PAGE_SIZE - 1);
+
+	while (length > 0) {
+		u64 dma_addr = dma_list[map_idx++];
+
+		if (for_cpu)
+			__dma_sync_single_for_cpu(dev, dma_addr,
+						  NVME_CTRL_PAGE_SIZE, dma_dir);
+		else
+			__dma_sync_single_for_device(dev, dma_addr,
+						     NVME_CTRL_PAGE_SIZE,
+						     dma_dir);
+		length -= NVME_CTRL_PAGE_SIZE;
+	}
+}
+
+static void nvme_rq_clean_dmabuf_map(struct nvme_dev *dev,
+				     struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+	nvme_dmabuf_map_sync(dev, req, true);
+
+	if (!(iod->flags & IOD_SINGLE_SEGMENT))
+		nvme_free_descriptors(req);
+}
+
+static blk_status_t nvme_rq_setup_dmabuf_map(struct request *req,
+					     struct nvme_queue *nvmeq)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	int length = blk_rq_payload_bytes(req);
+	u64 dma_addr, prp1_dma, prp2_dma;
+	struct bio *bio = req->bio;
+	struct nvme_dmabuf_map *map;
+	dma_addr_t *dma_list;
+	dma_addr_t prp_dma;
+	__le64 *prp_list;
+	int i, map_idx;
+	int offset;
+
+	nvme_dmabuf_map_sync(nvmeq->dev, req, false);
+
+	map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
+	dma_list = map->dma_list;
+
+	offset = bio->bi_iter.bi_bvec_done;
+	map_idx = offset / NVME_CTRL_PAGE_SIZE;
+	offset &= (NVME_CTRL_PAGE_SIZE - 1);
+	prp1_dma = dma_list[map_idx++] + offset;
+
+	length -= (NVME_CTRL_PAGE_SIZE - offset);
+	if (length <= 0) {
+		prp2_dma = 0;
+		goto done;
+	}
+
+	if (length <= NVME_CTRL_PAGE_SIZE) {
+		prp2_dma = dma_list[map_idx];
+		goto done;
+	}
+
+	if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
+	    NVME_SMALL_POOL_SIZE / sizeof(__le64))
+		iod->flags |= IOD_SMALL_DESCRIPTOR;
+
+	prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+				  &prp_dma);
+	if (!prp_list)
+		return BLK_STS_RESOURCE;
+
+	iod->descriptors[iod->nr_descriptors++] = prp_list;
+	prp2_dma = prp_dma;
+	i = 0;
+	for (;;) {
+		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
+			__le64 *old_prp_list = prp_list;
+
+			prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
+						  GFP_ATOMIC, &prp_dma);
+			if (!prp_list)
+				goto free_prps;
+			iod->descriptors[iod->nr_descriptors++] = prp_list;
+			prp_list[0] = old_prp_list[i - 1];
+			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+			i = 1;
+		}
+
+		dma_addr = dma_list[map_idx++];
+		prp_list[i++] = cpu_to_le64(dma_addr);
+
+		length -= NVME_CTRL_PAGE_SIZE;
+		if (length <= 0)
+			break;
+	}
+done:
+	iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
+	iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
+	return BLK_STS_OK;
+free_prps:
+	nvme_free_descriptors(req);
+	return BLK_STS_RESOURCE;
+}
+
+static inline bool nvme_rq_is_dmabuf_attached(struct request *req)
+{
+	if (!IS_ENABLED(CONFIG_DMABUF_TOKEN))
+		return false;
+	return req->bio && bio_flagged(req->bio, BIO_DMABUF_MAP);
+}
+
 static void nvme_free_prps(struct request *req, unsigned int attrs)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -932,6 +1073,11 @@ static void nvme_unmap_data(struct request *req)
 	struct device *dma_dev = nvmeq->dev->dev;
 	unsigned int attrs = 0;
 
+	if (nvme_rq_is_dmabuf_attached(req)) {
+		nvme_rq_clean_dmabuf_map(nvmeq->dev, req);
+		return;
+	}
+
 	if (iod->flags & IOD_SINGLE_SEGMENT) {
 		static_assert(offsetof(union nvme_data_ptr, prp1) ==
 			      offsetof(union nvme_data_ptr, sgl.addr));
@@ -1222,6 +1368,9 @@ static blk_status_t nvme_map_data(struct request *req)
 	struct blk_dma_iter iter;
 	blk_status_t ret;
 
+	if (nvme_rq_is_dmabuf_attached(req))
+		return nvme_rq_setup_dmabuf_map(req, nvmeq);
+
 	/*
 	 * Try to skip the DMA iterator for single segment requests, as that
 	 * significantly improves performances for small I/O sizes.
@@ -2238,6 +2387,134 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 	return result;
 }
 
+#ifdef CONFIG_DMABUF_TOKEN
+static void nvme_dmabuf_invalidate_mappings(struct dma_buf_attachment *attach)
+{
+	struct io_dmabuf_token *token = attach->importer_priv;
+
+	io_dmabuf_token_invalidate_mappings(token);
+}
+
+const struct dma_buf_attach_ops nvme_dmabuf_importer_ops = {
+	.invalidate_mappings	= nvme_dmabuf_invalidate_mappings,
+	.allow_peer2peer	= true,
+};
+
+static struct io_dmabuf_map *nvme_dmabuf_token_map(struct io_dmabuf_token *token)
+{
+	struct nvme_dmabuf_token *data = token->dev_priv;
+	struct dma_buf_attachment *attach = data->attach;
+	dma_addr_t *dma_list = NULL;
+	unsigned long tmp, i = 0;
+	struct nvme_dmabuf_map *map;
+	struct scatterlist *sg;
+	struct sg_table *sgt;
+	unsigned nr_entries;
+	int ret;
+
+	dma_resv_assert_held(token->dmabuf->resv);
+
+	map = kmalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+
+	nr_entries = token->dmabuf->size / NVME_CTRL_PAGE_SIZE;
+	dma_list = kmalloc_array(nr_entries, sizeof(dma_list[0]), GFP_KERNEL);
+	if (!dma_list) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	sgt = dma_buf_map_attachment(attach, token->dir);
+	if (IS_ERR(sgt)) {
+		ret = PTR_ERR(sgt);
+		sgt = NULL;
+		goto err;
+	}
+
+	for_each_sgtable_dma_sg(sgt, sg, tmp) {
+		dma_addr_t dma_addr = sg_dma_address(sg);
+		unsigned long sg_len = sg_dma_len(sg);
+
+		if (sg_len % NVME_CTRL_PAGE_SIZE) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		while (sg_len) {
+			dma_list[i++] = dma_addr;
+			dma_addr += NVME_CTRL_PAGE_SIZE;
+			sg_len -= NVME_CTRL_PAGE_SIZE;
+		}
+	}
+
+	ret = io_dmabuf_init_map(token, &map->base);
+	if (ret)
+		goto err;
+	map->nr_entries = nr_entries;
+	map->dma_list = dma_list;
+	map->sgt = sgt;
+	return &map->base;
+err:
+	if (sgt)
+		dma_buf_unmap_attachment(attach, sgt, token->dir);
+	kfree(map);
+	kfree(dma_list);
+	return ERR_PTR(ret);
+}
+
+static void nvme_dmabuf_token_unmap(struct io_dmabuf_token *token,
+				    struct io_dmabuf_map *map_base)
+{
+	struct nvme_dmabuf_token *data = token->dev_priv;
+	struct nvme_dmabuf_map *map = container_of(map_base,
+					struct nvme_dmabuf_map, base);
+
+	dma_resv_assert_held(token->dmabuf->resv);
+
+	dma_buf_unmap_attachment(data->attach, map->sgt, token->dir);
+	kfree(map->dma_list);
+}
+
+static void nvme_dmabuf_token_release(struct io_dmabuf_token *token)
+{
+	struct nvme_dmabuf_token *data = token->dev_priv;
+
+	dma_buf_detach(token->dmabuf, data->attach);
+	kfree(data);
+}
+
+const struct io_dmabuf_token_dev_ops nvme_dma_token_ops = {
+	.map		= nvme_dmabuf_token_map,
+	.unmap		= nvme_dmabuf_token_unmap,
+	.release	= nvme_dmabuf_token_release,
+};
+
+static int nvme_create_dmabuf_token(struct request_queue *q,
+				    struct io_dmabuf_token *token)
+{
+	struct nvme_dmabuf_token *data;
+	struct dma_buf_attachment *attach;
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct dma_buf *dmabuf = token->dmabuf;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	token->dev_priv = data;
+	token->dev_ops = &nvme_dma_token_ops;
+
+	attach = dma_buf_dynamic_attach(dmabuf, dev->dev,
+					&nvme_dmabuf_importer_ops, token);
+	if (IS_ERR(attach))
+		return PTR_ERR(attach);
+	data->attach = attach;
+	return 0;
+}
+#endif
+
 static const struct blk_mq_ops nvme_mq_admin_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
@@ -2256,6 +2533,10 @@ static const struct blk_mq_ops nvme_mq_ops = {
 	.map_queues	= nvme_pci_map_queues,
 	.timeout	= nvme_timeout,
 	.poll		= nvme_poll,
+
+#ifdef CONFIG_DMABUF_TOKEN
+	.create_dmabuf_token	= nvme_create_dmabuf_token,
+#endif
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
@@ -4289,5 +4570,6 @@ MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
 MODULE_LICENSE("GPL");
 MODULE_VERSION("1.0");
 MODULE_DESCRIPTION("NVMe host PCIe transport driver");
+MODULE_IMPORT_NS("DMA_BUF");
 module_init(nvme_init);
 module_exit(nvme_exit);