If a large number (hundreds) of dmabufs is allocated, the text output
generated from dmabuf_iter_seq_show can exceed common user buffer sizes
(e.g. PAGE_SIZE), necessitating multiple start/stop cycles to iterate
through all dmabufs. However, the dmabuf iterator currently returns
NULL from dmabuf_iter_seq_start for all non-zero pos values, which
truncates the output before all dmabufs are handled.
After dma_buf_iter_begin / dma_buf_iter_next, the refcount of the buffer
is elevated so that the BPF iterator program can run without holding any
locks. When a stop occurs, instead of immediately dropping the reference
on the buffer, stash a pointer to the buffer in seq->priv until
either start is called again or the iterator is released. This also
allows iteration to resume without first having to walk the list of
dmabufs up to the position indicated by pos.
Fixes: 76ea95534995 ("bpf: Add dmabuf iterator")
Signed-off-by: T.J. Mercier <tjmercier@google.com>
---
kernel/bpf/dmabuf_iter.c | 56 +++++++++++++++++++++++++++++++++++-----
1 file changed, 49 insertions(+), 7 deletions(-)
diff --git a/kernel/bpf/dmabuf_iter.c b/kernel/bpf/dmabuf_iter.c
index 4dd7ef7c145c..cd500248abd9 100644
--- a/kernel/bpf/dmabuf_iter.c
+++ b/kernel/bpf/dmabuf_iter.c
@@ -6,10 +6,33 @@
#include <linux/kernel.h>
#include <linux/seq_file.h>
+struct dmabuf_iter_priv {
+ /*
+ * If this pointer is non-NULL, the buffer's refcount is elevated to
+ * prevent destruction between stop/start. If reading is not resumed and
+ * start is never called again, then dmabuf_iter_seq_fini drops the
+ * reference when the iterator is released.
+ */
+ struct dma_buf *dmabuf;
+};
+
static void *dmabuf_iter_seq_start(struct seq_file *seq, loff_t *pos)
{
- if (*pos)
- return NULL;
+ struct dmabuf_iter_priv *p = seq->private;
+
+ if (*pos) {
+ struct dma_buf *dmabuf = p->dmabuf;
+
+ if (!dmabuf)
+ return NULL;
+
+ /*
+ * Always resume from where we stopped, regardless of the value
+ * of pos.
+ */
+ p->dmabuf = NULL;
+ return dmabuf;
+ }
return dma_buf_iter_begin();
}
@@ -54,8 +77,11 @@ static void dmabuf_iter_seq_stop(struct seq_file *seq, void *v)
{
struct dma_buf *dmabuf = v;
- if (dmabuf)
- dma_buf_put(dmabuf);
+ if (dmabuf) {
+ struct dmabuf_iter_priv *p = seq->private;
+
+ p->dmabuf = dmabuf;
+ }
}
static const struct seq_operations dmabuf_iter_seq_ops = {
@@ -71,11 +97,27 @@ static void bpf_iter_dmabuf_show_fdinfo(const struct bpf_iter_aux_info *aux,
seq_puts(seq, "dmabuf iter\n");
}
+static int dmabuf_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
+{
+ struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv;
+
+ p->dmabuf = NULL;
+ return 0;
+}
+
+static void dmabuf_iter_seq_fini(void *priv)
+{
+ struct dmabuf_iter_priv *p = (struct dmabuf_iter_priv *)priv;
+
+ if (p->dmabuf)
+ dma_buf_put(p->dmabuf);
+}
+
static const struct bpf_iter_seq_info dmabuf_iter_seq_info = {
.seq_ops = &dmabuf_iter_seq_ops,
- .init_seq_private = NULL,
- .fini_seq_private = NULL,
- .seq_priv_size = 0,
+ .init_seq_private = dmabuf_iter_seq_init,
+ .fini_seq_private = dmabuf_iter_seq_fini,
+ .seq_priv_size = sizeof(struct dmabuf_iter_priv),
};
static struct bpf_iter_reg bpf_dmabuf_reg_info = {
base-commit: 30f09200cc4aefbd8385b01e41bde2e4565a6f0e
--
2.52.0.177.g9f829587af-goog
On Sun, Nov 23, 2025 at 10:51:25PM +0000, Pavel Begunkov wrote:
> +struct dma_token *blkdev_dma_map(struct file *file,
> + struct dma_token_params *params)
Given that this is a direct file operation instance, it should be
in block/fops.c. If we do want a generic helper below it, it
should take a struct block_device instead. But we can probably
defer that until a user for that shows up.
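If we did add one, it would take roughly this shape (name hypothetical),
with the file_operations instance in block/fops.c reduced to the
file_bdev() lookup and the O_DIRECT check:
	struct dma_token *bdev_dma_map(struct block_device *bdev,
				       struct dma_token_params *params);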
> +static void nvme_sync_dma(struct nvme_dev *nvme_dev, struct request *req,
> + enum dma_data_direction dir)
> +{
> + struct blk_mq_dma_map *map = req->dma_map;
> + int length = blk_rq_payload_bytes(req);
> + bool for_cpu = dir == DMA_FROM_DEVICE;
> + struct device *dev = nvme_dev->dev;
> + dma_addr_t *dma_list = map->private;
> + struct bio *bio = req->bio;
> + int offset, map_idx;
> +
> + offset = bio->bi_iter.bi_bvec_done;
> + map_idx = offset / NVME_CTRL_PAGE_SIZE;
> + length += offset & (NVME_CTRL_PAGE_SIZE - 1);
> +
> + while (length > 0) {
> + u64 dma_addr = dma_list[map_idx++];
> +
> + if (for_cpu)
> + __dma_sync_single_for_cpu(dev, dma_addr,
> + NVME_CTRL_PAGE_SIZE, dir);
> + else
> + __dma_sync_single_for_device(dev, dma_addr,
> + NVME_CTRL_PAGE_SIZE, dir);
> + length -= NVME_CTRL_PAGE_SIZE;
> + }
This looks really inefficient. Usually the ranges in the dmabuf should
be much larger than a controller page.
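Rough and untested sketch of what I mean, reusing the structures from
this patch: coalesce contiguous controller pages and issue one sync per
run instead of one per NVME_CTRL_PAGE_SIZE:
	while (length > 0) {
		dma_addr_t start = dma_list[map_idx];
		int run = NVME_CTRL_PAGE_SIZE;

		/* merge physically contiguous controller pages */
		while (run < length &&
		       dma_list[map_idx + run / NVME_CTRL_PAGE_SIZE] ==
							start + run)
			run += NVME_CTRL_PAGE_SIZE;

		if (for_cpu)
			__dma_sync_single_for_cpu(dev, start, run, dir);
		else
			__dma_sync_single_for_device(dev, start, run, dir);

		map_idx += run / NVME_CTRL_PAGE_SIZE;
		length -= run;
	}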
> +static void nvme_unmap_premapped_data(struct nvme_dev *dev,
> + struct request *req)
> +{
> + struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> + if (rq_data_dir(req) == READ)
> + nvme_sync_dma(dev, req, DMA_FROM_DEVICE);
> + if (!(iod->flags & IOD_SINGLE_SEGMENT))
> + nvme_free_descriptors(req);
> +}
This doesn't really unmap anything :)
Also the dma ownership rules say that you always need to call the
sync_to_device helpers before I/O and the sync_to_cpu helpers after I/O,
no matter if it is a read or write. The implementation then makes
them a no-op where possible.
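So the shape should roughly be (sketch only; dev/dma_addr/len are
placeholders for whatever the mapping code provides):
	/* submission path: always transfer ownership to the device */
	__dma_sync_single_for_device(dev, dma_addr, len, rq_dma_dir(req));

	/* completion/unmap path: always hand ownership back to the CPU */
	__dma_sync_single_for_cpu(dev, dma_addr, len, rq_dma_dir(req));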
> +
> + offset = bio->bi_iter.bi_bvec_done;
> + map_idx = offset / NVME_CTRL_PAGE_SIZE;
> + offset &= (NVME_CTRL_PAGE_SIZE - 1);
> +
> + prp1_dma = dma_list[map_idx++] + offset;
> +
> + length -= (NVME_CTRL_PAGE_SIZE - offset);
> + if (length <= 0) {
> + prp2_dma = 0;
Urgg, why is this building PRPs instead of SGLs? Yes, SGLs are an
optional feature, but for devices where you want to micro-optimize
like this I think we should simply require them. This should cut
down on both the memory use and the amount of special mapping code.
On Sun, Nov 23, 2025 at 10:51:25PM +0000, Pavel Begunkov wrote:
> Add blk-mq infrastructure to handle dmabuf tokens. There are two main
Please spell out infrastructure in the subject as well.
> +struct dma_token *blkdev_dma_map(struct file *file,
> + struct dma_token_params *params)
> +{
> + struct request_queue *q = bdev_get_queue(file_bdev(file));
> +
> + if (!(file->f_flags & O_DIRECT))
> + return ERR_PTR(-EINVAL);
Shouldn't the O_DIRECT check be in the caller?
> +++ b/block/blk-mq-dma-token.c
Missing SPDX and Copyright statement.
> @@ -0,0 +1,236 @@
> +#include <linux/blk-mq-dma-token.h>
> +#include <linux/dma-resv.h>
> +
> +struct blk_mq_dma_fence {
> + struct dma_fence base;
> + spinlock_t lock;
> +};
And a high-level comment explaining the fencing logic would be nice
as well.
> + struct blk_mq_dma_map *map = container_of(ref, struct blk_mq_dma_map, refs);
Overly long line.
> +static struct blk_mq_dma_map *blk_mq_alloc_dma_mapping(struct blk_mq_dma_token *token)
Another one. Also kinda inconsistent between _map in the data structure
and _mapping in the function name.
> +static inline
> +struct blk_mq_dma_map *blk_mq_get_token_map(struct blk_mq_dma_token *token)
Really odd return value / scope formatting.
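The usual style for this would be:
	static inline struct blk_mq_dma_map *
	blk_mq_get_token_map(struct blk_mq_dma_token *token)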
> +{
> + struct blk_mq_dma_map *map;
> +
> + guard(rcu)();
> +
> + map = rcu_dereference(token->map);
> + if (unlikely(!map || !percpu_ref_tryget_live_rcu(&map->refs)))
> + return NULL;
> + return map;
Please use good old rcu_read_unlock to make this readable.
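Untested, but something like this reads much better to me:
	struct blk_mq_dma_map *map;

	rcu_read_lock();
	map = rcu_dereference(token->map);
	if (unlikely(!map || !percpu_ref_tryget_live_rcu(&map->refs)))
		map = NULL;
	rcu_read_unlock();

	return map;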
> + guard(mutex)(&token->mapping_lock);
Same.
> +
> + map = blk_mq_get_token_map(token);
> + if (map)
> + return map;
> +
> + map = blk_mq_alloc_dma_mapping(token);
> + if (IS_ERR(map))
> + return NULL;
> +
> + dma_resv_lock(dmabuf->resv, NULL);
> + ret = dma_resv_wait_timeout(dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP,
> + true, MAX_SCHEDULE_TIMEOUT);
> + ret = ret ? ret : -ETIME;
	if (!ret)
		ret = -ETIME;
> +blk_status_t blk_rq_assign_dma_map(struct request *rq,
> + struct blk_mq_dma_token *token)
> +{
> + struct blk_mq_dma_map *map;
> +
> + map = blk_mq_get_token_map(token);
> + if (map)
> + goto complete;
> +
> + if (rq->cmd_flags & REQ_NOWAIT)
> + return BLK_STS_AGAIN;
> +
> + map = blk_mq_create_dma_map(token);
> + if (IS_ERR(map))
> + return BLK_STS_RESOURCE;
Having a few comments that say this is creating the map lazily
would probably help the reader. Also why not keep the !map
case in the branch, as the map case should be the fast path and
thus usually be straight line in the function?
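E.g. (untested sketch):
	map = blk_mq_get_token_map(token);
	if (unlikely(!map)) {
		/*
		 * No DMA mapping cached for this token yet (first I/O, or
		 * the exporter moved the buffer), so create one lazily.
		 */
		if (rq->cmd_flags & REQ_NOWAIT)
			return BLK_STS_AGAIN;
		map = blk_mq_create_dma_map(token);
		if (IS_ERR(map))
			return BLK_STS_RESOURCE;
	}
	/* fast path continues straight-line with a valid map */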
> +void blk_mq_dma_map_move_notify(struct blk_mq_dma_token *token)
> +{
> + blk_mq_dma_map_remove(token);
> +}
Is there a good reason for having this blk_mq_dma_map_move_notify
wrapper?
> + if (bio_flagged(bio, BIO_DMA_TOKEN)) {
> + struct blk_mq_dma_token *token;
> + blk_status_t ret;
> +
> + token = dma_token_to_blk_mq(bio->dma_token);
> + ret = blk_rq_assign_dma_map(rq, token);
> + if (ret) {
> + if (ret == BLK_STS_AGAIN) {
> + bio_wouldblock_error(bio);
> + } else {
> + bio->bi_status = BLK_STS_RESOURCE;
> + bio_endio(bio);
> + }
> + goto queue_exit;
> + }
> + }
Any reason to not just keep the dma_token_to_blk_mq? Also why is this
overriding non-BLK_STS_AGAIN errors with BLK_STS_RESOURCE?
(I really wish we could make all BLK_STS_AGAIN errors be quiet without
the explicit setting of BIO_QUIET, which is a bit annoying, but that's
not for this patch).
> +static inline
> +struct blk_mq_dma_token *dma_token_to_blk_mq(struct dma_token *token)
More odd formatting.
> diff --git a/block/bio.c b/block/bio.c
> index 7b13bdf72de0..8793f1ee559d 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -843,6 +843,11 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
> bio_clone_blkg_association(bio, bio_src);
> }
>
> + if (bio_flagged(bio_src, BIO_DMA_TOKEN)) {
> + bio->dma_token = bio_src->dma_token;
> + bio_set_flag(bio, BIO_DMA_TOKEN);
> + }
Historically __bio_clone itself does not clone the payload, just the
bio. But we got rid of the callers that want to clone a bio but not
the payload a long time ago.
I'd suggest a prep patch that moves assigning bi_io_vec from
bio_alloc_clone and bio_init_clone into __bio_clone, and given that they
are the same field, that'll take care of the dma token as well.
Alternatively do it in an if/else that the compiler will hopefully
optimize away.
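Sketch of the if/else variant, assuming dma_token overlays bi_io_vec in
the union this series adds (otherwise the prep patch version is cleaner):
	if (bio_flagged(bio_src, BIO_DMA_TOKEN)) {
		bio->dma_token = bio_src->dma_token;
		bio_set_flag(bio, BIO_DMA_TOKEN);
	} else {
		bio->bi_io_vec = bio_src->bi_io_vec;
	}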
> @@ -1349,6 +1366,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
> bio_iov_bvec_set(bio, iter);
> iov_iter_advance(iter, bio->bi_iter.bi_size);
> return 0;
> + } else if (iov_iter_is_dma_token(iter)) {
No else after a return please.
> +++ b/block/blk-merge.c
> @@ -328,6 +328,29 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
> unsigned nsegs = 0, bytes = 0, gaps = 0;
> struct bvec_iter iter;
>
> + if (bio_flagged(bio, BIO_DMA_TOKEN)) {
Please split the dmabuf logic into a self-contained
helper here.
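Something like this (skeleton only, name and signature hypothetical; the
max_segments handling from the hunk below still needs to go in there):
	static int bio_split_dma_token(struct bio *bio,
			const struct queue_limits *lim, unsigned max_bytes,
			unsigned len_align_mask, unsigned *nsegsp,
			unsigned *bytesp)
	{
		unsigned offset = offset_in_page(bio->bi_iter.bi_bvec_done);
		unsigned bytes = bio->bi_iter.bi_size;

		if ((offset & lim->dma_alignment) || (bytes & len_align_mask))
			return -EINVAL;

		if (bytes > max_bytes)
			bytes = max_bytes;
		*bytesp = bytes;
		*nsegsp = ALIGN(bytes + offset, PAGE_SIZE) >> PAGE_SHIFT;
		/* > 0 tells the caller to split */
		return bytes < bio->bi_iter.bi_size;
	}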
> + int offset = offset_in_page(bio->bi_iter.bi_bvec_done);
> +
> + nsegs = ALIGN(bio->bi_iter.bi_size + offset, PAGE_SIZE);
> + nsegs >>= PAGE_SHIFT;
Why are we hardcoding PAGE_SIZE based "segments" here?
> +
> + if (offset & lim->dma_alignment || bytes & len_align_mask)
> + return -EINVAL;
> +
> + if (bio->bi_iter.bi_size > max_bytes) {
> + bytes = max_bytes;
> + nsegs = (bytes + offset) >> PAGE_SHIFT;
> + goto split;
> + } else if (nsegs > lim->max_segments) {
No else after a goto either.
On Sun, Nov 23, 2025 at 10:51:23PM +0000, Pavel Begunkov wrote:
> We'll need bio_flagged() earlier in bio.h in the next patch, move it
> together with all related helpers, and mark the bio_flagged()'s bio
> argument as const.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
Maybe ask Jens to queue it up ASAP to get it out of the way?
On Sun, Nov 23, 2025 at 10:51:22PM +0000, Pavel Begunkov wrote:
> diff --git a/include/linux/uio.h b/include/linux/uio.h
> index 5b127043a151..1b22594ca35b 100644
> --- a/include/linux/uio.h
> +++ b/include/linux/uio.h
> @@ -29,6 +29,7 @@ enum iter_type {
> ITER_FOLIOQ,
> ITER_XARRAY,
> ITER_DISCARD,
> + ITER_DMA_TOKEN,
Please use DMABUF/dmabuf naming everywhere; this is about dmabufs and
not DMA in general.
Otherwise this looks good.