On Mon, 13 Oct 2025 18:26:11 +0300 Leon Romanovsky leon@kernel.org wrote:
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index fe247d0e2831..56b1320238a9 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1511,6 +1520,19 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
- case VFIO_DEVICE_FEATURE_DMA_BUF:
if (device->ops->ioctl != vfio_pci_core_ioctl)/** Devices that overwrite general .ioctl() callback* usually do it to implement their own* VFIO_DEVICE_GET_REGION_INFO handlerm and they present
Typo: "handlerm" (presumably meant to be "handler,").
* different BAR information from the real PCI.** DMABUF relies on real PCI information.*/return -EOPNOTSUPP; default: return -ENOTTY; }return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
...
@@ -2459,6 +2482,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, break; }
vfio_pci_zap_bars(vdev); }vfio_pci_dma_buf_move(vdev, true);@@ -2482,6 +2506,10 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, ret = pci_reset_bus(pdev);
- list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
if (__vfio_pci_memory_enabled(vdev))vfio_pci_dma_buf_move(vdev, false);- vdev = list_last_entry(&dev_set->device_list, struct vfio_pci_core_device, vdev.dev_set_list);
This needs to be placed in the existing undo loop with the up_write(); otherwise it can be missed in the error case.
diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c new file mode 100644 index 000000000000..eaba010777f3 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c +static unsigned int calc_sg_nents(struct vfio_pci_dma_buf *priv,
struct dma_iova_state *state)+{
- struct phys_vec *phys_vec = priv->phys_vec;
- unsigned int nents = 0;
- u32 i;
- if (!state || !dma_use_iova(state))
for (i = 0; i < priv->nr_ranges; i++)nents += DIV_ROUND_UP(phys_vec[i].len, UINT_MAX);- else
/** In IOVA case, there is only one SG entry which spans* for whole IOVA address space, but we need to make sure* that it fits sg->length, maybe we need more.*/nents = DIV_ROUND_UP(priv->size, UINT_MAX);
I think we're arguably running afoul of the coding style standard here: these branches are not single simple statements, so they should use braces.
- return nents;
+}
+static struct sg_table * +vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
enum dma_data_direction dir)+{
- struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
- struct dma_iova_state *state = attachment->priv;
- struct phys_vec *phys_vec = priv->phys_vec;
- unsigned long attrs = DMA_ATTR_MMIO;
- unsigned int nents, mapped_len = 0;
- struct scatterlist *sgl;
- struct sg_table *sgt;
- dma_addr_t addr;
- int ret;
- u32 i;
- dma_resv_assert_held(priv->dmabuf->resv);
- if (priv->revoked)
return ERR_PTR(-ENODEV);- sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
- if (!sgt)
return ERR_PTR(-ENOMEM);- nents = calc_sg_nents(priv, state);
- ret = sg_alloc_table(sgt, nents, GFP_KERNEL | __GFP_ZERO);
- if (ret)
goto err_kfree_sgt;- sgl = sgt->sgl;
- for (i = 0; i < priv->nr_ranges; i++) {
if (!state) {addr = pci_p2pdma_bus_addr_map(priv->provider,phys_vec[i].paddr);} else if (dma_use_iova(state)) {ret = dma_iova_link(attachment->dev, state,phys_vec[i].paddr, 0,phys_vec[i].len, dir, attrs);if (ret)goto err_unmap_dma;mapped_len += phys_vec[i].len;} else {addr = dma_map_phys(attachment->dev, phys_vec[i].paddr,phys_vec[i].len, dir, attrs);ret = dma_mapping_error(attachment->dev, addr);if (ret)goto err_unmap_dma;}if (!state || !dma_use_iova(state))sgl = fill_sg_entry(sgl, phys_vec[i].len, addr);- }
- if (state && dma_use_iova(state)) {
WARN_ON_ONCE(mapped_len != priv->size);ret = dma_iova_sync(attachment->dev, state, 0, mapped_len);if (ret)goto err_unmap_dma;sgl = fill_sg_entry(sgl, mapped_len, state->addr);- }
- /*
* SGL must be NULL to indicate that SGL is the last one* and we allocated correct number of entries in sg_alloc_table()*/- WARN_ON_ONCE(sgl);
- return sgt;
+err_unmap_dma:
- if (!i || !state)
; /* Do nothing */- else if (dma_use_iova(state))
dma_iova_destroy(attachment->dev, state, mapped_len, dir,attrs);- else
for_each_sgtable_dma_sg(sgt, sgl, i)dma_unmap_phys(attachment->dev, sg_dma_address(sgl),sg_dma_len(sgl), dir, attrs);
Same here regarding braces.
- sg_free_table(sgt);
+err_kfree_sgt:
- kfree(sgt);
- return ERR_PTR(ret);
+}
+static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
struct sg_table *sgt,enum dma_data_direction dir)+{
- struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;
- struct dma_iova_state *state = attachment->priv;
- unsigned long attrs = DMA_ATTR_MMIO;
- struct scatterlist *sgl;
- int i;
- if (!state)
; /* Do nothing */- else if (dma_use_iova(state))
dma_iova_destroy(attachment->dev, state, priv->size, dir,attrs);- else
for_each_sgtable_dma_sg(sgt, sgl, i)dma_unmap_phys(attachment->dev, sg_dma_address(sgl),sg_dma_len(sgl), dir, attrs);
Here too.
- sg_free_table(sgt);
- kfree(sgt);
+}
...
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 75100bf009ba..63214467c875 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1478,6 +1478,31 @@ struct vfio_device_feature_bus_master { }; #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 +/**
- Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the
- regions selected.
- open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC,
- etc. offset/length specify a slice of the region to create the dmabuf from.
- nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf.
Probably worth noting that .flags must be zero; I see we enforce that. Thanks,
Alex
- Return: The fd number on success, -1 and errno is set on failure.
- */
+#define VFIO_DEVICE_FEATURE_DMA_BUF 11
+struct vfio_region_dma_range {
- __u64 offset;
- __u64 length;
+};
+struct vfio_device_feature_dma_buf {
- __u32 region_index;
- __u32 open_flags;
- __u32 flags;
- __u32 nr_ranges;
- struct vfio_region_dma_range dma_ranges[];
+};
/* -------- API for Type1 VFIO IOMMU -------- */ /**