On Mon, 4 Aug 2025 16:00:45 +0300 Leon Romanovsky <leon@kernel.org> wrote:
From: Leon Romanovsky <leonro@nvidia.com>
Add support for exporting PCI device MMIO regions through dma-buf, enabling safe sharing of non-struct page memory with controlled lifetime management. This allows RDMA and other subsystems to import dma-buf FDs and build them into memory regions for PCI P2P operations.
The implementation provides a revocable attachment mechanism using dma-buf move operations. MMIO regions are normally pinned as BARs don't change physical addresses, but access is revoked when the VFIO device is closed or a PCI reset is issued. This ensures kernel self-defense against potentially hostile userspace.
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Vivek Kasireddy <vivek.kasireddy@intel.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
 drivers/vfio/pci/Kconfig           |  20 ++
 drivers/vfio/pci/Makefile          |   2 +
 drivers/vfio/pci/vfio_pci_config.c |  22 +-
 drivers/vfio/pci/vfio_pci_core.c   |  25 +-
 drivers/vfio/pci/vfio_pci_dmabuf.c | 390 +++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_priv.h   |  23 ++
 include/linux/dma-buf.h            |   1 +
 include/linux/vfio_pci_core.h      |   3 +
 include/uapi/linux/vfio.h          |  25 ++
 9 files changed, 506 insertions(+), 5 deletions(-)
 create mode 100644 drivers/vfio/pci/vfio_pci_dmabuf.c
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 2b0172f546652..55ae888bf26ae 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -55,6 +55,26 @@ config VFIO_PCI_ZDEV_KVM To enable s390x KVM vfio-pci extensions, say Y. +config VFIO_PCI_DMABUF
- bool "VFIO PCI extensions for DMA-BUF"
- depends on VFIO_PCI_CORE
- depends on PCI_P2PDMA && DMA_SHARED_BUFFER
- default y
- help
Enable support for VFIO PCI extensions that allow exporting
device MMIO regions as DMA-BUFs for peer devices to access via
peer-to-peer (P2P) DMA.
This feature enables a VFIO-managed PCI device to export a portion
of its MMIO BAR as a DMA-BUF file descriptor, which can be passed
to other userspace drivers or kernel subsystems capable of
initiating DMA to that region.
Say Y here if you want to enable VFIO DMABUF-based MMIO export
support for peer-to-peer DMA use cases.
If unsure, say N.
source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index cf00c0a7e55c8..f9155e9c5f630 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,7 +2,9 @@ vfio-pci-core-y := vfio_pci_core.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-core-$(CONFIG_VFIO_PCI_ZDEV_KVM) += vfio_pci_zdev.o
obj-$(CONFIG_VFIO_PCI_CORE) += vfio-pci-core.o +vfio-pci-core-$(CONFIG_VFIO_PCI_DMABUF) += vfio_pci_dmabuf.o vfio-pci-y := vfio_pci.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 8f02f236b5b4b..7e23387a43b4d 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -589,10 +589,12 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
if (!new_mem)
if (!new_mem) { vfio_pci_zap_and_down_write_memory_lock(vdev);
else
vfio_pci_dma_buf_move(vdev, true);
} else { down_write(&vdev->memory_lock);
}
/* * If the user is writing mem/io enable (new_mem/io) and we @@ -627,6 +629,8 @@ static int vfio_basic_config_write(struct vfio_pci_core_device *vdev, int pos, *virt_cmd &= cpu_to_le16(~mask); *virt_cmd |= cpu_to_le16(new_cmd & mask);
if (__vfio_pci_memory_enabled(vdev))
up_write(&vdev->memory_lock); }vfio_pci_dma_buf_move(vdev, false);
@@ -707,12 +711,16 @@ static int __init init_pci_cap_basic_perm(struct perm_bits *perm) static void vfio_lock_and_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state) {
- if (state >= PCI_D3hot)
- if (state >= PCI_D3hot) { vfio_pci_zap_and_down_write_memory_lock(vdev);
- else
vfio_pci_dma_buf_move(vdev, true);
- } else { down_write(&vdev->memory_lock);
- }
vfio_pci_set_power_state(vdev, state);
- if (__vfio_pci_memory_enabled(vdev))
up_write(&vdev->memory_lock);vfio_pci_dma_buf_move(vdev, false);
} @@ -900,7 +908,10 @@ static int vfio_exp_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { vfio_pci_zap_and_down_write_memory_lock(vdev);
vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev);
if (__vfio_pci_memory_enabled(vdev))
vfio_pci_dma_buf_move(vdev, true);
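@revoked true -> true seems wrong. The move before the reset already passes true; shouldn't the call made once memory is enabled again pass false, as the other re-enable paths in this patch do?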
up_write(&vdev->memory_lock); }
} @@ -982,7 +993,10 @@ static int vfio_af_config_write(struct vfio_pci_core_device *vdev, int pos, if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { vfio_pci_zap_and_down_write_memory_lock(vdev);
vfio_pci_dma_buf_move(vdev, true); pci_try_reset_function(vdev->pdev);
if (__vfio_pci_memory_enabled(vdev))
vfio_pci_dma_buf_move(vdev, true);
Same here: @revoked true -> true seems wrong for the call after the reset.
up_write(&vdev->memory_lock); }
} diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index b1863d84b11aa..8e840ac413e9b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,7 +28,9 @@ #include <linux/nospec.h> #include <linux/sched/mm.h> #include <linux/iommufd.h> +#ifdef CONFIG_VFIO_PCI_DMABUF #include <linux/pci-p2pdma.h> +#endif #if IS_ENABLED(CONFIG_EEH) #include <asm/eeh.h> #endif @@ -287,6 +289,8 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, * semaphore. */ vfio_pci_zap_and_down_write_memory_lock(vdev);
- vfio_pci_dma_buf_move(vdev, true);
- if (vdev->pm_runtime_engaged) { up_write(&vdev->memory_lock); return -EINVAL;
@@ -370,6 +374,8 @@ static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) */ down_write(&vdev->memory_lock); __vfio_pci_runtime_pm_exit(vdev);
- if (__vfio_pci_memory_enabled(vdev))
up_write(&vdev->memory_lock);vfio_pci_dma_buf_move(vdev, false);
} @@ -690,6 +696,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev) #endif vfio_pci_core_disable(vdev);
- vfio_pci_dma_buf_cleanup(vdev);
- mutex_lock(&vdev->igate); if (vdev->err_trigger) { eventfd_ctx_put(vdev->err_trigger);
@@ -1222,7 +1230,10 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, */ vfio_pci_set_power_state(vdev, PCI_D0);
- vfio_pci_dma_buf_move(vdev, true); ret = pci_try_reset_function(vdev->pdev);
- if (__vfio_pci_memory_enabled(vdev))
up_write(&vdev->memory_lock);vfio_pci_dma_buf_move(vdev, false);
return ret; @@ -1511,6 +1522,8 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, return vfio_pci_core_pm_exit(vdev, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(vdev, flags, arg, argsz);
- case VFIO_DEVICE_FEATURE_DMA_BUF:
default: return -ENOTTY; }return vfio_pci_core_feature_dma_buf(vdev, flags, arg, argsz);
@@ -2085,9 +2098,13 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); +#ifdef CONFIG_VFIO_PCI_DMABUF vdev->provider = pci_p2pdma_enable(vdev->pdev); if (IS_ERR(vdev->provider)) return PTR_ERR(vdev->provider);
- INIT_LIST_HEAD(&vdev->dmabufs);
+#endif init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2470,11 +2487,17 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * cause the PCI config space reset without restoring the original * state (saved locally in 'vdev->pm_save'). */
- list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
- list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
vfio_pci_set_power_state(vdev, PCI_D0);vfio_pci_dma_buf_move(vdev, true);
- }
The revoke should have happened at the time the BARs were zapped. Thanks,
Alex
ret = pci_reset_bus(pdev);
- list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
if (__vfio_pci_memory_enabled(vdev))
vfio_pci_dma_buf_move(vdev, false);
- vdev = list_last_entry(&dev_set->device_list, struct vfio_pci_core_device, vdev.dev_set_list);