A VFIO DMABUF can export a subset of a BAR to userspace by fd; add support for mmap() of this fd. This provides another route for a process to map BARs, except one where the process can only map a specific subset of a BAR represented by the exported DMABUF.
mmap() support enables userspace driver designs that safely delegate access to BAR sub-ranges to other client processes by sharing a DMABUF fd, without having to share the (omnipotent) VFIO device fd with them.
Since the main VFIO BAR mmap() is now DMABUF-aware, this path reuses the existing vm_ops. But, since the lifecycle of an exported DMABUF is still decoupled from that of the device fd it came from, the device fd might now be closed concurrent with a VMA fault.
Extra synchronisation is added to deal with the possibility of a fault racing with the DMABUF cleanup path. (Note that this differs from a DMABUF implicitly created on the mmap() path, which holds ownership of the device fd and so prevents close-during-fault scenarios in order to maintain the same user-facing behaviour on close.) The fault handler does this by temporarily taking a VFIO device registration to ensure vdev remains valid, after which vdev->memory_lock can be taken.
Signed-off-by: Matt Evans mattev@meta.com --- drivers/vfio/pci/vfio_pci_core.c | 79 ++++++++++++++++++++++++++---- drivers/vfio/pci/vfio_pci_dmabuf.c | 28 +++++++++++ drivers/vfio/pci/vfio_pci_priv.h | 2 + 3 files changed, 99 insertions(+), 10 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 464b63585bef..cad126cf8737 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -12,6 +12,8 @@
#include <linux/aperture.h> #include <linux/device.h> +#include <linux/dma-buf.h> +#include <linux/dma-resv.h> #include <linux/eventfd.h> #include <linux/file.h> #include <linux/interrupt.h> @@ -1703,20 +1705,76 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, vm_fault_t ret = VM_FAULT_SIGBUS;
/* - * We can rely on the existence of both a DMABUF (priv) and - * the VFIO device it was exported from (vdev). This fault's - * VMA was established using vfio_pci_core_mmap_prep_dmabuf() - * which transfers ownership of the VFIO device fd to the - * DMABUF, and so the VFIO device is held open because the - * VMA's vm_file (DMABUF) is open. + * The only thing this can rely on is that the DMABUF relating + * to the VMA's vm_file exists (priv). * - * Since vfio_pci_dma_buf_cleanup() cannot have happened, - * vdev must be valid; we can take memory_lock. + * A DMABUF for a VFIO device fd mmap() holds a reference to + * the original VFIO device fd, but an explicitly-exported + * DMABUF does not. The original fd might have closed, + * meaning this fault can race with + * vfio_pci_dma_buf_cleanup(), meaning priv->vdev might be + * NULL, and the VFIO device registration might have been + * dropped. + * + * With the goal of taking vdev->memory_lock in a world where + * vdev might not still exist: + * + * 1. Take the resv lock on the DMABUF: + * - If racing cleanup got in first, vdev == NULL and buffer + * is revoked; stop/exit if so. + * - If we got in first, vdev is non-NULL, accessible, and + * cleanup _has not yet put the VFIO device registration_, + * so the device refcount must be >0. + * + * 2. Take vfio_device registration (refcount guaranteed >0 + * hereafter). + * + * 3. Unlock the DMABUF's resv lock: + * - A racing cleanup can now complete. + * - But, the device refcount >0, meaning the vfio_device + * (and vfio_pci_core device vdev) have not yet been + * freed. vdev is accessible, even if the DMABUF has been + * revoked or cleanup has happened, because + * vfio_unregister_group_dev() can't complete. + * + * 4. Take the vdev->memory_lock + * - Either the DMABUF is usable, or has been cleaned up. + * Whichever, it can no longer change under us. + * - Test the DMABUF revocation status again: if it was + * revoked between 1 and 4 return a SIGBUS.
Otherwise, + * return a PFN. + * - It's not necessary to also take the resv lock, because + * the status/vdev can't change while memory_lock is held. + * + * 5. Unlock, done. */ + + dma_resv_lock(priv->dmabuf->resv, NULL); vdev = READ_ONCE(priv->vdev);
+ if (READ_ONCE(priv->revoked) || !vdev) { + pr_debug_ratelimited("%s VA 0x%lx, pgoff 0x%lx: DMABUF revoked/cleaned up\n", + __func__, vmf->address, vma->vm_pgoff); + dma_resv_unlock(priv->dmabuf->resv); + return VM_FAULT_SIGBUS; + } + /* vdev is usable */ + + if (!vfio_device_try_get_registration(&vdev->vdev)) { + /* + * If vdev != NULL (above), the registration should + * already be >0 and so this try_get should never + * fail. + */ + dev_warn(&vdev->pdev->dev, "%s: Unexpected registration failure\n", + __func__); + dma_resv_unlock(priv->dmabuf->resv); + return VM_FAULT_SIGBUS; + } + dma_resv_unlock(priv->dmabuf->resv); + scoped_guard(rwsem_read, &vdev->memory_lock) { - if (!priv->revoked) { + if (!READ_ONCE(priv->revoked)) { int pres = vfio_pci_dma_buf_find_pfn(priv, vma, vmf->address, order, &pfn); @@ -1734,6 +1792,7 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, vma->vm_pgoff, (unsigned int)ret); }
+ vfio_device_put_registration(&vdev->vdev); return ret; }
@@ -1742,7 +1801,7 @@ static vm_fault_t vfio_pci_mmap_page_fault(struct vm_fault *vmf) return vfio_pci_mmap_huge_fault(vmf, 0); }
-static const struct vm_operations_struct vfio_pci_mmap_ops = { +const struct vm_operations_struct vfio_pci_mmap_ops = { .fault = vfio_pci_mmap_page_fault, #ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP .huge_fault = vfio_pci_mmap_huge_fault, diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 04c7733fe712..cc477f46a7d5 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -27,6 +27,33 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
return 0; } + +static int vfio_pci_dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + u64 req_len, req_start; + + if (priv->revoked) + return -ENODEV; + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + + req_len = vma->vm_end - vma->vm_start; + req_start = vma->vm_pgoff << PAGE_SHIFT; + if (req_start + req_len > priv->size) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); + + /* See comments in vfio_pci_core_mmap() re VM_ALLOW_ANY_UNCACHED. */ + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP | + VM_DONTEXPAND | VM_DONTDUMP); + vma->vm_private_data = priv; + vma->vm_ops = &vfio_pci_mmap_ops; + + return 0; +} #endif /* CONFIG_VFIO_PCI_DMABUF */
static void vfio_pci_dma_buf_done(struct kref *kref) @@ -94,6 +121,7 @@ static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf) static const struct dma_buf_ops vfio_pci_dmabuf_ops = { #ifdef CONFIG_VFIO_PCI_DMABUF .attach = vfio_pci_dma_buf_attach, + .mmap = vfio_pci_dma_buf_mmap, #endif .map_dma_buf = vfio_pci_dma_buf_map, .unmap_dma_buf = vfio_pci_dma_buf_unmap, diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index a8edbee6ce56..f837d6c8bddc 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -37,6 +37,8 @@ struct vfio_pci_dma_buf { u8 revoked : 1; };
+extern const struct vm_operations_struct vfio_pci_mmap_ops; + bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev);