From: Alex Williamson alex.williamson@redhat.com
commit 41311242221e3482b20bfed10fa4d9db98d87016 upstream.
With conversion to follow_pfn(), DMA mapping a PFNMAP range depends on the range being faulted into the vma. Add support to manually provide that, in the same way as done on KVM with hva_to_pfn_remapped().
Reviewed-by: Peter Xu peterx@redhat.com Signed-off-by: Alex Williamson alex.williamson@redhat.com [Ajay: Regenerated the patch for v4.9] Signed-off-by: Ajay Kaher akaher@vmware.com --- drivers/vfio/vfio_iommu_type1.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index a9f58f3..ccef02c 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -213,6 +213,32 @@ static int put_pfn(unsigned long pfn, int prot) return 0; }
+static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, + unsigned long vaddr, unsigned long *pfn, + bool write_fault) +{ + int ret; + + ret = follow_pfn(vma, vaddr, pfn); + if (ret) { + bool unlocked = false; + + ret = fixup_user_fault(NULL, mm, vaddr, + FAULT_FLAG_REMOTE | + (write_fault ? FAULT_FLAG_WRITE : 0), + &unlocked); + if (unlocked) + return -EAGAIN; + + if (ret) + return ret; + + ret = follow_pfn(vma, vaddr, pfn); + } + + return ret; +} + static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) { struct page *page[1]; @@ -226,12 +252,16 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
down_read(¤t->mm->mmap_sem);
+retry: vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
if (vma && vma->vm_flags & VM_PFNMAP) { - if (!follow_pfn(vma, vaddr, pfn) && - is_invalid_reserved_pfn(*pfn)) - ret = 0; + ret = follow_fault_pfn(vma, current->mm, vaddr, pfn, prot & IOMMU_WRITE); + if (ret == -EAGAIN) + goto retry; + + if (!ret && !is_invalid_reserved_pfn(*pfn)) + ret = -EFAULT; }
up_read(¤t->mm->mmap_sem);
From: Alex Williamson alex.williamson@redhat.com
commit 11c4cd07ba111a09f49625f9e4c851d83daf0a22 upstream.
Rather than calling remap_pfn_range() when a region is mmap'd, setup a vm_ops handler to support dynamic faulting of the range on access. This allows us to manage a list of vmas actively mapping the area that we can later use to invalidate those mappings. The open callback invalidates the vma range so that all tracking is inserted in the fault handler and removed in the close handler.
Reviewed-by: Peter Xu peterx@redhat.com Signed-off-by: Alex Williamson alex.williamson@redhat.com [Ajay: Regenerated the patch for v4.9] Signed-off-by: Ajay Kaher akaher@vmware.com --- drivers/vfio/pci/vfio_pci.c | 75 ++++++++++++++++++++++++++++++++++++- drivers/vfio/pci/vfio_pci_private.h | 7 ++++ 2 files changed, 80 insertions(+), 2 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index c94167d..c97cf6c 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1144,6 +1144,69 @@ static ssize_t vfio_pci_write(void *device_data, const char __user *buf, return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); }
+static int vfio_pci_add_vma(struct vfio_pci_device *vdev, + struct vm_area_struct *vma) +{ + struct vfio_pci_mmap_vma *mmap_vma; + + mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); + if (!mmap_vma) + return -ENOMEM; + + mmap_vma->vma = vma; + + mutex_lock(&vdev->vma_lock); + list_add(&mmap_vma->vma_next, &vdev->vma_list); + mutex_unlock(&vdev->vma_lock); + + return 0; +} + +/* + * Zap mmaps on open so that we can fault them in on access and therefore + * our vma_list only tracks mappings accessed since last zap. + */ +static void vfio_pci_mmap_open(struct vm_area_struct *vma) +{ + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); +} + +static void vfio_pci_mmap_close(struct vm_area_struct *vma) +{ + struct vfio_pci_device *vdev = vma->vm_private_data; + struct vfio_pci_mmap_vma *mmap_vma; + + mutex_lock(&vdev->vma_lock); + list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { + if (mmap_vma->vma == vma) { + list_del(&mmap_vma->vma_next); + kfree(mmap_vma); + break; + } + } + mutex_unlock(&vdev->vma_lock); +} + +static int vfio_pci_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct vfio_pci_device *vdev = vma->vm_private_data; + + if (vfio_pci_add_vma(vdev, vma)) + return VM_FAULT_OOM; + + if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, vma->vm_page_prot)) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static const struct vm_operations_struct vfio_pci_mmap_ops = { + .open = vfio_pci_mmap_open, + .close = vfio_pci_mmap_close, + .fault = vfio_pci_mmap_fault, +}; + static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) { struct vfio_pci_device *vdev = device_data; @@ -1209,8 +1272,14 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
- return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - req_len, vma->vm_page_prot); + /* + * See remap_pfn_range(), called from vfio_pci_fault() but we can't + * change vm_flags within the fault handler. Set them now. + */ + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_ops = &vfio_pci_mmap_ops; + + return 0; }
static void vfio_pci_request(void *device_data, unsigned int count) @@ -1268,6 +1337,8 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) mutex_init(&vdev->igate); spin_lock_init(&vdev->irqlock);
+ mutex_init(&vdev->vma_lock); + INIT_LIST_HEAD(&vdev->vma_list); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { vfio_iommu_group_put(group, &pdev->dev); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index f561ac1..4b78f12 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -63,6 +63,11 @@ struct vfio_pci_dummy_resource { struct list_head res_next; };
+struct vfio_pci_mmap_vma { + struct vm_area_struct *vma; + struct list_head vma_next; +}; + struct vfio_pci_device { struct pci_dev *pdev; void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; @@ -95,6 +100,8 @@ struct vfio_pci_device { struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; struct list_head dummy_resources_list; + struct mutex vma_lock; + struct list_head vma_list; };
#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
From: Alex Williamson alex.williamson@redhat.com
commit abafbc551fddede3e0a08dee1dcde08fc0eb8476 upstream.
Accessing the disabled memory space of a PCI device would typically result in a master abort response on conventional PCI, or an unsupported request on PCI express. The user would generally see these as a -1 response for the read return data and the write would be silently discarded, possibly with an uncorrected, non-fatal AER error triggered on the host. Some systems however take it upon themselves to bring down the entire system when they see something that might indicate a loss of data, such as this discarded write to a disabled memory space.
To avoid this, we want to try to block the user from accessing memory spaces while they're disabled. We start with a semaphore around the memory enable bit, where writers modify the memory enable state and must be serialized, while readers make use of the memory region and can access in parallel. Writers include both direct manipulation via the command register, as well as any reset path where the internal mechanics of the reset may both explicitly and implicitly disable memory access, and manipulation of the MSI-X configuration, where the MSI-X vector table resides in MMIO space of the device. Readers include the read and write file ops to access the vfio device fd offsets as well as memory mapped access. In the latter case, we make use of our new vma list support to zap, or invalidate, those memory mappings in order to force them to be faulted back in on access.
Our semaphore usage will stall user access to MMIO spaces across internal operations like reset, but the user might experience new behavior when trying to access the MMIO space while disabled via the PCI command register. Access via read or write while disabled will return -EIO and access via memory maps will result in a SIGBUS. This is expected to be compatible with known use cases and potentially provides better error handling capabilities than present in the hardware, while avoiding the more readily accessible and severe platform error responses that might otherwise occur.
Fixes: CVE-2020-12888 Reviewed-by: Peter Xu peterx@redhat.com Signed-off-by: Alex Williamson alex.williamson@redhat.com [Ajay: Regenerated the patch for v4.9] Signed-off-by: Ajay Kaher akaher@vmware.com --- drivers/vfio/pci/vfio_pci.c | 294 +++++++++++++++++++++++++++++++----- drivers/vfio/pci/vfio_pci_config.c | 36 ++++- drivers/vfio/pci/vfio_pci_intrs.c | 14 ++ drivers/vfio/pci/vfio_pci_private.h | 9 ++ drivers/vfio/pci/vfio_pci_rdwr.c | 29 +++- 5 files changed, 334 insertions(+), 48 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index c97cf6c..2254c28 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -29,6 +29,7 @@ #include <linux/vfio.h> #include <linux/vgaarb.h> #include <linux/nospec.h> +#include <linux/mm.h>
#include "vfio_pci_private.h"
@@ -181,6 +182,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); static void vfio_pci_disable(struct vfio_pci_device *vdev); +static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data);
/* * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND @@ -656,6 +658,12 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, return 0; }
+struct vfio_devices { + struct vfio_device **devices; + int cur_index; + int max_index; +}; + static long vfio_pci_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { @@ -729,7 +737,7 @@ static long vfio_pci_ioctl(void *device_data, { void __iomem *io; size_t size; - u16 orig_cmd; + u16 cmd;
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); info.flags = 0; @@ -749,10 +757,7 @@ static long vfio_pci_ioctl(void *device_data, * Is it really there? Enable memory decode for * implicit access in pci_map_rom(). */ - pci_read_config_word(pdev, PCI_COMMAND, &orig_cmd); - pci_write_config_word(pdev, PCI_COMMAND, - orig_cmd | PCI_COMMAND_MEMORY); - + cmd = vfio_pci_memory_lock_and_enable(vdev); io = pci_map_rom(pdev, &size); if (io) { info.flags = VFIO_REGION_INFO_FLAG_READ; @@ -760,8 +765,8 @@ static long vfio_pci_ioctl(void *device_data, } else { info.size = 0; } + vfio_pci_memory_unlock_and_restore(vdev, cmd);
- pci_write_config_word(pdev, PCI_COMMAND, orig_cmd); break; } case VFIO_PCI_VGA_REGION_INDEX: @@ -909,8 +914,16 @@ static long vfio_pci_ioctl(void *device_data, return ret;
} else if (cmd == VFIO_DEVICE_RESET) { - return vdev->reset_works ? - pci_try_reset_function(vdev->pdev) : -EINVAL; + int ret; + + if (!vdev->reset_works) + return -EINVAL; + + vfio_pci_zap_and_down_write_memory_lock(vdev); + ret = pci_try_reset_function(vdev->pdev); + up_write(&vdev->memory_lock); + + return ret;
} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { struct vfio_pci_hot_reset_info hdr; @@ -990,8 +1003,9 @@ static long vfio_pci_ioctl(void *device_data, int32_t *group_fds; struct vfio_pci_group_entry *groups; struct vfio_pci_group_info info; + struct vfio_devices devs = { .cur_index = 0 }; bool slot = false; - int i, count = 0, ret = 0; + int i, group_idx, mem_idx = 0, count = 0, ret = 0;
minsz = offsetofend(struct vfio_pci_hot_reset, count);
@@ -1043,9 +1057,9 @@ static long vfio_pci_ioctl(void *device_data, * user interface and store the group and iommu ID. This * ensures the group is held across the reset. */ - for (i = 0; i < hdr.count; i++) { + for (group_idx = 0; group_idx < hdr.count; group_idx++) { struct vfio_group *group; - struct fd f = fdget(group_fds[i]); + struct fd f = fdget(group_fds[group_idx]); if (!f.file) { ret = -EBADF; break; @@ -1058,8 +1072,9 @@ static long vfio_pci_ioctl(void *device_data, break; }
- groups[i].group = group; - groups[i].id = vfio_external_user_iommu_id(group); + groups[group_idx].group = group; + groups[group_idx].id = + vfio_external_user_iommu_id(group); }
kfree(group_fds); @@ -1078,14 +1093,65 @@ static long vfio_pci_ioctl(void *device_data, ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_validate_devs, &info, slot); - if (!ret) - /* User has access, do the reset */ - ret = slot ? pci_try_reset_slot(vdev->pdev->slot) : - pci_try_reset_bus(vdev->pdev->bus); + + if (ret) + goto hot_reset_release; + + devs.max_index = count; + devs.devices = kcalloc(count, sizeof(struct vfio_device *), + GFP_KERNEL); + if (!devs.devices) { + ret = -ENOMEM; + goto hot_reset_release; + } + + /* + * We need to get memory_lock for each device, but devices + * can share mmap_sem, therefore we need to zap and hold + * the vma_lock for each device, and only then get each + * memory_lock. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_try_zap_and_vma_lock_cb, + &devs, slot); + if (ret) + goto hot_reset_release; + + for (; mem_idx < devs.cur_index; mem_idx++) { + struct vfio_pci_device *tmp; + + tmp = vfio_device_data(devs.devices[mem_idx]); + + ret = down_write_trylock(&tmp->memory_lock); + if (!ret) { + ret = -EBUSY; + goto hot_reset_release; + } + mutex_unlock(&tmp->vma_lock); + } + + /* User has access, do the reset */ + ret = slot ? pci_try_reset_slot(vdev->pdev->slot) : + pci_try_reset_bus(vdev->pdev->bus);
hot_reset_release: - for (i--; i >= 0; i--) - vfio_group_put_external_user(groups[i].group); + for (i = 0; i < devs.cur_index; i++) { + struct vfio_device *device; + struct vfio_pci_device *tmp; + + device = devs.devices[i]; + tmp = vfio_device_data(device); + + if (i < mem_idx) + up_write(&tmp->memory_lock); + else + mutex_unlock(&tmp->vma_lock); + vfio_device_put(device); + } + kfree(devs.devices); + + for (group_idx--; group_idx >= 0; group_idx--) + vfio_group_put_external_user(groups[group_idx].group);
kfree(groups); return ret; @@ -1144,8 +1210,126 @@ static ssize_t vfio_pci_write(void *device_data, const char __user *buf, return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); }
-static int vfio_pci_add_vma(struct vfio_pci_device *vdev, - struct vm_area_struct *vma) +/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ +static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try) +{ + struct vfio_pci_mmap_vma *mmap_vma, *tmp; + + /* + * Lock ordering: + * vma_lock is nested under mmap_sem for vm_ops callback paths. + * The memory_lock semaphore is used by both code paths calling + * into this function to zap vmas and the vm_ops.fault callback + * to protect the memory enable state of the device. + * + * When zapping vmas we need to maintain the mmap_sem => vma_lock + * ordering, which requires using vma_lock to walk vma_list to + * acquire an mm, then dropping vma_lock to get the mmap_sem and + * reacquiring vma_lock. This logic is derived from similar + * requirements in uverbs_user_mmap_disassociate(). + * + * mmap_sem must always be the top-level lock when it is taken. + * Therefore we can only hold the memory_lock write lock when + * vma_list is empty, as we'd need to take mmap_sem to clear + * entries. vma_list can only be guaranteed empty when holding + * vma_lock, thus memory_lock is nested under vma_lock. + * + * This enables the vm_ops.fault callback to acquire vma_lock, + * followed by memory_lock read lock, while already holding + * mmap_sem without risk of deadlock. + */ + while (1) { + struct mm_struct *mm = NULL; + + if (try) { + if (!mutex_trylock(&vdev->vma_lock)) + return 0; + } else { + mutex_lock(&vdev->vma_lock); + } + while (!list_empty(&vdev->vma_list)) { + mmap_vma = list_first_entry(&vdev->vma_list, + struct vfio_pci_mmap_vma, + vma_next); + mm = mmap_vma->vma->vm_mm; + if (mmget_not_zero(mm)) + break; + + list_del(&mmap_vma->vma_next); + kfree(mmap_vma); + mm = NULL; + } + if (!mm) + return 1; + mutex_unlock(&vdev->vma_lock); + + if (try) { + if (!down_read_trylock(&mm->mmap_sem)) { + mmput(mm); + return 0; + } + } else { + down_read(&mm->mmap_sem); + } + if (mmget_still_valid(mm)) { + if (try) { + if (!mutex_trylock(&vdev->vma_lock)) { + up_read(&mm->mmap_sem); + mmput(mm); + return 0; + } + } else { + mutex_lock(&vdev->vma_lock); + } + list_for_each_entry_safe(mmap_vma, tmp, + &vdev->vma_list, vma_next) { + struct vm_area_struct *vma = mmap_vma->vma; + + if (vma->vm_mm != mm) + continue; + + list_del(&mmap_vma->vma_next); + kfree(mmap_vma); + + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); + } + mutex_unlock(&vdev->vma_lock); + } + up_read(&mm->mmap_sem); + mmput(mm); + } +} + +void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device *vdev) +{ + vfio_pci_zap_and_vma_lock(vdev, false); + down_write(&vdev->memory_lock); + mutex_unlock(&vdev->vma_lock); +} + +u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev) +{ + u16 cmd; + + down_write(&vdev->memory_lock); + pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd); + if (!(cmd & PCI_COMMAND_MEMORY)) + pci_write_config_word(vdev->pdev, PCI_COMMAND, + cmd | PCI_COMMAND_MEMORY); + + return cmd; +} + +void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, u16 cmd) +{ + pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd); + up_write(&vdev->memory_lock); +} + +/* Caller holds vma_lock */ +static int __vfio_pci_add_vma(struct vfio_pci_device *vdev, + struct vm_area_struct *vma) { struct vfio_pci_mmap_vma *mmap_vma;
@@ -1154,10 +1338,7 @@ static int vfio_pci_add_vma(struct vfio_pci_device *vdev, return -ENOMEM;
mmap_vma->vma = vma; - - mutex_lock(&vdev->vma_lock); list_add(&mmap_vma->vma_next, &vdev->vma_list); - mutex_unlock(&vdev->vma_lock);
return 0; } @@ -1190,15 +1371,32 @@ static void vfio_pci_mmap_close(struct vm_area_struct *vma) static int vfio_pci_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct vfio_pci_device *vdev = vma->vm_private_data; + int ret = VM_FAULT_NOPAGE; + + mutex_lock(&vdev->vma_lock); + down_read(&vdev->memory_lock); + + if (!__vfio_pci_memory_enabled(vdev)) { + ret = VM_FAULT_SIGBUS; + mutex_unlock(&vdev->vma_lock); + goto up_out; + } + + if (__vfio_pci_add_vma(vdev, vma)) { + ret = VM_FAULT_OOM; + mutex_unlock(&vdev->vma_lock); + goto up_out; + }
- if (vfio_pci_add_vma(vdev, vma)) - return VM_FAULT_OOM; + mutex_unlock(&vdev->vma_lock);
if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, vma->vm_page_prot)) - return VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS;
- return VM_FAULT_NOPAGE; +up_out: + up_read(&vdev->memory_lock); + return ret; }
static const struct vm_operations_struct vfio_pci_mmap_ops = { @@ -1339,6 +1537,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
mutex_init(&vdev->vma_lock); INIT_LIST_HEAD(&vdev->vma_list); + init_rwsem(&vdev->memory_lock); ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { vfio_iommu_group_put(group, &pdev->dev); @@ -1432,12 +1631,6 @@ static struct pci_driver vfio_pci_driver = { .err_handler = &vfio_err_handlers, };
-struct vfio_devices { - struct vfio_device **devices; - int cur_index; - int max_index; -}; - static int vfio_pci_get_devs(struct pci_dev *pdev, void *data) { struct vfio_devices *devs = data; @@ -1459,6 +1652,39 @@ static int vfio_pci_get_devs(struct pci_dev *pdev, void *data) return 0; }
+static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) +{ + struct vfio_devices *devs = data; + struct vfio_device *device; + struct vfio_pci_device *vdev; + + if (devs->cur_index == devs->max_index) + return -ENOSPC; + + device = vfio_device_get_from_dev(&pdev->dev); + if (!device) + return -EINVAL; + + if (pci_dev_driver(pdev) != &vfio_pci_driver) { + vfio_device_put(device); + return -EBUSY; + } + + vdev = vfio_device_data(device); + + /* + * Locking multiple devices is prone to deadlock, runaway and + * unwind if we hit contention. + */ + if (!vfio_pci_zap_and_vma_lock(vdev, true)) { + vfio_device_put(device); + return -EBUSY; + } + + devs->devices[devs->cur_index++] = device; + return 0; +} + /* * Attempt to do a bus/slot reset if there are devices affected by a reset for * this device that are needs_reset and all of the affected devices are unused diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 84905d0..5c3239a 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -400,6 +400,14 @@ static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write) *(__le32 *)(&p->write[off]) = cpu_to_le32(write); }
+/* Caller should hold memory_lock semaphore */ +bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev) +{ + u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); + + return cmd & PCI_COMMAND_MEMORY; +} + /* * Restore the *real* BARs after we detect a FLR or backdoor reset. * (backdoor = some device specific technique that we didn't catch) @@ -560,13 +568,18 @@ static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
new_cmd = le32_to_cpu(val);
+ phys_io = !!(phys_cmd & PCI_COMMAND_IO); + virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO); + new_io = !!(new_cmd & PCI_COMMAND_IO); + phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY); virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); new_mem = !!(new_cmd & PCI_COMMAND_MEMORY);
- phys_io = !!(phys_cmd & PCI_COMMAND_IO); - virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO); - new_io = !!(new_cmd & PCI_COMMAND_IO); + if (!new_mem) + vfio_pci_zap_and_down_write_memory_lock(vdev); + else + down_write(&vdev->memory_lock);
/* * If the user is writing mem/io enable (new_mem/io) and we @@ -583,8 +596,11 @@ static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, }
count = vfio_default_config_write(vdev, pos, count, perm, offset, val); - if (count < 0) + if (count < 0) { + if (offset == PCI_COMMAND) + up_write(&vdev->memory_lock); return count; + }
/* * Save current memory/io enable bits in vconfig to allow for @@ -595,6 +611,8 @@ static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos,
*virt_cmd &= cpu_to_le16(~mask); *virt_cmd |= cpu_to_le16(new_cmd & mask); + + up_write(&vdev->memory_lock); }
/* Emulate INTx disable */ @@ -832,8 +850,11 @@ static int vfio_exp_config_write(struct vfio_pci_device *vdev, int pos, pos - offset + PCI_EXP_DEVCAP, &cap);
- if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) + if (!ret && (cap & PCI_EXP_DEVCAP_FLR)) { + vfio_pci_zap_and_down_write_memory_lock(vdev); pci_try_reset_function(vdev->pdev); + up_write(&vdev->memory_lock); + } }
/* @@ -911,8 +932,11 @@ static int vfio_af_config_write(struct vfio_pci_device *vdev, int pos, pos - offset + PCI_AF_CAP, &cap);
- if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) + if (!ret && (cap & PCI_AF_CAP_FLR) && (cap & PCI_AF_CAP_TP)) { + vfio_pci_zap_and_down_write_memory_lock(vdev); pci_try_reset_function(vdev->pdev); + up_write(&vdev->memory_lock); + } }
return count; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 94594dc..bdfdd50 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -252,6 +252,7 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) struct pci_dev *pdev = vdev->pdev; unsigned int flag = msix ? PCI_IRQ_MSIX : PCI_IRQ_MSI; int ret; + u16 cmd;
if (!is_irq_none(vdev)) return -EINVAL; @@ -261,13 +262,16 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) return -ENOMEM;
/* return the number of supported vectors if we can't get all: */ + cmd = vfio_pci_memory_lock_and_enable(vdev); ret = pci_alloc_irq_vectors(pdev, 1, nvec, flag); if (ret < nvec) { if (ret > 0) pci_free_irq_vectors(pdev); + vfio_pci_memory_unlock_and_restore(vdev, cmd); kfree(vdev->ctx); return ret; } + vfio_pci_memory_unlock_and_restore(vdev, cmd);
vdev->num_ctx = nvec; vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX : @@ -290,6 +294,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, struct pci_dev *pdev = vdev->pdev; struct eventfd_ctx *trigger; int irq, ret; + u16 cmd;
if (vector < 0 || vector >= vdev->num_ctx) return -EINVAL; @@ -298,7 +303,11 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
if (vdev->ctx[vector].trigger) { irq_bypass_unregister_producer(&vdev->ctx[vector].producer); + + cmd = vfio_pci_memory_lock_and_enable(vdev); free_irq(irq, vdev->ctx[vector].trigger); + vfio_pci_memory_unlock_and_restore(vdev, cmd); + kfree(vdev->ctx[vector].name); eventfd_ctx_put(vdev->ctx[vector].trigger); vdev->ctx[vector].trigger = NULL; @@ -326,6 +335,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, * such a reset it would be unsuccessful. To avoid this, restore the * cached value of the message prior to enabling. */ + cmd = vfio_pci_memory_lock_and_enable(vdev); if (msix) { struct msi_msg msg;
@@ -335,6 +345,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
ret = request_irq(irq, vfio_msihandler, 0, vdev->ctx[vector].name, trigger); + vfio_pci_memory_unlock_and_restore(vdev, cmd); if (ret) { kfree(vdev->ctx[vector].name); eventfd_ctx_put(trigger); @@ -379,6 +390,7 @@ static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) { struct pci_dev *pdev = vdev->pdev; int i; + u16 cmd;
for (i = 0; i < vdev->num_ctx; i++) { vfio_virqfd_disable(&vdev->ctx[i].unmask); @@ -387,7 +399,9 @@ static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
+ cmd = vfio_pci_memory_lock_and_enable(vdev); pci_free_irq_vectors(pdev); + vfio_pci_memory_unlock_and_restore(vdev, cmd);
/* * Both disable paths above use pci_intx_for_msi() to clear DisINTx diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 4b78f12..f896ceb 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -102,6 +102,7 @@ struct vfio_pci_device { struct list_head dummy_resources_list; struct mutex vma_lock; struct list_head vma_list; + struct rw_semaphore memory_lock; };
#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) @@ -137,6 +138,14 @@ extern int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, unsigned int type, unsigned int subtype, const struct vfio_pci_regops *ops, size_t size, u32 flags, void *data); + +extern bool __vfio_pci_memory_enabled(struct vfio_pci_device *vdev); +extern void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_device + *vdev); +extern u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_device *vdev); +extern void vfio_pci_memory_unlock_and_restore(struct vfio_pci_device *vdev, + u16 cmd); + #ifdef CONFIG_VFIO_PCI_IGD extern int vfio_pci_igd_init(struct vfio_pci_device *vdev); #else diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 357243d..6445461 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -122,6 +122,7 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, size_t x_start = 0, x_end = 0; resource_size_t end; void __iomem *io; + struct resource *res = &vdev->pdev->resource[bar]; ssize_t done;
if (pci_resource_start(pdev, bar)) @@ -137,6 +138,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
count = min(count, (size_t)(end - pos));
+ if (res->flags & IORESOURCE_MEM) { + down_read(&vdev->memory_lock); + if (!__vfio_pci_memory_enabled(vdev)) { + up_read(&vdev->memory_lock); + return -EIO; + } + } + if (bar == PCI_ROM_RESOURCE) { /* * The ROM can fill less space than the BAR, so we start the @@ -144,20 +153,21 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, * filling large ROM BARs much faster. */ io = pci_map_rom(pdev, &x_start); - if (!io) - return -ENOMEM; + if (!io) { + done = -ENOMEM; + goto out; + } x_end = end; } else if (!vdev->barmap[bar]) { - int ret; - - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); - if (ret) - return ret; + done = pci_request_selected_regions(pdev, 1 << bar, "vfio"); + if (done) + goto out;
io = pci_iomap(pdev, bar, 0); if (!io) { pci_release_selected_regions(pdev, 1 << bar); - return -ENOMEM; + done = -ENOMEM; + goto out; }
vdev->barmap[bar] = io; @@ -176,6 +186,9 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf,
if (bar == PCI_ROM_RESOURCE) pci_unmap_rom(pdev, io); +out: + if (res->flags & IORESOURCE_MEM) + up_read(&vdev->memory_lock);
return done; }
CVE-2020-12888 Kernel: vfio: access to disabled MMIO space of some devices may lead to DoS scenario
The VFIO modules allow users (guest VMs) to enable or disable access to the devices' MMIO memory address spaces. If a user attempts to access (read/write) the devices' MMIO address space when it is disabled, some h/w devices issue an interrupt to the CPU to indicate a fatal error condition, crashing the system. This flaw allows a guest user or process to crash the host system resulting in a denial of service.
Patch 1/ is to force the user fault if PFNMAP vma might be DMA mapped before user access.
Patch 2/ setup a vm_ops handler to support dynamic faulting instead of calling remap_pfn_range(). Also provides a list of vmas actively mapping the area which can later use to invalidate those mappings.
Patch 3/ block the user from accessing memory spaces which is disabled by using new vma list support to zap, or invalidate, those memory mappings in order to force them to be faulted back in on access.
Upstreamed patches link: https://lore.kernel.org/kvm/158871401328.15589.17598154478222071285.stgit@gi...
[PATCH v4.9.y 1/3]: Backporting of upsream commit 41311242221e: vfio/type1: Support faulting PFNMAP vmas
[PATCH v4.9.y 2/3]: Backporting of upsream commit 11c4cd07ba11: vfio-pci: Fault mmaps to enable vma tracking
[PATCH v4.9.y 3/3]: Backporting of upsream commit abafbc551fdd: vfio-pci: Invalidate mmaps and block MMIO access on disabled memory
On Tue, Sep 08, 2020 at 02:05:17AM +0530, Ajay Kaher wrote:
CVE-2020-12888 Kernel: vfio: access to disabled MMIO space of some devices may lead to DoS scenario The VFIO modules allow users (guest VMs) to enable or disable access to the devices' MMIO memory address spaces. If a user attempts to access (read/write) the devices' MMIO address space when it is disabled, some h/w devices issue an interrupt to the CPU to indicate a fatal error condition, crashing the system. This flaw allows a guest user or process to crash the host system resulting in a denial of service. Patch 1/ is to force the user fault if PFNMAP vma might be DMA mapped before user access. Patch 2/ setup a vm_ops handler to support dynamic faulting instead of calling remap_pfn_range(). Also provides a list of vmas actively mapping the area which can later use to invalidate those mappings. Patch 3/ block the user from accessing memory spaces which is disabled by using new vma list support to zap, or invalidate, those memory mappings in order to force them to be faulted back in on access. Upstreamed patches link: https://lore.kernel.org/kvm/158871401328.15589.17598154478222071285.stgit@gi... [PATCH v4.9.y 1/3]: Backporting of upsream commit 41311242221e: vfio/type1: Support faulting PFNMAP vmas [PATCH v4.9.y 2/3]: Backporting of upsream commit 11c4cd07ba11: vfio-pci: Fault mmaps to enable vma tracking [PATCH v4.9.y 3/3]: Backporting of upsream commit abafbc551fdd: vfio-pci: Invalidate mmaps and block MMIO access on disabled memory
All now queued up, thanks.
greg k-h
On Tue, 8 Sep 2020 02:05:17 +0530 Ajay Kaher akaher@vmware.com wrote:
CVE-2020-12888 Kernel: vfio: access to disabled MMIO space of some devices may lead to DoS scenario The VFIO modules allow users (guest VMs) to enable or disable access to the devices' MMIO memory address spaces. If a user attempts to access (read/write) the devices' MMIO address space when it is disabled, some h/w devices issue an interrupt to the CPU to indicate a fatal error condition, crashing the system. This flaw allows a guest user or process to crash the host system resulting in a denial of service. Patch 1/ is to force the user fault if PFNMAP vma might be DMA mapped before user access. Patch 2/ setup a vm_ops handler to support dynamic faulting instead of calling remap_pfn_range(). Also provides a list of vmas actively mapping the area which can later use to invalidate those mappings. Patch 3/ block the user from accessing memory spaces which is disabled by using new vma list support to zap, or invalidate, those memory mappings in order to force them to be faulted back in on access. Upstreamed patches link: https://lore.kernel.org/kvm/158871401328.15589.17598154478222071285.stgit@gi... [PATCH v4.9.y 1/3]: Backporting of upsream commit 41311242221e: vfio/type1: Support faulting PFNMAP vmas [PATCH v4.9.y 2/3]: Backporting of upsream commit 11c4cd07ba11: vfio-pci: Fault mmaps to enable vma tracking [PATCH v4.9.y 3/3]: Backporting of upsream commit abafbc551fdd: vfio-pci: Invalidate mmaps and block MMIO access on disabled memory
I'd recommend also including the following or else SR-IOV VFs will be broken for DPDK:
commit ebfa440ce38b7e2e04c3124aa89c8a9f4094cf21 Author: Alex Williamson alex.williamson@redhat.com Date: Thu Jun 25 11:04:23 2020 -0600
vfio/pci: Fix SR-IOV VF handling with MMIO blocking
SR-IOV VFs do not implement the memory enable bit of the command register, therefore this bit is not set in config space after pci_enable_device(). This leads to an unintended difference between PF and VF in hand-off state to the user. We can correct this by setting the initial value of the memory enable bit in our virtualized config space. There's really no need however to ever fault a user on a VF though as this would only indicate an error in the user's management of the enable bit, versus a PF where the same access could trigger hardware faults.
Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory") Signed-off-by: Alex Williamson alex.williamson@redhat.com
linux-stable-mirror@lists.linaro.org