On Fri, Jan 10, 2025 at 07:32:23PM -0800, Nicolin Chen wrote:
From: Jason Gunthorpe <jgg@nvidia.com>
iommufd has a model where the iommu_domain can be changed while the VFIO device is attached. In this case the MSI should continue to work. This corner case has not worked because the dma-iommu implementation of sw_msi is tied to a single domain.
Implement the sw_msi mapping directly and use a global per-fd table to associate assigned iova to the MSI pages. This allows the MSI pages to loaded into a domain before it is attached ensuring that MSI is not
s/loaded/be loaded/ ?
disrupted.
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
[nicolinc: set sw_msi pointer in nested hwpt allocators]
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_private.h |  23 +++-
 drivers/iommu/iommufd/device.c          | 158 ++++++++++++++++++++----
 drivers/iommu/iommufd/hw_pagetable.c    |   3 +
 drivers/iommu/iommufd/main.c            |   9 ++
 4 files changed, 170 insertions(+), 23 deletions(-)
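A note for readers skimming the diff: the "global per-fd table" in the
commit message boils down to handing each distinct (sw_msi_start, msi_addr)
pair a stable page slot, so every domain maps a given doorbell at the same
IOVA. Schematically (simplified, not the patch's code):

	/* each map gets a fixed pgoff within the SW_MSI window */
	iova = map->sw_msi_start + map->pgoff * PAGE_SIZE;

That invariant is what allows a page to be installed into a new domain
before attach without the IRQ layer having to re-program anything.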
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 063c0a42f54f..3e83bbb5912c 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -19,6 +19,22 @@ struct iommu_group;
 struct iommu_option;
 struct iommufd_device;
 
+struct iommufd_sw_msi_map {
+	struct list_head sw_msi_item;
+	phys_addr_t sw_msi_start;
+	phys_addr_t msi_addr;
+	unsigned int pgoff;
+	unsigned int id;
+};
+
+/* Bitmap of struct iommufd_sw_msi_map::id */
+struct iommufd_sw_msi_maps {
+	DECLARE_BITMAP(bitmap, 64);
+};
+
+int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+		   phys_addr_t msi_addr);
+
 struct iommufd_ctx {
 	struct file *file;
 	struct xarray objects;
@@ -26,6 +42,10 @@ struct iommufd_ctx {
 	wait_queue_head_t destroy_wait;
 	struct rw_semaphore ioas_creation_lock;
 
+	struct mutex sw_msi_lock;
+	struct list_head sw_msi_list;
+	unsigned int sw_msi_id;
+
 	u8 account_mode;
 	/* Compatibility with VFIO no iommu */
 	u8 no_iommu_mode;
@@ -283,10 +303,10 @@ struct iommufd_hwpt_paging {
 	struct iommufd_ioas *ioas;
 	bool auto_domain : 1;
 	bool enforce_cache_coherency : 1;
-	bool msi_cookie : 1;
 	bool nest_parent : 1;
 	/* Head at iommufd_ioas::hwpt_list */
 	struct list_head hwpt_item;
+	struct iommufd_sw_msi_maps present_sw_msi;
 };
 
 struct iommufd_hwpt_nested {
@@ -383,6 +403,7 @@ struct iommufd_group {
 	struct iommu_group *group;
 	struct iommufd_hw_pagetable *hwpt;
 	struct list_head device_list;
+	struct iommufd_sw_msi_maps required_sw_msi;
 	phys_addr_t sw_msi_start;
 };
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 38b31b652147..f75b3c23cd41 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -5,6 +5,7 @@
 #include <linux/iommufd.h>
 #include <linux/slab.h>
 #include <uapi/linux/iommufd.h>
+#include <linux/msi.h>
 
 #include "../iommu-priv.h"
 #include "io_pagetable.h"
@@ -293,36 +294,149 @@ u32 iommufd_device_to_id(struct iommufd_device *idev)
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD");
 
+/*
+ * Get a iommufd_sw_msi_map for the msi physical address requested by the irq
+ * layer. The mapping to IOVA is global to the iommufd file descriptor, every
+ * domain that is attached to a device using the same MSI parameters will use
+ * the same IOVA.
+ */
+static struct iommufd_sw_msi_map *
+iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr,
+		       phys_addr_t sw_msi_start)
+{
+	struct iommufd_sw_msi_map *cur;
+	unsigned int max_pgoff = 0;
+
+	lockdep_assert_held(&ictx->sw_msi_lock);
+	list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
+		if (cur->sw_msi_start != sw_msi_start)
+			continue;
+		max_pgoff = max(max_pgoff, cur->pgoff + 1);
+		if (cur->msi_addr == msi_addr)
+			return cur;
+	}
+
+	if (ictx->sw_msi_id >=
+	    BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap))
+		return ERR_PTR(-EOVERFLOW);
+
+	cur = kzalloc(sizeof(*cur), GFP_KERNEL);
+	if (!cur)
+		return ERR_PTR(-ENOMEM);
+	cur->sw_msi_start = sw_msi_start;
+	cur->msi_addr = msi_addr;
+	cur->pgoff = max_pgoff;
+	cur->id = ictx->sw_msi_id++;
+	list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list);
+	return cur;
+}
+
+static int iommufd_sw_msi_install(struct iommufd_ctx *ictx,
+				  struct iommufd_hwpt_paging *hwpt_paging,
+				  struct iommufd_sw_msi_map *msi_map)
+{
+	unsigned long iova;
+
+	lockdep_assert_held(&ictx->sw_msi_lock);
+
+	iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
+	if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) {
+		int rc;
+
+		rc = iommu_map(hwpt_paging->common.domain, iova,
+			       msi_map->msi_addr, PAGE_SIZE,
+			       IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO,
+			       GFP_KERNEL_ACCOUNT);
+		if (rc)
+			return rc;
+		set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap);
+	}
+	return 0;
+}
So, does sw_msi_lock protect the present_sw_msi bitmap? If so, you should
use the non-atomic __set_bit(). If not, you'd need something like:

	if (test_and_set_bit(...))
		return 0;

	rc = iommu_map(...);
	if (rc)
		clear_bit(...);

	return rc;

As it stands, this reads as a series of atomic accesses, but the sequence
as a whole is not atomic, and that is misleading.
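For instance (untested sketch, assuming sw_msi_lock really is the only
serialization for present_sw_msi):

	static int iommufd_sw_msi_install(struct iommufd_ctx *ictx,
					  struct iommufd_hwpt_paging *hwpt_paging,
					  struct iommufd_sw_msi_map *msi_map)
	{
		unsigned long iova;
		int rc;

		lockdep_assert_held(&ictx->sw_msi_lock);

		iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
		/* sw_msi_lock serializes writers, non-atomic op is enough */
		if (__test_and_set_bit(msi_map->id,
				       hwpt_paging->present_sw_msi.bitmap))
			return 0;

		rc = iommu_map(hwpt_paging->common.domain, iova,
			       msi_map->msi_addr, PAGE_SIZE,
			       IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO,
			       GFP_KERNEL_ACCOUNT);
		if (rc)
			__clear_bit(msi_map->id,
				    hwpt_paging->present_sw_msi.bitmap);
		return rc;
	}

This also unwinds the bitmap if iommu_map() fails, so a failed install
doesn't leave a stale present bit behind.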
+/*
+ * Called by the irq code if the platform translates the MSI address through the
+ * IOMMU. msi_addr is the physical address of the MSI page. iommufd will
+ * allocate a fd global iova for the physical page that is the same on all
+ * domains and devices.
+ */
+#ifdef CONFIG_IRQ_MSI_IOMMU
+int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
+		   phys_addr_t msi_addr)
+{
+	struct device *dev = msi_desc_to_dev(desc);
+	struct iommu_attach_handle *raw_handle;
+	struct iommufd_hwpt_paging *hwpt_paging;
+	struct iommufd_attach_handle *handle;
+	struct iommufd_sw_msi_map *msi_map;
+	struct iommufd_ctx *ictx;
+	unsigned long iova;
+	int rc;
+
+	raw_handle =
+		iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0);
Nit: no need to break the line.
+	if (!raw_handle)
+		return 0;
+	hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt);
+
+	handle = to_iommufd_handle(raw_handle);
+	/* No IOMMU_RESV_SW_MSI means no change to the msi_msg */
+	if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX)
+		return 0;
+
+	ictx = handle->idev->ictx;
+	guard(mutex)(&ictx->sw_msi_lock);
+	/*
+	 * The input msi_addr is the exact byte offset of the MSI doorbell, we
+	 * assume the caller has checked that it is contained with a MMIO region
+	 * that is secure to map at PAGE_SIZE.
+	 */
+	msi_map = iommufd_sw_msi_get_map(handle->idev->ictx,
+					 msi_addr & PAGE_MASK,
+					 handle->idev->igroup->sw_msi_start);
+	if (IS_ERR(msi_map))
+		return PTR_ERR(msi_map);
+
+	rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map);
+	if (rc)
+		return rc;
+	set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap);
Same here. I guess sw_msi_lock protects required_sw_msi.bitmap, right?
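If it does, the non-atomic helper under the guard would document that,
e.g. (sketch):

	/* sw_msi_lock is still held here via the guard(mutex) above */
	__set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap);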
Thanks,
Yury
+
+	iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE;
+	msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT);
+	return 0;
+}
+#endif
+
+/*
+ * FIXME: when a domain is removed any ids that are not in the union of
+ * all still attached devices should be removed.
+ */
 static int iommufd_group_setup_msi(struct iommufd_group *igroup,
 				   struct iommufd_hwpt_paging *hwpt_paging)
 {
-	phys_addr_t sw_msi_start = igroup->sw_msi_start;
-	int rc;
+	struct iommufd_ctx *ictx = igroup->ictx;
+	struct iommufd_sw_msi_map *cur;
+
+	if (igroup->sw_msi_start == PHYS_ADDR_MAX)
+		return 0;
 
 	/*
-	 * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to
-	 * call iommu_get_msi_cookie() on its behalf. This is necessary to setup
-	 * the MSI window so iommu_dma_prepare_msi() can install pages into our
-	 * domain after request_irq(). If it is not done interrupts will not
-	 * work on this domain.
-	 *
-	 * FIXME: This is conceptually broken for iommufd since we want to allow
-	 * userspace to change the domains, eg switch from an identity IOAS to a
-	 * DMA IOAS. There is currently no way to create a MSI window that
-	 * matches what the IRQ layer actually expects in a newly created
-	 * domain.
+	 * Install all the MSI pages the device has been using into the domain
 	 */
-	if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
-		rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
-					  sw_msi_start);
+	guard(mutex)(&ictx->sw_msi_lock);
+	list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
+		int rc;
+
+		if (cur->sw_msi_start != igroup->sw_msi_start ||
+		    !test_bit(cur->id, igroup->required_sw_msi.bitmap))
+			continue;
+
+		rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur);
 		if (rc)
 			return rc;
-
-		/*
-		 * iommu_get_msi_cookie() can only be called once per domain,
-		 * it returns -EBUSY on later calls.
-		 */
-		hwpt_paging->msi_cookie = true;
 	}
-
 	return 0;
 }
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index f7c0d7b214b6..538484eecb3b 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -156,6 +156,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 			goto out_abort;
 		}
 	}
+	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
 
 	/*
 	 * Set the coherency mode before we do iopt_table_add_domain() as some
@@ -251,6 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
 		goto out_abort;
 	}
 	hwpt->domain->owner = ops;
+	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
 
 	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
 		rc = -EINVAL;
@@ -303,6 +305,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags,
 		goto out_abort;
 	}
 	hwpt->domain->owner = viommu->iommu_dev->ops;
+	iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi);
 
 	if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
 		rc = -EINVAL;
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 97c5e3567d33..7cc9497b7193 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -227,6 +227,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
 	xa_init(&ictx->groups);
 	ictx->file = filp;
 	init_waitqueue_head(&ictx->destroy_wait);
+	mutex_init(&ictx->sw_msi_lock);
+	INIT_LIST_HEAD(&ictx->sw_msi_list);
 	filp->private_data = ictx;
 	return 0;
 }
@@ -234,6 +236,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp)
 static int iommufd_fops_release(struct inode *inode, struct file *filp)
 {
 	struct iommufd_ctx *ictx = filp->private_data;
+	struct iommufd_sw_msi_map *next;
+	struct iommufd_sw_msi_map *cur;
 	struct iommufd_object *obj;
 
 	/*
@@ -262,6 +266,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp)
 		break;
 	}
 	WARN_ON(!xa_empty(&ictx->groups));
+
+	mutex_destroy(&ictx->sw_msi_lock);
+	list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item)
+		kfree(cur);
+
 	kfree(ictx);
 	return 0;
 }
--
2.43.0