PATCH-1 is an optional cleanup. PATCH-2 is essential infrastructure for PATCH-3 to verify the loopback for IOMMU_RESV_SW_MSI.
You can find this series, applied on top of the other two earlier patches, here: https://github.com/nicolinc/iommufd/commits/iommufd_selftest_sw_msi/
Changelog
v2:
 * Resent with a proper bug fix
Thanks
Nicolin
Nicolin Chen (2):
  iommufd: Reorder include files
  iommufd/selftest: Add coverage for reserved IOVAs

Robin Murphy (1):
  iommu/dma: Support MSIs through nested domains
 drivers/iommu/dma-iommu.c                     | 18 +++-
 drivers/iommu/iommufd/device.c                |  4 +-
 drivers/iommu/iommufd/fault.c                 |  4 +-
 drivers/iommu/iommufd/io_pagetable.c          |  8 +-
 drivers/iommu/iommufd/io_pagetable.h          |  2 +-
 drivers/iommu/iommufd/ioas.c                  |  2 +-
 drivers/iommu/iommufd/iommufd_private.h       |  9 +-
 drivers/iommu/iommufd/iommufd_test.h          |  6 +-
 drivers/iommu/iommufd/iova_bitmap.c           |  2 +-
 drivers/iommu/iommufd/main.c                  |  8 +-
 drivers/iommu/iommufd/pages.c                 | 10 +-
 drivers/iommu/iommufd/selftest.c              | 92 ++++++++++++++++++-
 include/linux/iommu.h                         |  4 +
 include/linux/iommufd.h                       |  4 +-
 include/uapi/linux/iommufd.h                  |  2 +-
 tools/testing/selftests/iommu/iommufd_utils.h |  9 ++
 16 files changed, 150 insertions(+), 34 deletions(-)
Reorder include files to alphabetic order to simplify maintenance, and separate local headers and global headers with a blank line.
No functional change intended.
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/device.c          |  4 ++--
 drivers/iommu/iommufd/fault.c           |  4 ++--
 drivers/iommu/iommufd/io_pagetable.c    |  8 ++++----
 drivers/iommu/iommufd/io_pagetable.h    |  2 +-
 drivers/iommu/iommufd/ioas.c            |  2 +-
 drivers/iommu/iommufd/iommufd_private.h |  9 +++++----
 drivers/iommu/iommufd/iommufd_test.h    |  2 +-
 drivers/iommu/iommufd/iova_bitmap.c     |  2 +-
 drivers/iommu/iommufd/main.c            |  8 ++++----
 drivers/iommu/iommufd/pages.c           | 10 +++++-----
 drivers/iommu/iommufd/selftest.c        |  8 ++++----
 include/linux/iommufd.h                 |  4 ++--
 include/uapi/linux/iommufd.h            |  2 +-
 13 files changed, 33 insertions(+), 32 deletions(-)
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 2ac26a948987e..eaa1fdf71bb0e 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */
+#include <linux/iommu.h>
 #include <linux/iommufd.h>
 #include <linux/slab.h>
-#include <linux/iommu.h>
 #include <uapi/linux/iommufd.h>
-#include "../iommu-priv.h"

+#include "../iommu-priv.h"
 #include "io_pagetable.h"
 #include "iommufd_private.h"

diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c
index a643d5c7c535f..df03411c87289 100644
--- a/drivers/iommu/iommufd/fault.c
+++ b/drivers/iommu/iommufd/fault.c
@@ -3,14 +3,14 @@
  */
 #define pr_fmt(fmt) "iommufd: " fmt

+#include <linux/anon_inodes.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/iommufd.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/iommufd.h>
 #include <linux/pci.h>
 #include <linux/poll.h>
-#include <linux/anon_inodes.h>
 #include <uapi/linux/iommufd.h>

 #include "../iommu-priv.h"

diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 05fd9d3abf1b8..bbbc8a044bcf7 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -8,17 +8,17 @@
  * The datastructure uses the iopt_pages to optimize the storage of the PFNs
  * between the domains and xarray.
  */
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/iommu.h>
 #include <linux/iommufd.h>
 #include <linux/lockdep.h>
-#include <linux/iommu.h>
 #include <linux/sched/mm.h>
-#include <linux/err.h>
 #include <linux/slab.h>
-#include <linux/errno.h>
 #include <uapi/linux/iommufd.h>

-#include "io_pagetable.h"
 #include "double_span.h"
+#include "io_pagetable.h"

 struct iopt_pages_list {
 	struct iopt_pages *pages;

diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 0ec3509b7e339..c61d74471684e 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -6,8 +6,8 @@
 #define __IO_PAGETABLE_H

 #include <linux/interval_tree.h>
-#include <linux/mutex.h>
 #include <linux/kref.h>
+#include <linux/mutex.h>
 #include <linux/xarray.h>

 #include "iommufd_private.h"

diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index 7422482765481..82428e44a837c 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -3,8 +3,8 @@
  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
  */
 #include <linux/interval_tree.h>
-#include <linux/iommufd.h>
 #include <linux/iommu.h>
+#include <linux/iommufd.h>
 #include <uapi/linux/iommufd.h>

 #include "io_pagetable.h"

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 4a23c675ddfda..3f4ba3181fe91 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -4,13 +4,14 @@
 #ifndef __IOMMUFD_PRIVATE_H
 #define __IOMMUFD_PRIVATE_H

-#include <linux/rwsem.h>
-#include <linux/xarray.h>
-#include <linux/refcount.h>
-#include <linux/uaccess.h>
 #include <linux/iommu.h>
 #include <linux/iova_bitmap.h>
+#include <linux/refcount.h>
+#include <linux/rwsem.h>
+#include <linux/uaccess.h>
+#include <linux/xarray.h>
 #include <uapi/linux/iommufd.h>
+
 #include "../iommu-priv.h"

 struct iommu_domain;

diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index acbbba1c66716..f4bc23a92f9a2 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -4,8 +4,8 @@
 #ifndef _UAPI_IOMMUFD_TEST_H
 #define _UAPI_IOMMUFD_TEST_H

-#include <linux/types.h>
 #include <linux/iommufd.h>
+#include <linux/types.h>

 enum {
 	IOMMU_TEST_OP_ADD_RESERVED = 1,

diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c
index b9e964b1ad5cc..d90b9e253412f 100644
--- a/drivers/iommu/iommufd/iova_bitmap.c
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -3,10 +3,10 @@
  * Copyright (c) 2022, Oracle and/or its affiliates.
  * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
  */
+#include <linux/highmem.h>
 #include <linux/iova_bitmap.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/highmem.h>

 #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)

diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 83bbd7c5d1608..b5f5d27ee9634 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -8,15 +8,15 @@
  */
 #define pr_fmt(fmt) "iommufd: " fmt

+#include <linux/bug.h>
 #include <linux/file.h>
 #include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/iommufd.h>
 #include <linux/miscdevice.h>
+#include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/bug.h>
+#include <linux/slab.h>
 #include <uapi/linux/iommufd.h>
-#include <linux/iommufd.h>

 #include "io_pagetable.h"
 #include "iommufd_private.h"

diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 117f644a0c5b7..93d806c9c0731 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -45,16 +45,16 @@
 * last_iova + 1 can overflow. An iopt_pages index will always be much less than
 * ULONG_MAX so last_index + 1 cannot overflow.
 */
+#include <linux/highmem.h>
+#include <linux/iommu.h>
+#include <linux/iommufd.h>
+#include <linux/kthread.h>
 #include <linux/overflow.h>
 #include <linux/slab.h>
-#include <linux/iommu.h>
 #include <linux/sched/mm.h>
-#include <linux/highmem.h>
-#include <linux/kthread.h>
-#include <linux/iommufd.h>

-#include "io_pagetable.h"
 #include "double_span.h"
+#include "io_pagetable.h"

 #ifndef CONFIG_IOMMUFD_TEST
 #define TEMP_MEMORY_LIMIT 65536

diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index f95e32e291333..b60687f57bef3 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -3,13 +3,13 @@
  *
  * Kernel side components to support tools/testing/selftests/iommu
  */
-#include <linux/slab.h>
-#include <linux/iommu.h>
-#include <linux/xarray.h>
-#include <linux/file.h>
 #include <linux/anon_inodes.h>
 #include <linux/fault-inject.h>
+#include <linux/file.h>
+#include <linux/iommu.h>
 #include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
 #include <uapi/linux/iommufd.h>

 #include "../iommu-priv.h"

diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index ffc3a949f8374..c2f2f6b9148e2 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -6,9 +6,9 @@
 #ifndef __LINUX_IOMMUFD_H
 #define __LINUX_IOMMUFD_H

-#include <linux/types.h>
-#include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/types.h>

 struct device;
 struct iommufd_device;

diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 4dde745cfb7e2..72010f71c5e47 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -4,8 +4,8 @@
 #ifndef _UAPI_IOMMUFD_H
 #define _UAPI_IOMMUFD_H

-#include <linux/types.h>
 #include <linux/ioctl.h>
+#include <linux/types.h>

 #define IOMMUFD_TYPE (';')
On Fri, Aug 02, 2024 at 05:32:02PM -0700, Nicolin Chen wrote:
Reorder include files to alphabetic order to simplify maintenance, and separate local headers and global headers with a blank line.
No functional change intended.
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>

 drivers/iommu/iommufd/device.c          |  4 ++--
 drivers/iommu/iommufd/fault.c           |  4 ++--
 drivers/iommu/iommufd/io_pagetable.c    |  8 ++++----
 drivers/iommu/iommufd/io_pagetable.h    |  2 +-
 drivers/iommu/iommufd/ioas.c            |  2 +-
 drivers/iommu/iommufd/iommufd_private.h |  9 +++++----
 drivers/iommu/iommufd/iommufd_test.h    |  2 +-
 drivers/iommu/iommufd/iova_bitmap.c     |  2 +-
 drivers/iommu/iommufd/main.c            |  8 ++++----
 drivers/iommu/iommufd/pages.c           | 10 +++++-----
 drivers/iommu/iommufd/selftest.c        |  8 ++++----
 include/linux/iommufd.h                 |  4 ++--
 include/uapi/linux/iommufd.h            |  2 +-
 13 files changed, 33 insertions(+), 32 deletions(-)
I picked this one up to iommufd for-next
When you get an answer on whether we can do something else for the MSI, let's reconsider that patch. Also, the 3rd one has a kbuild failure.
Thanks,
Jason
On Thu, Aug 15, 2024 at 02:51:40PM -0300, Jason Gunthorpe wrote:
On Fri, Aug 02, 2024 at 05:32:02PM -0700, Nicolin Chen wrote:
Reorder include files to alphabetic order to simplify maintenance, and separate local headers and global headers with a blank line.
No functional change intended.
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>

 drivers/iommu/iommufd/device.c          |  4 ++--
 drivers/iommu/iommufd/fault.c           |  4 ++--
 drivers/iommu/iommufd/io_pagetable.c    |  8 ++++----
 drivers/iommu/iommufd/io_pagetable.h    |  2 +-
 drivers/iommu/iommufd/ioas.c            |  2 +-
 drivers/iommu/iommufd/iommufd_private.h |  9 +++++----
 drivers/iommu/iommufd/iommufd_test.h    |  2 +-
 drivers/iommu/iommufd/iova_bitmap.c     |  2 +-
 drivers/iommu/iommufd/main.c            |  8 ++++----
 drivers/iommu/iommufd/pages.c           | 10 +++++-----
 drivers/iommu/iommufd/selftest.c        |  8 ++++----
 include/linux/iommufd.h                 |  4 ++--
 include/uapi/linux/iommufd.h            |  2 +-
 13 files changed, 33 insertions(+), 32 deletions(-)
I picked this one up to iommufd for-next
Thanks! Was hoping for that.
When you get an answer on whether we can do something else for the MSI, let's reconsider that patch. Also, the 3rd one has a kbuild failure.
Yea, let's hold the other patches in this series. I will submit another selftest patch, after we have an alternative solution.
Nicolin
From: Robin Murphy <robin.murphy@arm.com>
Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're using host-managed MSIs with an identity mapping at stage 1, where it is the underlying stage 2 domain which owns an MSI cookie and holds the corresponding dynamic mappings. Hook up the new op to resolve what we need from a nested domain.
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/dma-iommu.c | 18 ++++++++++++++++--
 include/linux/iommu.h     |  4 ++++
 2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7b1dfa0665df6..05e04934a5f81 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1799,6 +1799,20 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 	return NULL;
 }

+/*
+ * Nested domains may not have an MSI cookie or accept mappings, but they may
+ * be related to a domain which does, so we let them tell us what they need.
+ */
+static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	if (domain && domain->type == IOMMU_DOMAIN_NESTED &&
+	    domain->ops->get_msi_mapping_domain)
+		domain = domain->ops->get_msi_mapping_domain(domain);
+	return domain;
+}
+
 /**
  * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain
  * @desc: MSI descriptor, will store the MSI page
@@ -1809,7 +1823,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
 {
 	struct device *dev = msi_desc_to_dev(desc);
-	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev);
 	struct iommu_dma_msi_page *msi_page;
 	static DEFINE_MUTEX(msi_prepare_lock); /* see below */

@@ -1842,7 +1856,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr)
 void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
 {
 	struct device *dev = msi_desc_to_dev(desc);
-	const struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	const struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev);
 	const struct iommu_dma_msi_page *msi_page;

 	msi_page = msi_desc_get_iommu_cookie(desc);

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4d47f2c333118..69ed76f9c3ec4 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -638,6 +638,8 @@ struct iommu_ops {
 * @enable_nesting: Enable nesting
 * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*)
 * @free: Release the domain after use.
+ * @get_msi_mapping_domain: Return the related iommu_domain that should hold the
+ *                          MSI cookie and accept mapping(s).
 */
 struct iommu_domain_ops {
 	int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
@@ -668,6 +670,8 @@ struct iommu_domain_ops {
 					  unsigned long quirks);

 	void (*free)(struct iommu_domain *domain);
+	struct iommu_domain *
+		(*get_msi_mapping_domain)(struct iommu_domain *domain);
 };

 /**
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>
Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
using host-managed MSIs with an identity mapping at stage 1, where it is the underlying stage 2 domain which owns an MSI cookie and holds the corresponding dynamic mappings. Hook up the new op to resolve what we need from a nested domain.
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>

 drivers/iommu/dma-iommu.c | 18 ++++++++++++++++--
 include/linux/iommu.h     |  4 ++++
 2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7b1dfa0665df6..05e04934a5f81 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1799,6 +1799,20 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 	return NULL;
 }

+/*
+ * Nested domains may not have an MSI cookie or accept mappings, but they may
+ * be related to a domain which does, so we let them tell us what they need.
+ */
+static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	if (domain && domain->type == IOMMU_DOMAIN_NESTED &&
+	    domain->ops->get_msi_mapping_domain)
I'm not sure the core should restrict it to the NESTED type. Given there is a new domain ops any type restriction can be handled inside the callback. Anyway the driver should register the op for a domain only when there is a need.
On Tue, Aug 06, 2024 at 08:25:33AM +0000, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>
Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
I think it's describing the RMR solution that was settled on in Eric's VFIO series, before we had IOMMUFD at all.
So long as Robin won't mind (hopefully), I can rephrase it:
Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would still be the RMR solution, where we're using host-managed MSIs with an identity mapping at stage 1, where it is the underlying stage 2 domain which owns an MSI cookie and holds the corresponding dynamic mappings. Hook up the new op to resolve what we need from a nested domain.
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>

 drivers/iommu/dma-iommu.c | 18 ++++++++++++++++--
 include/linux/iommu.h     |  4 ++++
 2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7b1dfa0665df6..05e04934a5f81 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1799,6 +1799,20 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 	return NULL;
 }

+/*
+ * Nested domains may not have an MSI cookie or accept mappings, but they may
+ * be related to a domain which does, so we let them tell us what they need.
+ */
+static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	if (domain && domain->type == IOMMU_DOMAIN_NESTED &&
+	    domain->ops->get_msi_mapping_domain)
I'm not sure the core should restrict it to the NESTED type. Given there is a new domain ops any type restriction can be handled inside the callback. Anyway the driver should register the op for a domain only when there is a need.
I think we can do it either way, given that the use case is very particular to IOMMU_DOMAIN_NESTED. Otherwise, the driver doesn't need to be aware of the MSI mapping domain at all; that should just be taken care of by dma-iommu. If the domain pointer had a generic parent domain pointer, the get_msi_mapping_domain op could have been omitted too.
That being said, yea, likely we should check !!domain->ops at least.
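For instance, a minimal sketch of the guarded version -- the only change from the patch above is the extra domain->ops check:

	static struct iommu_domain *
	iommu_dma_get_msi_mapping_domain(struct device *dev)
	{
		struct iommu_domain *domain = iommu_get_domain_for_dev(dev);

		/* Also tolerate a nested domain whose ops were never filled in */
		if (domain && domain->type == IOMMU_DOMAIN_NESTED &&
		    domain->ops && domain->ops->get_msi_mapping_domain)
			domain = domain->ops->get_msi_mapping_domain(domain);
		return domain;
	}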
Thanks
Nicolin
On 06/08/2024 9:25 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>
Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
Because with proper nesting we ideally shouldn't need the host-managed MSI mess at all, which all stems from the old VFIO paradigm of completely abstracting interrupts from userspace. I'm still hoping IOMMUFD can grow its own interface for efficient MSI passthrough, where the VMM can simply map the physical MSI doorbell into whatever IPA (GPA) it wants it to appear at in the S2 domain, then whatever the guest does with S1 it can program the MSI address into the endpoint accordingly without us having to fiddle with it.
FWIW, apart from IOMMUFD, we may also end up wanting something along those lines for Arm CCA (if non-Secure proxying of T=1 MSIs becomes an unavoidable thing).
using host-managed MSIs with an identity mapping at stage 1, where it is the underlying stage 2 domain which owns an MSI cookie and holds the corresponding dynamic mappings. Hook up the new op to resolve what we need from a nested domain.
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>

 drivers/iommu/dma-iommu.c | 18 ++++++++++++++++--
 include/linux/iommu.h     |  4 ++++
 2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7b1dfa0665df6..05e04934a5f81 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1799,6 +1799,20 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 	return NULL;
 }

+/*
+ * Nested domains may not have an MSI cookie or accept mappings, but they may
+ * be related to a domain which does, so we let them tell us what they need.
+ */
+static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	if (domain && domain->type == IOMMU_DOMAIN_NESTED &&
+	    domain->ops->get_msi_mapping_domain)
I'm not sure the core should restrict it to the NESTED type. Given there is a new domain ops any type restriction can be handled inside the callback. Anyway the driver should register the op for a domain only when there is a need.
Nested should remain the only case where domains are chained together such that the VFIO MSI cookie may be hiding somewhere else. This is effectively documenting that implementing the callback for any other domain type would be a bad smell. Much like how the mapping-related ops are explicitly short-circuited for non-paging domain types.
Thanks, Robin.
On Thu, Aug 08, 2024 at 01:38:44PM +0100, Robin Murphy wrote:
On 06/08/2024 9:25 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>
Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
Because with proper nesting we ideally shouldn't need the host-managed MSI mess at all, which all stems from the old VFIO paradigm of completely abstracting interrupts from userspace. I'm still hoping IOMMUFD can grow its own interface for efficient MSI passthrough, where the VMM can simply map the physical MSI doorbell into whatever IPA (GPA) it wants it to appear at in the S2 domain, then whatever the guest does with S1 it can program the MSI address into the endpoint accordingly without us having to fiddle with it.
Hmm, until now I wasn't so convinced myself that it could work, as I was worried about the data. But on second thought, since the host configures the MSI, it can still set the correct data. All we need is to change the MSI address from an RMR-ed IPA/gIOVA to a real gIOVA of the vITS page.
I did a quick hack to test that loop. MSI in the guest still works fine without having the RMR node in its IORT. Sweet!
To go further on this path, we will need the following changes (a hypothetical sketch of the first one follows below):

- MSI configuration in the host (via a VFIO_IRQ_SET_ACTION_TRIGGER hypercall) should set the gIOVA instead of fetching from the msi_cookie. That hypercall doesn't forward an address currently, since the host kernel pre-sets the msi_cookie. So, we need a way to forward the gIOVA to the kernel and pack it into the msi_msg structure. I haven't read the VFIO PCI code thoroughly, yet I wonder if we could just let the guest program the gIOVA to the PCI register and fall it through to the hardware, so the host kernel handling that hypercall can just read it back from the register?

- IOMMUFD should provide the VMM a way to tell the gPA (or directly + GITS_TRANSLATER?). Then the kernel should do the stage-2 mapping. I have talked to Jason about this a while ago, and we have a few thoughts on how to implement it. But eventually, I think we still can't avoid a middle man like the msi_cookie to associate the gPA in IOMMUFD to the PA in the irqchip?
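For the first item, a rough sketch of what forwarding the gIOVA could look like -- the VFIO_IRQ_SET_DATA_MSI_IOVA flag and the payload struct below are hypothetical, not existing uAPI:

	#include <linux/types.h>

	/* Hypothetical: a new data type bit for struct vfio_irq_set.flags */
	#define VFIO_IRQ_SET_DATA_MSI_IOVA	(1 << 6)

	/*
	 * Hypothetical payload carried in vfio_irq_set.data[]: the eventfd
	 * trigger as with VFIO_IRQ_SET_DATA_EVENTFD, plus the guest-programmed
	 * MSI address (gIOVA) that the host would pack into msi_msg instead of
	 * the msi_cookie address.
	 */
	struct vfio_irq_set_msi_iova {
		__s32		eventfd;
		__aligned_u64	msi_iova;
	};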
One more concern is the MSI window size. VMM sets up a MSI region that must fit the hardware window size. Most of ITS versions have only one page size but one of them can have multiple pages? What if vITS is one-page size while the underlying pITS has multiple?
My understanding is that the current kernel-defined 1MB size is also a hard-coded window meant to potentially fit all cases, since the IOMMU code in the kernel can just eyeball what's going on in the irqchip subsystem and adjust accordingly if it someday needs to. But the VMM can't?
Thanks
Nicolin
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Friday, August 9, 2024 7:00 AM
On Thu, Aug 08, 2024 at 01:38:44PM +0100, Robin Murphy wrote:
On 06/08/2024 9:25 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>

Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
Because with proper nesting we ideally shouldn't need the host-managed MSI mess at all, which all stems from the old VFIO paradigm of completely abstracting interrupts from userspace. I'm still hoping IOMMUFD can grow its own interface for efficient MSI passthrough, where the VMM can simply map the physical MSI doorbell into whatever IPA (GPA) it wants it to appear at in the S2 domain, then whatever the guest does with S1 it can program the MSI address into the endpoint accordingly without us having to fiddle with it.
Hmm, until now I wasn't so convinced myself that it could work, as I was worried about the data. But on second thought, since the host configures the MSI, it can still set the correct data. All we need is to change the MSI address from an RMR-ed IPA/gIOVA to a real gIOVA of the vITS page.
I did a quick hack to test that loop. MSI in the guest still works fine without having the RMR node in its IORT. Sweet!
To go further on this path, we will need the following changes:
- MSI configuration in the host (via a VFIO_IRQ_SET_ACTION_TRIGGER hypercall) should set gIOVA instead of fetching from msi_cookie. That hypercall doesn't forward an address currently, since host kernel pre-sets the msi_cookie. So, we need a way to forward the gIOVA to kernel and pack it into the msi_msg structure. I haven't read the VFIO PCI code thoroughly, yet wonder if we could just let the guest program the gIOVA to the PCI register and fall it through to the hardware, so host kernel handling that hypercall can just read it back from the register?
- IOMMUFD should provide VMM a way to tell the gPA (or directly + GITS_TRANSLATER?). Then kernel should do the stage-2 mapping. I have talked to Jason about this a while ago, and we have a few thoughts how to implement it. But eventually, I think we still can't avoid a middle man like msi_cookie to associate the gPA in IOMMUFD to PA in irqchip?
Probably a new IOMMU_DMA_MSI_COOKIE_USER type which uses the GPA (passed in via ALLOC_HWPT for a nested_parent type) as the IOVA in iommu_dma_get_msi_page()?
One more concern is the MSI window size. VMM sets up a MSI region that must fit the hardware window size. Most of ITS versions have only one page size but one of them can have multiple pages? What if vITS is one-page size while the underlying pITS has multiple?
My understanding is that the current kernel-defined 1MB size is also a hard-coded window meant to potentially fit all cases, since the IOMMU code in the kernel can just eyeball what's going on in the irqchip subsystem and adjust accordingly if it someday needs to. But the VMM can't?
Thanks
Nicolin
On 2024-08-09 9:00 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Friday, August 9, 2024 7:00 AM
On Thu, Aug 08, 2024 at 01:38:44PM +0100, Robin Murphy wrote:
On 06/08/2024 9:25 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>

Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
Because with proper nesting we ideally shouldn't need the host-managed MSI mess at all, which all stems from the old VFIO paradigm of completely abstracting interrupts from userspace. I'm still hoping IOMMUFD can grow its own interface for efficient MSI passthrough, where the VMM can simply map the physical MSI doorbell into whatever IPA (GPA) it wants it to appear at in the S2 domain, then whatever the guest does with S1 it can program the MSI address into the endpoint accordingly without us having to fiddle with it.
Hmm, until now I wasn't so convinced myself that it could work, as I was worried about the data. But on second thought, since the host configures the MSI, it can still set the correct data. All we need is to change the MSI address from an RMR-ed IPA/gIOVA to a real gIOVA of the vITS page.
I did a quick hack to test that loop. MSI in the guest still works fine without having the RMR node in its IORT. Sweet!
To go further on this path, we will need the following changes:
- MSI configuration in the host (via a VFIO_IRQ_SET_ACTION_TRIGGER hypercall) should set gIOVA instead of fetching from msi_cookie. That hypercall doesn't forward an address currently, since host kernel pre-sets the msi_cookie. So, we need a way to forward the gIOVA to kernel and pack it into the msi_msg structure. I haven't read the VFIO PCI code thoroughly, yet wonder if we could just let the guest program the gIOVA to the PCI register and fall it through to the hardware, so host kernel handling that hypercall can just read it back from the register?
- IOMMUFD should provide VMM a way to tell the gPA (or directly + GITS_TRANSLATER?). Then kernel should do the stage-2 mapping. I have talked to Jason about this a while ago, and we have a few thoughts how to implement it. But eventually, I think we still can't avoid a middle man like msi_cookie to associate the gPA in IOMMUFD to PA in irqchip?
Probably a new IOMMU_DMA_MSI_COOKIE_USER type which uses the GPA (passed in via ALLOC_HWPT for a nested_parent type) as the IOVA in iommu_dma_get_msi_page()?
No, the whole point is to get away from cookies and having to keep track of things in the kernel that can and should just be simple regular user-owned S2 mappings.
One more concern is the MSI window size. VMM sets up a MSI region that must fit the hardware window size. Most of ITS versions have only one page size but one of them can have multiple pages? What if vITS is one-page size while the underlying pITS has multiple?
My understanding is that the current kernel-defined 1MB size is also a hard-coded window meant to potentially fit all cases, since the IOMMU code in the kernel can just eyeball what's going on in the irqchip subsystem and adjust accordingly if it someday needs to. But the VMM can't?
The existing design is based around the kernel potentially having to stuff multiple different mappings for different devices into the MSI hole in a single domain, since VFIO userspace is allowed to do wacky things like emulate INTx using an underlying physical MSI, so there may not be any actual vITS region in the VM IPA space at all. I think that was also why it ended up being a fake reserved region exposed by the SMMU drivers rather than relying on userspace to say where to put it - making things look superficially a bit more x86-like meant fewer changes to userspace, which I think by now we can consider a tasty slice of technical debt.
For a dedicated "MSI passthrough" model where, in parallel to IOMMU nesting, the abstraction is thinner and userspace is in on the game of knowingly emulating a GIC ITS backed by a GIC ITS, I'd imagine it could be pretty straightforward, at least conceptually. Userspace has something like an IOAS_MAP_MSI(device, IOVA) to indicate where it's placing a vITS to which it wants that device's MSIs to be able to go, the kernel resolves the PA from the IRQ layer and maps it, job done. If userspace wants to associate two devices with the same vITS when they have different physical ITSes, either we split the IOAS into two HWPTs to hold the different mappings, or we punt it back to userspace to resolve at the IOAS level.
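For illustration only, the shape of such an interface might be something like this (all names and fields are hypothetical):

	#include <linux/types.h>

	/*
	 * Hypothetical IOAS_MAP_MSI ioctl payload: "map this device's MSI
	 * doorbell at this IOVA". The kernel would resolve the doorbell PA
	 * from the IRQ layer and install the S2 mapping itself.
	 */
	struct iommu_ioas_map_msi {
		__u32		size;
		__u32		flags;
		__u32		ioas_id;
		__u32		dev_id;		/* device whose MSI doorbell to map */
		__aligned_u64	iova;		/* where userspace placed the vITS */
	};

A second device whose physical ITS differs would then need its own HWPT or IOAS, per the split described above.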
Or I guess the really cheeky version is the IRQ layer exposes its own thing for userspace to mmap the ITS, then it can call a literal IOAS_MAP on that mapping... :D
Thanks, Robin.
On Fri, Aug 09, 2024 at 06:43:47PM +0100, Robin Murphy wrote:
On 2024-08-09 9:00 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Friday, August 9, 2024 7:00 AM
On Thu, Aug 08, 2024 at 01:38:44PM +0100, Robin Murphy wrote:
On 06/08/2024 9:25 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>

Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
Because with proper nesting we ideally shouldn't need the host-managed MSI mess at all, which all stems from the old VFIO paradigm of completely abstracting interrupts from userspace. I'm still hoping IOMMUFD can grow its own interface for efficient MSI passthrough, where the VMM can simply map the physical MSI doorbell into whatever IPA (GPA) it wants it to appear at in the S2 domain, then whatever the guest does with S1 it can program the MSI address into the endpoint accordingly without us having to fiddle with it.
Hmm, until now I wasn't so convinced myself that it could work, as I was worried about the data. But on second thought, since the host configures the MSI, it can still set the correct data. All we need is to change the MSI address from an RMR-ed IPA/gIOVA to a real gIOVA of the vITS page.
I did a quick hack to test that loop. MSI in the guest still works fine without having the RMR node in its IORT. Sweet!
To go further on this path, we will need the following changes:
- MSI configuration in the host (via a VFIO_IRQ_SET_ACTION_TRIGGER hypercall) should set gIOVA instead of fetching from msi_cookie. That hypercall doesn't forward an address currently, since host kernel pre-sets the msi_cookie. So, we need a way to forward the gIOVA to kernel and pack it into the msi_msg structure. I haven't read the VFIO PCI code thoroughly, yet wonder if we could just let the guest program the gIOVA to the PCI register and fall it through to the hardware, so host kernel handling that hypercall can just read it back from the register?
- IOMMUFD should provide VMM a way to tell the gPA (or directly + GITS_TRANSLATER?). Then kernel should do the stage-2 mapping. I have talked to Jason about this a while ago, and we have a few thoughts how to implement it. But eventually, I think we still can't avoid a middle man like msi_cookie to associate the gPA in IOMMUFD to PA in irqchip?
Probably a new IOMMU_DMA_MSI_COOKIE_USER type which uses the GPA (passed in via ALLOC_HWPT for a nested_parent type) as the IOVA in iommu_dma_get_msi_page()?
No, the whole point is to get away from cookies and having to keep track of things in the kernel that can and should just be simple regular user-owned S2 mappings.
I see. That'd be ideal.
One more concern is the MSI window size. VMM sets up a MSI region that must fit the hardware window size. Most of ITS versions have only one page size but one of them can have multiple pages? What if vITS is one-page size while the underlying pITS has multiple?
My understanding is that the current kernel-defined 1MB size is also a hard-coded window meant to potentially fit all cases, since the IOMMU code in the kernel can just eyeball what's going on in the irqchip subsystem and adjust accordingly if it someday needs to. But the VMM can't?
The existing design is based around the kernel potentially having to stuff multiple different mappings for different devices into the MSI hole in a single domain, since VFIO userspace is allowed to do wacky things like emulate INTx using an underlying physical MSI, so there may not be any actual vITS region in the VM IPA space at all. I think that was also why it ended up being a fake reserved region exposed by the SMMU drivers rather than relying on userspace to say where to put it - making things look superficially a bit more x86-like meant fewer changes to userspace, which I think by now we can consider a tasty slice of technical debt.
Just found that intel-iommu's MSI window has the same 1MB size. Everything makes sense now!
For a dedicated "MSI passthrough" model where, in parallel to IOMMU nesting, the abstraction is thinner and userspace is in on the game of knowingly emulating a GIC ITS backed by a GIC ITS, I'd imagine it could be pretty straightforward, at least conceptually. Userspace has something like an IOAS_MAP_MSI(device, IOVA) to indicate where it's placing a vITS to which it wants that device's MSIs to be able to go, the kernel resolves the PA from the IRQ layer and maps it, job done. If
Will try drafting something.
userspace wants to associate two devices with the same vITS when they have different physical ITSes, either we split the IOAS into two HWPTs to hold the different mappings, or we punt it back to userspace to resolve at the IOAS level.
I see. That sounds like a good solution: EEXIST for that case, so the VMM will need to allocate a different IOAS/HWPT for that device to attach.
Or I guess the really cheeky version is the IRQ layer exposes its own thing for userspace to mmap the ITS, then it can call a literal IOAS_MAP on that mapping... :D
I recall that one benefit of IOMMU_RESV_SW_MSI is exposing the MSI region via sysfs. Would it be safe to expose it in the same way? The reserved region itself is per-device, which feels like quite a good fit...
Thanks
Nicolin
On Fri, Aug 09, 2024 at 08:00:34AM +0000, Tian, Kevin wrote:
- IOMMUFD should provide VMM a way to tell the gPA (or directly + GITS_TRANSLATER?). Then kernel should do the stage-2 mapping. I have talked to Jason about this a while ago, and we have a few thoughts how to implement it. But eventually, I think we still can't avoid a middle man like msi_cookie to associate the gPA in IOMMUFD to PA in irqchip?
Probably a new IOMMU_DMA_MSI_COOKIE_USER type which uses the GPA (passed in via ALLOC_HWPT for a nested_parent type) as the IOVA in iommu_dma_get_msi_page()?
To get the ITS page into the iommufd I suspect we'd add a new iommufd ioctl:
struct iommu_ioas_map_msi_window {
	__u32 size;
	__u32 flags;
	__u32 ioas_id;
	__u32 __reserved;
	__s32 kvm_device_fd;
	__u32 kvm_device_type; // == KVM_DEV_TYPE_ARM_VGIC_ITS for safety
	__aligned_u64 device_args; // ??
	__aligned_u64 length;
	__aligned_u64 iova;
};
Where kvm_device_fd would be the KVM_DEV_TYPE_ARM_VGIC_ITS (?) device (or the RISCV version).
This would let us get the ITS physical address from the GIC driver and put it into the S2 at IOVA while relying on KVM to authorize and locate the correct PA for whatever is going on here.
Jason
From: Robin Murphy <robin.murphy@arm.com>
Sent: Thursday, August 8, 2024 8:39 PM
On 06/08/2024 9:25 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>
Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
Because with proper nesting we ideally shouldn't need the host-managed MSI mess at all, which all stems from the old VFIO paradigm of completely abstracting interrupts from userspace. I'm still hoping
yes, this was the consensus from the previous discussion iirc. It's just a bit distracting to call it the old VFIO paradigm. 😊
IOMMUFD can grow its own interface for efficient MSI passthrough, where the VMM can simply map the physical MSI doorbell into whatever IPA (GPA) it wants it to appear at in the S2 domain, then whatever the guest does with S1 it can program the MSI address into the endpoint accordingly without us having to fiddle with it.
FWIW, apart from IOMMUFD, we may also end up wanting something along those lines for Arm CCA (if non-Secure proxying of T=1 MSIs becomes an unavoidable thing).
using host-managed MSIs with an identity mapping at stage 1, where it is the underlying stage 2 domain which owns an MSI cookie and holds the corresponding dynamic mappings. Hook up the new op to resolve what we need from a nested domain.
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>

 drivers/iommu/dma-iommu.c | 18 ++++++++++++++++--
 include/linux/iommu.h     |  4 ++++
 2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7b1dfa0665df6..05e04934a5f81 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1799,6 +1799,20 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 	return NULL;
 }

+/*
+ * Nested domains may not have an MSI cookie or accept mappings, but they may
+ * be related to a domain which does, so we let them tell us what they need.
+ */
+static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+
+	if (domain && domain->type == IOMMU_DOMAIN_NESTED &&
+	    domain->ops->get_msi_mapping_domain)
I'm not sure the core should restrict it to the NESTED type. Given there is a new domain ops any type restriction can be handled inside the callback. Anyway the driver should register the op for a domain only when there is a need.
Nested should remain the only case where domains are chained together such that the VFIO MSI cookie may be hiding somewhere else. This is effectively documenting that implementing the callback for any other domain type would be a bad smell. Much like how the mapping-related ops are explicitly short-circuited for non-paging domain types.
Then it's probably good to clarify that in the comment below for @get_msi_mapping_domain, so driver developers can catch that restriction easily w/o searching the related code.
On Thu, Aug 08, 2024 at 01:38:44PM +0100, Robin Murphy wrote:
On 06/08/2024 9:25 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>
Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
Because with proper nesting we ideally shouldn't need the host-managed MSI mess at all, which all stems from the old VFIO paradigm of completely abstracting interrupts from userspace. I'm still hoping IOMMUFD can grow its own interface for efficient MSI passthrough, where the VMM can simply map the physical MSI doorbell into whatever IPA (GPA) it wants it to appear at in the S2 domain, then whatever the guest does with S1 it can program the MSI address into the endpoint accordingly without us having to fiddle with it.
+1
I don't have a staged plan to do this though. Getting the ITS page into the S2 at a user specified address should be simple enough to manage.
The bigger issue is that we still have the hypervisor GIC driver controlling things and it will need to know to use the guest provided MSI address captured during the MSI trap, not its own address. I don't have an idea how to connect those two parts yet.
Jason
On Fri, Aug 09, 2024 at 03:41:36PM -0300, Jason Gunthorpe wrote:
On Thu, Aug 08, 2024 at 01:38:44PM +0100, Robin Murphy wrote:
On 06/08/2024 9:25 am, Tian, Kevin wrote:
From: Nicolin Chen <nicolinc@nvidia.com>
Sent: Saturday, August 3, 2024 8:32 AM

From: Robin Murphy <robin.murphy@arm.com>
Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be in the legacy-VFIO-style case where we're
why is it a legacy-VFIO-style? We only support nested in IOMMUFD.
Because with proper nesting we ideally shouldn't need the host-managed MSI mess at all, which all stems from the old VFIO paradigm of completely abstracting interrupts from userspace. I'm still hoping IOMMUFD can grow its own interface for efficient MSI passthrough, where the VMM can simply map the physical MSI doorbell into whatever IPA (GPA) it wants it to appear at in the S2 domain, then whatever the guest does with S1 it can program the MSI address into the endpoint accordingly without us having to fiddle with it.
+1
I don't have a staged plan to do this though. Getting the ITS page into the S2 at a user specified address should be simple enough to manage.
The bigger issue is that we still have the hypervisor GIC driver controlling things and it will need to know to use the guest provided MSI address captured during the MSI trap, not its own address. I don't have an idea how to connect those two parts yet.
You mean the gPA of the vITS vs. the PA of the ITS, right? I think that's because only the VMM knows the virtual IRQ number to insert? We don't seem to have a choice there unless we want to poke a hole in the vGIC design..

With that, it already feels like quite a big improvement to get rid of the entire shadow MSI mapping, including the msi_cookie and RMR..
Thanks
Nicolin
On Fri, Aug 09, 2024 at 12:18:42PM -0700, Nicolin Chen wrote:
The bigger issue is that we still have the hypervisor GIC driver controlling things and it will need to know to use the guest provided MSI address captured during the MSI trap, not its own address. I don't have an idea how to connect those two parts yet.
You mean the gPA of the vITS v.s. PA of the ITS, right? I think that's because only VMM knows the virtual IRQ number to insert? We don't seem to have a choice for that unless we want to poke a hole to the vGIC design..
I mean what you explained in your other email:
- MSI configuration in the host (via a VFIO_IRQ_SET_ACTION_TRIGGER hypercall) should set gIOVA instead of fetching from msi_cookie. That hypercall doesn't forward an address currently, since host kernel pre-sets the msi_cookie. So, we need a way to forward the gIOVA to kernel and pack it into the msi_msg structure. I haven't read the VFIO PCI code thoroughly, yet wonder if we could just let the guest program the gIOVA to the PCI register and fall it through to the hardware, so host kernel handling that hypercall can just read it back from the register?
We still need to convey the MSI-X address from the register trap into the kernel and use the VM-supplied address instead of calling iommu_dma_compose_msi_msg().
When you did your test you may have lucked out that the guest was putting the ITS at the same place the host kernel expected because they are both running the same software and making the same decision :)
Maybe take a look at what pushing the address down through the VFIO_IRQ_SET_ACTION_TRIGGER would look like?
Jason
On Fri, Aug 09, 2024 at 07:49:10PM -0300, Jason Gunthorpe wrote:
On Fri, Aug 09, 2024 at 12:18:42PM -0700, Nicolin Chen wrote:
The bigger issue is that we still have the hypervisor GIC driver controlling things and it will need to know to use the guest provided MSI address captured during the MSI trap, not its own address. I don't have an idea how to connect those two parts yet.
You mean the gPA of the vITS v.s. PA of the ITS, right? I think that's because only VMM knows the virtual IRQ number to insert? We don't seem to have a choice for that unless we want to poke a hole to the vGIC design..
I mean what you explained in your other email:
- MSI configuration in the host (via a VFIO_IRQ_SET_ACTION_TRIGGER hypercall) should set gIOVA instead of fetching from msi_cookie. That hypercall doesn't forward an address currently, since host kernel pre-sets the msi_cookie. So, we need a way to forward the gIOVA to kernel and pack it into the msi_msg structure. I haven't read the VFIO PCI code thoroughly, yet wonder if we could just let the guest program the gIOVA to the PCI register and fall it through to the hardware, so host kernel handling that hypercall can just read it back from the register?
We still need to convey the MSI-X address from the register trap into the kernel and use the VM-supplied address instead of calling iommu_dma_compose_msi_msg().
Yes.
When you did your test you may have lucked out that the guest was putting the ITS at the same place the host kernel expected because they are both running the same software and making the same decision :)
Oh, the devil's in the details: I hard-coded every address in the vITS's 2-stage mapping lol
Maybe take a look at what pushing the address down through the VFIO_IRQ_SET_ACTION_TRIGGER would look like?
Yea, there's a data field and we can add a new flag to define the format/type. Will take a closer look.
Thanks
Nicolin
Define an IOMMU_RESV_SW_MSI region (within the mock aperture) as the ARM SMMU drivers do.
Implement the get_msi_mapping_domain nested domain op to allow dma-iommu to find the correct paging domain.
Add a new IOMMU_TEST_OP_MD_CHECK_SW_MSI to loopback test the MSI mapping using public dma-iommu and iommu helpers.
Note that iommu_fwspec_init is required for iommu_dma_get_resv_regions().
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
 drivers/iommu/iommufd/iommufd_test.h          |  4 +
 drivers/iommu/iommufd/selftest.c              | 84 +++++++++++++++++++
 tools/testing/selftests/iommu/iommufd_utils.h |  9 ++
 3 files changed, 97 insertions(+)
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index f4bc23a92f9a2..0bb30275a92f7 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -23,6 +23,7 @@ enum {
 	IOMMU_TEST_OP_DIRTY,
 	IOMMU_TEST_OP_MD_CHECK_IOTLB,
 	IOMMU_TEST_OP_TRIGGER_IOPF,
+	IOMMU_TEST_OP_MD_CHECK_SW_MSI,
 };

 enum {

@@ -135,6 +136,9 @@ struct iommu_test_cmd {
 			__u32 perm;
 			__u64 addr;
 		} trigger_iopf;
+		struct {
+			__u32 stdev_id;
+		} check_sw_msi;
 	};
 	__u32 last;
 };

diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index b60687f57bef3..642ae135ada98 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -7,11 +7,13 @@
 #include <linux/fault-inject.h>
 #include <linux/file.h>
 #include <linux/iommu.h>
+#include <linux/msi.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/xarray.h>
 #include <uapi/linux/iommufd.h>

+#include "../dma-iommu.h"
 #include "../iommu-priv.h"
 #include "io_pagetable.h"
 #include "iommufd_private.h"

@@ -539,6 +541,24 @@ static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features fea
 	return 0;
 }

+#define MSI_IOVA_BASE 0x80000000
+#define MSI_IOVA_LENGTH 0x100000
+
+static void mock_dev_get_resv_regions(struct device *dev,
+				      struct list_head *head)
+{
+	int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+	struct iommu_resv_region *region;
+
+	region = iommu_alloc_resv_region(MSI_IOVA_BASE, MSI_IOVA_LENGTH,
+					 prot, IOMMU_RESV_SW_MSI, GFP_KERNEL);
+	if (!region)
+		return;
+
+	list_add_tail(&region->list, head);
+	iommu_dma_get_resv_regions(dev, head);
+}
+
 static const struct iommu_ops mock_ops = {
 	/*
 	 * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type()

@@ -557,6 +577,7 @@ static const struct iommu_ops mock_ops = {
 	.page_response = mock_domain_page_response,
 	.dev_enable_feat = mock_dev_enable_feat,
 	.dev_disable_feat = mock_dev_disable_feat,
+	.get_resv_regions = mock_dev_get_resv_regions,
 	.user_pasid_table = true,
 	.default_domain_ops = &(struct iommu_domain_ops){

@@ -625,10 +646,20 @@ mock_domain_cache_invalidate_user(struct iommu_domain *domain,
 	return rc;
 }

+static struct iommu_domain *
+mock_domain_get_msi_mapping_domain(struct iommu_domain *domain)
+{
+	struct mock_iommu_domain_nested *mock_nested =
+		container_of(domain, struct mock_iommu_domain_nested, domain);
+
+	return &mock_nested->parent->domain;
+}
+
 static struct iommu_domain_ops domain_nested_ops = {
 	.free = mock_domain_free_nested,
 	.attach_dev = mock_domain_nop_attach,
 	.cache_invalidate_user = mock_domain_cache_invalidate_user,
+	.get_msi_mapping_domain = mock_domain_get_msi_mapping_domain,
 };

 static inline struct iommufd_hw_pagetable *

@@ -701,6 +732,7 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags)
 		return ERR_PTR(-ENOMEM);

 	device_initialize(&mdev->dev);
+	iommu_fwspec_init(&mdev->dev, NULL);
 	mdev->flags = dev_flags;
 	mdev->dev.release = mock_dev_release;
 	mdev->dev.bus = &iommufd_mock_bus_type.bus;

@@ -960,6 +992,55 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd,
 	return rc;
 }

+#define MOCK_MSI_PAGE 0xbeef0000
+static int iommufd_test_md_check_sw_msi(struct iommufd_ucmd *ucmd,
+					u32 mockpt_id, u32 device_id)
+{
+	struct mock_iommu_domain_nested *mock_nested;
+	struct iommufd_hw_pagetable *hwpt;
+	struct iommufd_object *dev_obj;
+	struct mock_iommu_domain *mock;
+	struct selftest_obj *sobj;
+	struct msi_desc desc = {};
+	int rc = 0;
+
+	hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
+	if (IS_ERR(hwpt)) {
+		/* Try nested domain */
+		hwpt = get_md_pagetable_nested(ucmd, mockpt_id, &mock_nested);
+		if (IS_ERR(hwpt))
+			return PTR_ERR(hwpt);
+		mock = mock_nested->parent;
+	}
+
+	dev_obj =
+		iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST);
+	if (IS_ERR(dev_obj)) {
+		rc = PTR_ERR(dev_obj);
+		goto out_put_hwpt;
+	}
+
+	sobj = container_of(dev_obj, struct selftest_obj, obj);
+	if (sobj->type != TYPE_IDEV) {
+		rc = -EINVAL;
+		goto out_dev_obj;
+	}
+
+	desc.dev = sobj->idev.idev->dev;
+	rc = iommu_dma_prepare_msi(&desc, MOCK_MSI_PAGE);
+	if (rc)
+		goto out_dev_obj;
+
+	if (iommu_iova_to_phys(&mock->domain, MSI_IOVA_BASE) != MOCK_MSI_PAGE)
+		rc = -EINVAL;
+
+out_dev_obj:
+	iommufd_put_object(ucmd->ictx, dev_obj);
+out_put_hwpt:
+	iommufd_put_object(ucmd->ictx, &hwpt->obj);
+	return rc;
+}
+
 struct selftest_access {
 	struct iommufd_access *access;
 	struct file *file;

@@ -1470,6 +1551,9 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
 		return iommufd_test_md_check_iotlb(ucmd, cmd->id,
 						   cmd->check_iotlb.id,
 						   cmd->check_iotlb.iotlb);
+	case IOMMU_TEST_OP_MD_CHECK_SW_MSI:
+		return iommufd_test_md_check_sw_msi(ucmd, cmd->id,
+						    cmd->check_sw_msi.stdev_id);
 	case IOMMU_TEST_OP_CREATE_ACCESS:
 		return iommufd_test_create_access(ucmd, cmd->id,
 						  cmd->create_access.flags);

diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 40f6f14ce136f..4c76689e804f4 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -130,6 +130,13 @@ static int _test_cmd_mock_domain_flags(int fd, unsigned int ioas_id,
 static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id,
 					 __u32 *hwpt_id)
 {
+	struct iommu_test_cmd sw_msi = {
+		.size = sizeof(sw_msi),
+		.op = IOMMU_TEST_OP_MD_CHECK_SW_MSI,
+		.check_sw_msi = {
+			.stdev_id = stdev_id,
+		},
+	};
 	struct iommu_test_cmd cmd = {
 		.size = sizeof(cmd),
 		.op = IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE,

@@ -145,6 +152,8 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id,
 		return ret;
 	if (hwpt_id)
 		*hwpt_id = cmd.mock_domain_replace.pt_id;
+	sw_msi.id = cmd.mock_domain_replace.pt_id;
+	assert(0 == ioctl(fd, IOMMU_TEST_CMD, &sw_msi));
 	return 0;
 }
Hi Nicolin,
kernel test robot noticed the following build errors:
[auto build test ERROR on shuah-kselftest/next]
[also build test ERROR on shuah-kselftest/fixes linus/master v6.11-rc2 next-20240809]
[cannot apply to joro-iommu/next]
[If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information]
url:    https://github.com/intel-lab-lkp/linux/commits/Nicolin-Chen/iommufd-Reorder-...
base:   https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next
patch link:    https://lore.kernel.org/r/544ab894a301c83eb9f9d7a6326f4cb87f517019.172264486...
patch subject: [PATCH v2 3/3] iommufd/selftest: Add coverage for reserved IOVAs
config: x86_64-rhel-8.3-kselftests (https://download.01.org/0day-ci/archive/20240809/202408092301.M51NPXgL-lkp@i...)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240809/202408092301.M51NPXgL-lkp@i...)
If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408092301.M51NPXgL-lkp@intel.com/
All errors (new ones prefixed by >>, old ones prefixed by <<):
WARNING: modpost: missing MODULE_DESCRIPTION() in kernel/locking/test-ww_mutex.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/usb/serial/usb_debug.o
ERROR: modpost: "iommu_dma_prepare_msi" [drivers/iommu/iommufd/iommufd.ko] undefined!