The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()) while the IOMMU's internal caches retain stale entries, a use-after-free (UAF) condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
Cc: stable@vger.kernel.org
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 arch/x86/mm/tlb.c         |  2 ++
 drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
 include/linux/iommu.h     |  4 ++++
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 39f80111e6f1..a41499dfdc3f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
 #include <linux/task_work.h>
 #include <linux/mmu_notifier.h>
 #include <linux/mmu_context.h>
+#include <linux/iommu.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
         kernel_tlb_flush_range(info);
 
         put_flush_tlb_info();
+        iommu_sva_invalidate_kva_range(start, end);
 }
 
 /*
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 1a51cfd82808..154384eab8a3 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -10,6 +10,8 @@
 #include "iommu-priv.h"
 
 static DEFINE_MUTEX(iommu_sva_lock);
+static DEFINE_STATIC_KEY_FALSE(iommu_sva_present);
+static LIST_HEAD(iommu_sva_mms);
 static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
                                                    struct mm_struct *mm);
 
@@ -42,6 +44,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
                 return ERR_PTR(-ENOSPC);
         }
         iommu_mm->pasid = pasid;
+        iommu_mm->mm = mm;
         INIT_LIST_HEAD(&iommu_mm->sva_domains);
         /*
          * Make sure the write to mm->iommu_mm is not reordered in front of
@@ -132,8 +135,13 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
         if (ret)
                 goto out_free_domain;
         domain->users = 1;
-        list_add(&domain->next, &mm->iommu_mm->sva_domains);
+        if (list_empty(&iommu_mm->sva_domains)) {
+                if (list_empty(&iommu_sva_mms))
+                        static_branch_enable(&iommu_sva_present);
+                list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
+        }
+        list_add(&domain->next, &iommu_mm->sva_domains);
 
 out:
         refcount_set(&handle->users, 1);
         mutex_unlock(&iommu_sva_lock);
@@ -175,6 +183,13 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
                 list_del(&domain->next);
                 iommu_domain_free(domain);
         }
+
+        if (list_empty(&iommu_mm->sva_domains)) {
+                list_del(&iommu_mm->mm_list_elm);
+                if (list_empty(&iommu_sva_mms))
+                        static_branch_disable(&iommu_sva_present);
+        }
+
         mutex_unlock(&iommu_sva_lock);
         kfree(handle);
 }
@@ -312,3 +327,18 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
 
         return domain;
 }
+
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+        struct iommu_mm_data *iommu_mm;
+
+        might_sleep();
+
+        if (!static_branch_unlikely(&iommu_sva_present))
+                return;
+
+        guard(mutex)(&iommu_sva_lock);
+        list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
+                mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 156732807994..31330c12b8ee 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1090,7 +1090,9 @@ struct iommu_sva {
 
 struct iommu_mm_data {
         u32 pasid;
+        struct mm_struct *mm;
         struct list_head sva_domains;
+        struct list_head mm_list_elm;
 };
 
 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
@@ -1571,6 +1573,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
                                         struct mm_struct *mm);
 void iommu_sva_unbind_device(struct iommu_sva *handle);
 u32 iommu_sva_get_pasid(struct iommu_sva *handle);
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
 #else
 static inline struct iommu_sva *
 iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
@@ -1595,6 +1598,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
 }
 
 static inline void mm_pasid_drop(struct mm_struct *mm) {}
+static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
 #endif /* CONFIG_IOMMU_SVA */
 
 #ifdef CONFIG_IOMMU_IOPF
On Fri, Jul 04, 2025 at 09:30:56PM +0800, Lu Baolu wrote:
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()) while the IOMMU's internal caches retain stale entries, a use-after-free (UAF) condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
Cc: stable@vger.kernel.org
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>

 arch/x86/mm/tlb.c         |  2 ++
 drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
 include/linux/iommu.h     |  4 ++++
 3 files changed, 37 insertions(+), 1 deletion(-)
Reported-by: Jann Horn jannh@google.com
@@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
         kernel_tlb_flush_range(info);
 
         put_flush_tlb_info();
+        iommu_sva_invalidate_kva_range(start, end);
 }
This is much less call sites than I guessed!
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+        struct iommu_mm_data *iommu_mm;
+
+        might_sleep();
+
+        if (!static_branch_unlikely(&iommu_sva_present))
+                return;
+
+        guard(mutex)(&iommu_sva_lock);
+        list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
+                mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
I don't think it needs to be exported if only arch code is calling it?
Looks Ok to me:
Reviewed-by: Jason Gunthorpe jgg@nvidia.com
Jason
On 7/4/2025 9:38 PM, Jason Gunthorpe wrote:
On Fri, Jul 04, 2025 at 09:30:56PM +0800, Lu Baolu wrote:
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()) while the IOMMU's internal caches retain stale entries, a use-after-free (UAF) condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
Cc: stable@vger.kernel.org
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>

 arch/x86/mm/tlb.c         |  2 ++
 drivers/iommu/iommu-sva.c | 32 +++++++++++++++++++++++++++++++-
 include/linux/iommu.h     |  4 ++++
 3 files changed, 37 insertions(+), 1 deletion(-)

Reported-by: Jann Horn <jannh@google.com>
@@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
         kernel_tlb_flush_range(info);
 
         put_flush_tlb_info();
+        iommu_sva_invalidate_kva_range(start, end);
 }
This is much less call sites than I guessed!
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+        struct iommu_mm_data *iommu_mm;
+
+        might_sleep();
+
+        if (!static_branch_unlikely(&iommu_sva_present))
+                return;
+
+        guard(mutex)(&iommu_sva_lock);
+        list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
+                mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
+EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
I don't think it needs to be exported if only arch code is calling it?
Yes. Done.
Looks Ok to me:
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Thanks, baolu
On 7/4/2025 7:00 PM, Lu Baolu wrote:
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()) while the IOMMU's internal caches retain stale entries, a use-after-free (UAF) condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
Cc: stable@vger.kernel.org
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Thanks for getting this patch. Looks good to me.
Reviewed-by: Vasant Hegde vasant.hegde@amd.com
-Vasant
On 7/4/25 21:30, Lu Baolu wrote:
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()) while the IOMMU's internal caches retain stale entries, a use-after-free (UAF) condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
Cc: stable@vger.kernel.org
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+        struct iommu_mm_data *iommu_mm;
+
+        might_sleep();
Yi Lai yi1.lai@intel.com reported an issue here. This interface could potentially be called in a non-sleepable context.
[    4.605633] BUG: sleeping function called from invalid context at drivers/iommu/iommu-sva.c:335
[    4.606433] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0
[    4.606975] preempt_count: 1, expected: 0
[    4.607210] RCU nest depth: 0, expected: 0
[    4.607467] 1 lock held by swapper/0/1:
[    4.607773]  #0: ffffffff8743b5c8 (vmap_purge_lock){+.+.}-{4:4}, at: _vm_unmap_aliases+0xcd/0x800
[    4.608304] Preemption disabled at:
[    4.608308] [<ffffffff81413f2a>] flush_tlb_kernel_range+0x2a/0x420
[    4.608841] CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.16.0-rc5-e864c1d7585d+ #1 PREEMPT(voluntary)
[    4.608851] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
[    4.608856] Call Trace:
[    4.608862]  <TASK>
[    4.608867]  dump_stack_lvl+0x121/0x150
[    4.608887]  dump_stack+0x19/0x20
[    4.608894]  __might_resched+0x37b/0x5a0
[    4.608910]  __might_sleep+0xa3/0x170
[    4.608919]  iommu_sva_invalidate_kva_range+0x32/0x140
[    4.608939]  flush_tlb_kernel_range+0x2d1/0x420
[    4.608951]  __purge_vmap_area_lazy+0x5ae/0xc60
[    4.608964]  _vm_unmap_aliases+0x653/0x800
[    4.608973]  ? kmemdup_noprof+0x37/0x70
[    4.608985]  ? __pfx__vm_unmap_aliases+0x10/0x10
[    4.608992]  ? ret_from_fork_asm+0x1a/0x30
[    4.609004]  ? __free_frozen_pages+0x493/0x1000
[    4.609014]  ? __free_frozen_pages+0x493/0x1000
[    4.609025]  vm_unmap_aliases+0x22/0x30
[    4.609032]  change_page_attr_set_clr+0x272/0x4c0
[    4.609046]  ? __pfx_change_page_attr_set_clr+0x10/0x10
[    4.609059]  ? __this_cpu_preempt_check+0x21/0x30
[    4.609078]  ? kasan_save_track+0x18/0x40
[    4.609099]  set_memory_nx+0xbd/0x110
[    4.609115]  ? __pfx_set_memory_nx+0x10/0x10
[    4.609128]  free_init_pages+0x82/0xd0
[    4.609137]  ? __pfx_kernel_init+0x10/0x10
[    4.609148]  mem_encrypt_free_decrypted_mem+0x4e/0x70
[    4.609173]  free_initmem+0x1c/0x40
[    4.609179]  kernel_init+0x4a/0x2f0
[    4.609190]  ret_from_fork+0x38e/0x490
[    4.609201]  ? __pfx_kernel_init+0x10/0x10
[    4.609212]  ret_from_fork_asm+0x1a/0x30
[    4.609227]  </TASK>
So we might need a spinlock to protect the sva mm_struct list? An additional change like this:
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index f6fe250d12e5..d503dd95e4e5 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -12,6 +12,7 @@
 static DEFINE_MUTEX(iommu_sva_lock);
 static DEFINE_STATIC_KEY_FALSE(iommu_sva_present);
 static LIST_HEAD(iommu_sva_mms);
+static DEFINE_SPINLOCK(iommu_mms_lock);
 static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
                                                    struct mm_struct *mm);
 
@@ -137,9 +138,11 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
         domain->users = 1;
 
         if (list_empty(&iommu_mm->sva_domains)) {
+                spin_lock(&iommu_mms_lock);
                 if (list_empty(&iommu_sva_mms))
                         static_branch_enable(&iommu_sva_present);
                 list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
+                spin_unlock(&iommu_mms_lock);
         }
         list_add(&domain->next, &iommu_mm->sva_domains);
 out:
@@ -185,9 +188,11 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
         }
 
         if (list_empty(&iommu_mm->sva_domains)) {
+                spin_lock(&iommu_mms_lock);
                 list_del(&iommu_mm->mm_list_elm);
                 if (list_empty(&iommu_sva_mms))
                         static_branch_disable(&iommu_sva_present);
+                spin_unlock(&iommu_mms_lock);
         }
 
         mutex_unlock(&iommu_sva_lock);
@@ -332,12 +337,10 @@ void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
 {
         struct iommu_mm_data *iommu_mm;
 
-        might_sleep();
-
         if (!static_branch_unlikely(&iommu_sva_present))
                 return;
 
-        guard(mutex)(&iommu_sva_lock);
+        guard(spinlock)(&iommu_mms_lock);
         list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
                 mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
 }
On Tue, Jul 08, 2025 at 01:42:53PM +0800, Baolu Lu wrote:
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+        struct iommu_mm_data *iommu_mm;
+
+        might_sleep();
Yi Lai yi1.lai@intel.com reported an issue here. This interface could potentially be called in a non-sleepable context.
Oh that's really bad, the notifiers inside the iommu driver are not required to be called in a sleepable context either, and I don't really want to change that requirement.
Can you do something about how the notifier is called to not be inside an atomic context?
Maybe we can push the kernel page table pages onto a list and free them from a work queue kind of like what the normal mm does?
Back to the shadowing idea?
Jason
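[Editorial note: as a purely illustrative sketch of the deferred-freeing idea Jason floats above (an alternative the thread ultimately did not pursue), kernel page-table pages could be queued and freed from a workqueue only after the IOMMU caches have been flushed from sleepable context. Every name below (kva_pgtable_defer_free(), kva_pgtable_free_list, reusing page->lru as the list linkage, and flushing the whole VMALLOC_START..VMALLOC_END range) is hypothetical and not part of any posted patch.]

/*
 * Hypothetical sketch only -- not part of the posted patch. Page-table
 * pages are queued instead of being freed inline; a work item later
 * flushes the IOMMU paging-structure caches from sleepable context and
 * only then returns the pages to the allocator.
 */
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/iommu.h>

static LIST_HEAD(kva_pgtable_free_list);
static DEFINE_SPINLOCK(kva_pgtable_free_lock);

static void kva_pgtable_free_workfn(struct work_struct *work)
{
        LIST_HEAD(pages);
        struct page *page, *next;

        spin_lock(&kva_pgtable_free_lock);
        list_splice_init(&kva_pgtable_free_list, &pages);
        spin_unlock(&kva_pgtable_free_lock);

        /* Sleepable context: taking iommu_sva_lock (a mutex) is fine here. */
        iommu_sva_invalidate_kva_range(VMALLOC_START, VMALLOC_END);

        list_for_each_entry_safe(page, next, &pages, lru) {
                list_del(&page->lru);
                __free_page(page);
        }
}
static DECLARE_WORK(kva_pgtable_free_work, kva_pgtable_free_workfn);

/* Hypothetical hook: called by arch code instead of freeing the page directly. */
static void kva_pgtable_defer_free(struct page *page)
{
        spin_lock(&kva_pgtable_free_lock);
        list_add(&page->lru, &kva_pgtable_free_list);
        spin_unlock(&kva_pgtable_free_lock);
        schedule_work(&kva_pgtable_free_work);
}

[The thread instead converges on keeping the synchronous flush and protecting the SVA mm list with a spinlock, as in the follow-up diff above.]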
On Tue, Jul 08, 2025 at 09:27:55AM -0300, Jason Gunthorpe wrote:
On Tue, Jul 08, 2025 at 01:42:53PM +0800, Baolu Lu wrote:
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+        struct iommu_mm_data *iommu_mm;
+
+        might_sleep();
Yi Lai yi1.lai@intel.com reported an issue here. This interface could potentially be called in a non-sleepable context.
Oh that's really bad, the notifiers inside the iommu driver are not required to be called in a sleepable context either, and I don't really want to change that requirement.
Actually, I have got confused here with the HMM use of notifiers.
The iommu drivers use arch_invalidate_secondary_tlbs so they are already in atomic contexts.
So your idea to use a spinlock seems correct.
Jason
On 7/8/25 22:06, Jason Gunthorpe wrote:
On Tue, Jul 08, 2025 at 09:27:55AM -0300, Jason Gunthorpe wrote:
On Tue, Jul 08, 2025 at 01:42:53PM +0800, Baolu Lu wrote:
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+        struct iommu_mm_data *iommu_mm;
+
+        might_sleep();
Yi Lai yi1.lai@intel.com reported an issue here. This interface could potentially be called in a non-sleepable context.
Oh that's really bad, the notifiers inside the iommu driver are not required to be called in a sleepable context either, and I don't really want to change that requirement.
Actually, I have got confused here with the HMM use of notifiers.
The iommu drivers use arch_invalidate_secondary_tlbs so they are already in atomic contexts.
So your idea to use a spinlock seems correct.
Okay, then let me post an updated version.
Thanks, baolu
Hi BaoLu,
On Fri, 4 Jul 2025 21:30:56 +0800 Lu Baolu baolu.lu@linux.intel.com wrote:
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Is there a use case where a SVA user can access kernel memory in the first place? It seems the VT-d code does not set supervisor request (SRE) for the user PASID, and I don't see an SRE equivalent in the AMD IOMMU GCR3 table. So the PTE U/S bit will prevent kernel memory access, no?
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page
Just wondering if this patch has anything specific to "intermediate page table", since invalidation hint is always 0 so the intermediate TLBs are always flushed.
table mappings are changed (e.g., by vfree()) while the IOMMU's internal caches retain stale entries, a use-after-free (UAF) condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
Cc: stable@vger.kernel.org
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
On Wed, Jul 09, 2025 at 08:51:58AM -0700, Jacob Pan wrote:
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Jason
Hi Jason,
On Wed, 9 Jul 2025 13:27:24 -0300 Jason Gunthorpe jgg@nvidia.com wrote:
On Wed, Jul 09, 2025 at 08:51:58AM -0700, Jacob Pan wrote:
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
On 7/9/25 11:15, Jacob Pan wrote:
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
SVA users can't access kernel memory, but they can compel walks of kernel page tables, which the IOMMU caches. The trouble starts if the kernel happens to free that page table page and the IOMMU is using the cache after the page is freed.
That was covered in the changelog, but I guess it could be made a bit more succinct.
Hi Dave,
On Wed, 9 Jul 2025 11:22:34 -0700 Dave Hansen dave.hansen@intel.com wrote:
On 7/9/25 11:15, Jacob Pan wrote:
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
SVA users can't access kernel memory, but they can compel walks of kernel page tables, which the IOMMU caches. The trouble starts if the kernel happens to free that page table page and the IOMMU is using the cache after the page is freed.
According to VT-d spec. 6.2.4 S1 IOTLB caching includes access privilege. "First-stage mappings: — Each of these is a mapping from a input page number in a request to the physical page frame to which it translates (derived from first-stage translation), along with information about access privileges and memory typing (if applicable)."
So you are saying the IOMMU can cache user-DMA-initiated walks with supervisor privilege? Since the SVA PASID is a user PASID, even if the IOMMU uses the cache later on, how could it get supervisor privilege?
On Wed, Jul 09, 2025 at 11:44:32AM -0700, Jacob Pan wrote:
So you are saying IOMMU can cache user DMA initiated walks and cache with supervisor privilige? Since the SVA PASID is a user PASID, even if IOMMU uses the cache later on, how could it get supervior privilege?
The walk cache (aka paging structure cache) and IOTLB cache are different things.
The walk cache has no concept of privilege. All memory holding page tables can be loaded into the walk cache. Meaning any table in the radix tree is eligible to reside in the walk cache.
So we point the IOMMU at the CR3 of a MM struct with KVA's mapped into it and the walk cache is permitted to somehow cache every single 4k page that comprises that radix tree.
Supervisor does not come into it at all. I had hoped the U/S bits within the table structure itself would affect the walk cache, but it was confirmed that they do not.
Jason
On Wed, 9 Jul 2025 11:22:34 -0700 Dave Hansen dave.hansen@intel.com wrote:
On 7/9/25 11:15, Jacob Pan wrote:
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
SVA users can't access kernel memory, but they can compel walks of kernel page tables, which the IOMMU caches. The trouble starts if the kernel happens to free that page table page and the IOMMU is using the cache after the page is freed.
That was covered in the changelog, but I guess it could be made a bit more succinct.
Is it worth just never freeing the page tables used for vmalloc() memory? After all they are likely to be reallocated again.
That (should) only require an IOMMU invalidate for pages that are actually used for I/O.
David
On Mon, Jul 14, 2025 at 01:39:20PM +0100, David Laight wrote:
On Wed, 9 Jul 2025 11:22:34 -0700 Dave Hansen dave.hansen@intel.com wrote:
On 7/9/25 11:15, Jacob Pan wrote:
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
SVA users can't access kernel memory, but they can compel walks of kernel page tables, which the IOMMU caches. The trouble starts if the kernel happens to free that page table page and the IOMMU is using the cache after the page is freed.
That was covered in the changelog, but I guess it could be made a bit more succinct.
Is it worth just never freeing the page tables used for vmalloc() memory? After all they are likely to be reallocated again.
Do we free? Maybe on some arches? According to tests (AMD x86-64) I did once upon a time, the PTE entries were not freed after vfree(). It could be expensive if we did it, due to a global "page_table_lock" lock.
I see one place though, it is in the vmap_try_huge_pud()
if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) return 0;
It is when a pud is replaced by a huge page.
-- Uladzislau Rezki
On Mon, Jul 14, 2025 at 03:19:17PM +0200, Uladzislau Rezki wrote:
On Mon, Jul 14, 2025 at 01:39:20PM +0100, David Laight wrote:
On Wed, 9 Jul 2025 11:22:34 -0700 Dave Hansen dave.hansen@intel.com wrote:
On 7/9/25 11:15, Jacob Pan wrote:
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
SVA users can't access kernel memory, but they can compel walks of kernel page tables, which the IOMMU caches. The trouble starts if the kernel happens to free that page table page and the IOMMU is using the cache after the page is freed.
That was covered in the changelog, but I guess it could be made a bit more succinct.
But does this really mean that every flush_tlb_kernel_range() should flush the IOMMU page tables as well? AFAIU, set_memory flushes TLB even when bits in pte change and it seems like an overkill...
Is it worth just never freeing the page tables used for vmalloc() memory? After all they are likely to be reallocated again.
Do we free? Maybe on some arches? According to my tests(AMD x86-64) i did once upon a time, the PTE entries were not freed after vfree(). It could be expensive if we did it, due to a global "page_table_lock" lock.
I see one place though, it is in the vmap_try_huge_pud()
if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) return 0;
it is when replace a pud by a huge-page.
There's also a place that replaces a pmd by a smaller huge page, but other than that vmalloc does not free page tables.
-- Uladzislau Rezki
From: Mike Rapoport rppt@kernel.org Sent: Monday, July 14, 2025 10:50 PM
On Mon, Jul 14, 2025 at 03:19:17PM +0200, Uladzislau Rezki wrote:
On Mon, Jul 14, 2025 at 01:39:20PM +0100, David Laight wrote:
On Wed, 9 Jul 2025 11:22:34 -0700 Dave Hansen dave.hansen@intel.com wrote:
On 7/9/25 11:15, Jacob Pan wrote:
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
SVA users can't access kernel memory, but they can compel walks of kernel page tables, which the IOMMU caches. The trouble starts if the kernel happens to free that page table page and the IOMMU is using the cache after the page is freed.
That was covered in the changelog, but I guess it could be made a bit more succinct.
But does this really mean that every flush_tlb_kernel_range() should flush the IOMMU page tables as well? AFAIU, set_memory flushes TLB even when bits in pte change and it seems like an overkill...
Is it worth just never freeing the page tables used for vmalloc() memory? After all they are likely to be reallocated again.
Do we free? Maybe on some arches? According to tests (AMD x86-64) I did once upon a time, the PTE entries were not freed after vfree(). It could be expensive if we did it, due to a global "page_table_lock" lock.
I see one place though, it is in the vmap_try_huge_pud()
if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) return 0;
It is when a pud is replaced by a huge page.
There's also a place that replaces a pmd by a smaller huge page, but other than that vmalloc does not free page tables.
Dave spotted two other places where page tables might be freed:
https://lore.kernel.org/all/62580eab-3e68-4132-981a-84167d130d9f@intel.com/
On 7/14/25 22:50, Mike Rapoport wrote:
On Mon, Jul 14, 2025 at 03:19:17PM +0200, Uladzislau Rezki wrote:
On Mon, Jul 14, 2025 at 01:39:20PM +0100, David Laight wrote:
On Wed, 9 Jul 2025 11:22:34 -0700 Dave Hansen <dave.hansen@intel.com> wrote:
On 7/9/25 11:15, Jacob Pan wrote:
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
SVA users can't access kernel memory, but they can compel walks of kernel page tables, which the IOMMU caches. The trouble starts if the kernel happens to free that page table page and the IOMMU is using the cache after the page is freed.
That was covered in the changelog, but I guess it could be made a bit more succinct.
But does this really mean that every flush_tlb_kernel_range() should flush the IOMMU page tables as well? AFAIU, set_memory flushes TLB even when bits in pte change and it seems like an overkill...
As far as I can see, only the next-level page table pointer in the middle-level entry matters. SVA is not allowed to access kernel addresses, which has been ensured by the U/S bit in the leaf PTEs, so other bit changes don't matter here.
Thanks, baolu
Hi Jacob,
On 7/10/25 02:15, Jacob Pan wrote:
Hi Jason,
On Wed, 9 Jul 2025 13:27:24 -0300 Jason Gunthorpe jgg@nvidia.com wrote:
On Wed, Jul 09, 2025 at 08:51:58AM -0700, Jacob Pan wrote:
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
Let me take a real example:
A device might be mistakenly configured to access memory at IOVA 0xffffa866001d5000 (a vmalloc'd memory region) with user-mode access permission. The corresponding page table entries for this IOVA translation, assuming a five-level page table, would appear as follows:
PGD: Entry present with U/S bit set (1)
P4D: Entry present with U/S bit set (1)
PUD: Entry present with U/S bit set (1)
PMD: Entry present with U/S bit set (1)
PTE: Entry present with U/S bit clear (0)
When the IOMMU walks this page table, it may potentially cache all present entries, regardless of the U/S bit's state. Upon reaching the leaf PTE, the IOMMU performs a permission check. This involves comparing the device's DMA access mode (in this case, user mode) against the cumulative U/S permission derived from an AND operation across all U/S bits in the traversed page table entries (which here results in U/S == 0).
The IOMMU correctly blocks this DMA access because the device's requested access (user mode) exceeds the permissions granted by the page table (supervisor-only at the PTE level). However, the PGD, P4D, PUD, and PMD entries that were traversed might remain cached within the IOMMU's paging structure cache.
Now, consider a scenario where the page table leaf page is freed and subsequently repurposed, and the U/S bit at its previous location is modified to 1. From the IOMMU's perspective, the page table for the aforementioned IOVA would now appear as follows:
PGD: Entry present with U/S bit set (1) [retrieved from paging cache]
P4D: Entry present with U/S bit set (1) [retrieved from paging cache]
PUD: Entry present with U/S bit set (1) [retrieved from paging cache]
PMD: Entry present with U/S bit set (1) [retrieved from paging cache]
PTE: Entry present with U/S bit set (1) [read from physical memory]
As a result, the device could then potentially access the memory at IOVA 0xffffa866001d5000 with user-mode permission, which was explicitly disallowed.
Thanks, baolu
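[Editorial note: the permission check Baolu walks through above effectively ANDs the U/S bit across every level of the walk before comparing it with the requested access mode. A purely illustrative sketch of that cumulative check (not IOMMU hardware or driver code):]

#include <stdbool.h>

/* Illustrative only: effective user permission is the AND of U/S across all levels. */
static bool user_dma_permitted(bool pgd_us, bool p4d_us, bool pud_us,
                               bool pmd_us, bool pte_us)
{
        /*
         * In the first walk above pte_us is false, so the access is blocked;
         * in the stale-cache scenario every level reads as true and the
         * access is wrongly permitted.
         */
        return pgd_us && p4d_us && pud_us && pmd_us && pte_us;
}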
Hi Baolu,
On Thu, 10 Jul 2025 10:57:19 +0800 Baolu Lu baolu.lu@linux.intel.com wrote:
Hi Jacob,
On 7/10/25 02:15, Jacob Pan wrote:
Hi Jason,
On Wed, 9 Jul 2025 13:27:24 -0300 Jason Gunthorpe jgg@nvidia.com wrote:
On Wed, Jul 09, 2025 at 08:51:58AM -0700, Jacob Pan wrote:
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Is there a use case where a SVA user can access kernel memory in the first place?
No. It should be fully blocked.
Then I don't understand what is the "vulnerability condition" being addressed here. We are talking about KVA range here.
Let me take a real example:
A device might be mistakenly configured to access memory at IOVA 0xffffa866001d5000 (a vmalloc'd memory region) with user-mode access permission. The corresponding page table entries for this IOVA translation, assuming a five-level page table, would appear as follows:
PGD: Entry present with U/S bit set (1)
P4D: Entry present with U/S bit set (1)
PUD: Entry present with U/S bit set (1)
PMD: Entry present with U/S bit set (1)
PTE: Entry present with U/S bit clear (0)
When the IOMMU walks this page table, it may potentially cache all present entries, regardless of the U/S bit's state. Upon reaching the leaf PTE, the IOMMU performs a permission check. This involves comparing the device's DMA access mode (in this case, user mode) against the cumulative U/S permission derived from an AND operation across all U/S bits in the traversed page table entries (which here results in U/S == 0).
why would IOMMU cache all the entries if the walk is not successful?
Also, per x86 vm map how could this example (UUUUS) happen to SVA? i.e. sharing intermediate levels.
ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB  | vmalloc/ioremap
0000000000000000 |      0 | 00007fffffffffff | 128 TB | user-space
The IOMMU correctly blocks this DMA access because the device's requested access (user mode) exceeds the permissions granted by the page table (supervisor-only at the PTE level). However, the PGD, P4D, PUD, and PMD entries that were traversed might remain cached within the IOMMU's paging structure cache.
Now, consider a scenario where the page table leaf page is freed and subsequently repurposed, and the U/S bit at its previous location is modified to 1. From the IOMMU's perspective, the page table for the aforementioned IOVA would now appear as follows:
PGD: Entry present with U/S bit set (1) [retrieved from paging cache]
P4D: Entry present with U/S bit set (1) [retrieved from paging cache]
PUD: Entry present with U/S bit set (1) [retrieved from paging cache]
PMD: Entry present with U/S bit set (1) [retrieved from paging cache]
PTE: Entry present with U/S bit set (1) [read from physical memory]
As a result, the device could then potentially access the memory at IOVA 0xffffa866001d5000 with user-mode permission, which was explicitly disallowed.
Thanks, baolu
On Thu, Jul 10, 2025 at 08:28:08AM -0700, Jacob Pan wrote:
why would IOMMU cache all the entries if the walk is not successful?
Sadly, because nothing in the architecture said not to..
Also, per x86 vm map how could this example (UUUUS) happen to SVA? i.e. sharing intermediate levels.
ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB  | vmalloc/ioremap
0000000000000000 |      0 | 00007fffffffffff | 128 TB | user-space
Because Linux only uses the leaf U/S bit, the interior bits are set so they do not override the leaf.
Jason
On 7/10/25 08:28, Jacob Pan wrote:
why would IOMMU cache all the entries if the walk is not successful?
This was one of those things which the IOMMU folks could have gone either direction on. But, they generally choose to mirror the CPU behavior when they can.
The CPU does page walks the same way. It probably requires less logic because the caches can be filled while walking down the tree and don't have to be evicted if the walk is ultimately unsuccessful.