The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, a Use-After-Free (UAF) condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices")
Cc: stable@vger.kernel.org
Reported-by: Jann Horn <jannh@google.com>
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Tested-by: Yi Lai <yi1.lai@intel.com>
---
 arch/x86/mm/tlb.c         |  2 ++
 drivers/iommu/iommu-sva.c | 34 +++++++++++++++++++++++++++++++++-
 include/linux/iommu.h     |  4 ++++
 3 files changed, 39 insertions(+), 1 deletion(-)
Change log:

v2:
 - Remove EXPORT_SYMBOL_GPL(iommu_sva_invalidate_kva_range);
 - Replace the mutex with a spinlock to make the interface usable in the
   critical regions.
v1: https://lore.kernel.org/linux-iommu/20250704133056.4023816-1-baolu.lu@linux....
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 39f80111e6f1..a41499dfdc3f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -12,6 +12,7 @@
 #include <linux/task_work.h>
 #include <linux/mmu_notifier.h>
 #include <linux/mmu_context.h>
+#include <linux/iommu.h>

 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 		kernel_tlb_flush_range(info);

 	put_flush_tlb_info();
+	iommu_sva_invalidate_kva_range(start, end);
 }

 /*
diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 1a51cfd82808..fd76aefa5a88 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -10,6 +10,9 @@
 #include "iommu-priv.h"

 static DEFINE_MUTEX(iommu_sva_lock);
+static DEFINE_STATIC_KEY_FALSE(iommu_sva_present);
+static LIST_HEAD(iommu_sva_mms);
+static DEFINE_SPINLOCK(iommu_mms_lock);
 static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
                                                    struct mm_struct *mm);

@@ -42,6 +45,7 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de
 		return ERR_PTR(-ENOSPC);
 	}
 	iommu_mm->pasid = pasid;
+	iommu_mm->mm = mm;
 	INIT_LIST_HEAD(&iommu_mm->sva_domains);
 	/*
 	 * Make sure the write to mm->iommu_mm is not reordered in front of
@@ -132,8 +136,15 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
 	if (ret)
 		goto out_free_domain;
 	domain->users = 1;
-	list_add(&domain->next, &mm->iommu_mm->sva_domains);
+	if (list_empty(&iommu_mm->sva_domains)) {
+		scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
+			if (list_empty(&iommu_sva_mms))
+				static_branch_enable(&iommu_sva_present);
+			list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
+		}
+	}
+	list_add(&domain->next, &iommu_mm->sva_domains);
 out:
 	refcount_set(&handle->users, 1);
 	mutex_unlock(&iommu_sva_lock);
@@ -175,6 +186,15 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
 		list_del(&domain->next);
 		iommu_domain_free(domain);
 	}
+
+	if (list_empty(&iommu_mm->sva_domains)) {
+		scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
+			list_del(&iommu_mm->mm_list_elm);
+			if (list_empty(&iommu_sva_mms))
+				static_branch_disable(&iommu_sva_present);
+		}
+	}
+
 	mutex_unlock(&iommu_sva_lock);
 	kfree(handle);
 }
@@ -312,3 +332,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,

 	return domain;
 }
+
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+	struct iommu_mm_data *iommu_mm;
+
+	if (!static_branch_unlikely(&iommu_sva_present))
+		return;
+
+	guard(spinlock_irqsave)(&iommu_mms_lock);
+	list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
+		mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 156732807994..31330c12b8ee 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1090,7 +1090,9 @@ struct iommu_sva {

 struct iommu_mm_data {
 	u32 pasid;
+	struct mm_struct *mm;
 	struct list_head sva_domains;
+	struct list_head mm_list_elm;
 };

 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode);
@@ -1571,6 +1573,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev,
 					struct mm_struct *mm);
 void iommu_sva_unbind_device(struct iommu_sva *handle);
 u32 iommu_sva_get_pasid(struct iommu_sva *handle);
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);
 #else
 static inline struct iommu_sva *
 iommu_sva_bind_device(struct device *dev, struct mm_struct *mm)
@@ -1595,6 +1598,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm)
 }

 static inline void mm_pasid_drop(struct mm_struct *mm) {}
+static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {}
 #endif /* CONFIG_IOMMU_SVA */

 #ifdef CONFIG_IOMMU_IOPF
On 7/8/25 23:28, Lu Baolu wrote:
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, Use-After-Free (UAF) vulnerability condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
The approach here is certainly conservative and simple. It's also not going to cause big problems on systems without fancy IOMMUs.
But I am a _bit_ worried that it's _too_ conservative. The changelog talks about page table page freeing, but the actual code:
@@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 		kernel_tlb_flush_range(info);

 	put_flush_tlb_info();
+	iommu_sva_invalidate_kva_range(start, end);
 }
is in a very generic TLB flushing spot that's used for a lot more than just freeing page tables.
If the problem is truly limited to freeing page tables, it needs to be commented appropriately.
On 7/9/25 23:29, Dave Hansen wrote:
On 7/8/25 23:28, Lu Baolu wrote:
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, Use-After-Free (UAF) vulnerability condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
The approach here is certainly conservative and simple. It's also not going to cause big problems on systems without fancy IOMMUs.
But I am a _bit_ worried that it's _too_ conservative. The changelog talks about page table page freeing, but the actual code:
@@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 		kernel_tlb_flush_range(info);

 	put_flush_tlb_info();
+	iommu_sva_invalidate_kva_range(start, end);
 }
is in a very generic TLB flushing spot that's used for a lot more than just freeing page tables.
If the problem is truly limited to freeing page tables, it needs to be commented appropriately.
Yeah, good comments. It should not be limited to freeing page tables; freeing page tables is just a real case that we can see in the vmalloc/ vfree paths. Theoretically, whenever a kernel page table update is done and the CPU TLB needs to be flushed, the secondary TLB (i.e., the caches on the IOMMU) should be flushed accordingly. It's assumed that this happens in flush_tlb_kernel_range().
Thanks, baolu
From: Baolu Lu baolu.lu@linux.intel.com Sent: Thursday, July 10, 2025 10:15 AM
On 7/9/25 23:29, Dave Hansen wrote:
On 7/8/25 23:28, Lu Baolu wrote:
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, a Use-After-Free (UAF) condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
The approach here is certainly conservative and simple. It's also not going to cause big problems on systems without fancy IOMMUs.
But I am a _bit_ worried that it's _too_ conservative. The changelog talks about page table page freeing, but the actual code:
@@ -1540,6 +1541,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 		kernel_tlb_flush_range(info);

 	put_flush_tlb_info();
+	iommu_sva_invalidate_kva_range(start, end);
 }
is in a very generic TLB flushing spot that's used for a lot more than just freeing page tables.
If the problem is truly limited to freeing page tables, it needs to be commented appropriately.
Yeah, good comments. It should not be limited to freeing page tables; freeing page tables is just a real case that we can see in the vmalloc/ vfree paths. Theoretically, whenever a kernel page table update is done and the CPU TLB needs to be flushed, the secondary TLB (i.e., the caches on the IOMMU) should be flushed accordingly. It's assumed that this happens in flush_tlb_kernel_range().
this conservative approach sounds safer - even if we have overlooked some threat beyond page table freeing, doing the invalidation in this function is sufficient to mitigate it.

but as Dave suggested, let's add a comment above it to clarify the motivation for doing so.
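For illustration, such a comment could look roughly like this at the call site in flush_tlb_kernel_range() (the wording is only a sketch, not the final patch):

	put_flush_tlb_info();
	/*
	 * Kernel page-table pages freed here (e.g. by vfree()) may still be
	 * referenced by the paging-structure caches of SVA-capable IOMMUs,
	 * which walk the kernel half of the shared CPU page tables. Flush
	 * those secondary caches whenever the CPU TLB is flushed for a
	 * kernel range, so a freed and reused table page cannot be walked
	 * by a device.
	 */
	iommu_sva_invalidate_kva_range(start, end);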
On 7/9/25 19:14, Baolu Lu wrote:
If the problem is truly limited to freeing page tables, it needs to be commented appropriately.
Yeah, good comments. It should not be limited to freeing page tables; freeing page tables is just a real case that we can see in the vmalloc/ vfree paths. Theoretically, whenever a kernel page table update is done and the CPU TLB needs to be flushed, the secondary TLB (i.e., the caches on the IOMMU) should be flushed accordingly. It's assumed that this happens in flush_tlb_kernel_range().
Could you elaborate on this a bit further?
I thought that the IOMMU was only ever doing "user" permission walks and never "supervisor". That's why we had the whole discussion about whether the IOMMU would stop if it saw an upper-level entry with _PAGE_USER clear.
The reason this matters is that leaf kernel page table entries always have _PAGE_USER clear. That means an IOMMU doing "user" permission walks should never be able to do anything with them. Even if an IOTLB entry was created* for a _PAGE_USER=0 PTE, the "user" permission walk will stop when it finds that entry.
It doesn't matter if the entry is stale or not. The "user" permission IOMMU walk can't do anything with the supervisor mapping.
Why does this matter? We flush the CPU TLB in a bunch of different ways, _especially_ when it's being done for kernel mappings. For example, __flush_tlb_all() is a non-ranged kernel flush which has a completely parallel implementation with flush_tlb_kernel_range(). Call sites that use _it_ are unaffected by the patch here.
Basically, if we're only worried about vmalloc/vfree freeing page tables, then this patch is OK. If the problem is bigger than that, then we need a more comprehensive patch.
* I'm not sure if the IOMMU will even create an IOTLB entry for a supervisor-permission mapping while doing a user-permission walk.
On Thu, Jul 10, 2025 at 05:53:16AM -0700, Dave Hansen wrote:
On 7/9/25 19:14, Baolu Lu wrote:
If the problem is truly limited to freeing page tables, it needs to be commented appropriately.
Yeah, good comments. It should not be limited to freeing page tables; freeing page tables is just a real case that we can see in the vmalloc/ vfree paths. Theoretically, whenever a kernel page table update is done and the CPU TLB needs to be flushed, the secondary TLB (i.e., the caches on the IOMMU) should be flushed accordingly. It's assumed that this happens in flush_tlb_kernel_range().
Could you elaborate on this a bit further?
I thought that the IOMMU was only ever doing "user" permission walks and never "supervisor". That's why we had the whole discussion about whether the IOMMU would stop if it saw an upper-level entry with _PAGE_USER clear.
Yes
The reason this matters is that leaf kernel page table entries always have _PAGE_USER clear. That means an IOMMU doing "user" permission walks should never be able to do anything with them. Even if an IOTLB entry was created* for a _PAGE_USER=0 PTE, the "user" permission walk will stop when it finds that entry.
Yes, if the IOTLB caches a supervisor entry it doesn't matter since if the cache hits the S/U check will still fail and it will never get used. It is just wasting IOTLB cache space. No need to invalidate IOTLB.
Why does this matter? We flush the CPU TLB in a bunch of different ways, _especially_ when it's being done for kernel mappings. For example, __flush_tlb_all() is a non-ranged kernel flush which has a completely parallel implementation with flush_tlb_kernel_range(). Call sites that use _it_ are unaffected by the patch here.
Basically, if we're only worried about vmalloc/vfree freeing page tables, then this patch is OK. If the problem is bigger than that, then we need a more comprehensive patch.
I think we are worried about any place that frees page tables.
* I'm not sure if the IOMMU will even create an IOTLB entry for a supervisor-permission mapping while doing a user-permission walk.
It doesn't matter if it does or doesn't, it doesn't change the argument that we don't have to invalidate on PTEs being changed.
Jason
On 7/10/25 06:22, Jason Gunthorpe wrote:
Why does this matter? We flush the CPU TLB in a bunch of different ways, _especially_ when it's being done for kernel mappings. For example, __flush_tlb_all() is a non-ranged kernel flush which has a completely parallel implementation with flush_tlb_kernel_range(). Call sites that use _it_ are unaffected by the patch here.
Basically, if we're only worried about vmalloc/vfree freeing page tables, then this patch is OK. If the problem is bigger than that, then we need a more comprehensive patch.
I think we are worried about any place that frees page tables.
The two places that come to mind are the remove_memory() code and __change_page_attr().
The remove_memory() gunk is in arch/x86/mm/init_64.c. It has a few sites that do flush_tlb_all(). Now that I'm looking at it, there look to be some races between freeing page tables pages and flushing the TLB. But, basically, if you stick to the sites in there that do flush_tlb_all() after free_pagetable(), you should be good.
As for the __change_page_attr() code, I think the only spot you need to hit is cpa_collapse_large_pages() and maybe the one in __split_large_page() as well.
This is all disturbingly ad-hoc, though. The remove_memory() code needs fixing and I'll probably go try to bring some order to the chaos in the process of fixing it up. But that's a separate problem than this IOMMU fun.
From: Hansen, Dave dave.hansen@intel.com Sent: Thursday, July 10, 2025 11:26 PM
On 7/10/25 06:22, Jason Gunthorpe wrote:
Why does this matter? We flush the CPU TLB in a bunch of different ways, _especially_ when it's being done for kernel mappings. For example, __flush_tlb_all() is a non-ranged kernel flush which has a completely parallel implementation with flush_tlb_kernel_range(). Call sites that use _it_ are unaffected by the patch here.
Basically, if we're only worried about vmalloc/vfree freeing page tables, then this patch is OK. If the problem is bigger than that, then we need a more comprehensive patch.
I think we are worried about any place that frees page tables.
The two places that come to mind are the remove_memory() code and __change_page_attr().
The remove_memory() gunk is in arch/x86/mm/init_64.c. It has a few sites that do flush_tlb_all(). Now that I'm looking at it, there look to be some races between freeing page tables pages and flushing the TLB. But, basically, if you stick to the sites in there that do flush_tlb_all() after free_pagetable(), you should be good.
Isn't doing the flush after free_pagetable() leaving a small window for attack? The page table is freed and may have been repurposed while the stale paging-structure cache still treats it as a page table page...

it looks like it's not unusual to see a similar pattern in other mm code:
	vmemmap_split_pmd()
	{
		if (likely(pmd_leaf(*pmd))) {
			...
		} else {
			pte_free_kernel(&init_mm, pgtable);
		}
		...
	}
then the tlb flush is postponed to vmemmap_remap_range():
	walk_page_range_novma();
	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
		flush_tlb_kernel_range(start, end);
or even postponed even later if NO_TLB_FLUSH is set.
those call sites might have been scrutinized to be safe with regard to CPU execution flows, but I'm not sure the conditions for considering them safe also apply to the said attack via device SVA.
somehow we may really need to re-look at the other two options (kpti or shadow pgd) which solve this problem from another angle...
From: Tian, Kevin Sent: Friday, July 11, 2025 10:46 AM
From: Hansen, Dave dave.hansen@intel.com Sent: Thursday, July 10, 2025 11:26 PM
On 7/10/25 06:22, Jason Gunthorpe wrote:
Why does this matter? We flush the CPU TLB in a bunch of different ways, _especially_ when it's being done for kernel mappings. For example, __flush_tlb_all() is a non-ranged kernel flush which has a completely parallel implementation with flush_tlb_kernel_range(). Call sites that use _it_ are unaffected by the patch here.
Basically, if we're only worried about vmalloc/vfree freeing page tables, then this patch is OK. If the problem is bigger than that, then we need a more comprehensive patch.
I think we are worried about any place that frees page tables.
The two places that come to mind are the remove_memory() code and __change_page_attr().
The remove_memory() gunk is in arch/x86/mm/init_64.c. It has a few sites that do flush_tlb_all(). Now that I'm looking at it, there look to be some races between freeing page tables pages and flushing the TLB. But, basically, if you stick to the sites in there that do flush_tlb_all() after free_pagetable(), you should be good.
Isn't doing flush after free_pagetable() leaving a small window for attack? the page table is freed and may have been repurposed while the stale paging structure cache still treats it as a page table page...
looks it's not unusual to see similar pattern in other mm code:
	vmemmap_split_pmd()
	{
		if (likely(pmd_leaf(*pmd))) {
			...
		} else {
			pte_free_kernel(&init_mm, pgtable);
		}
		...
	}
oh, please ignore this comment. It's not about freeing the existing page table. That was an incautious look. 😊
On Thu, Jul 10, 2025 at 08:26:06AM -0700, Dave Hansen wrote:
On 7/10/25 06:22, Jason Gunthorpe wrote:
Why does this matter? We flush the CPU TLB in a bunch of different ways, _especially_ when it's being done for kernel mappings. For example, __flush_tlb_all() is a non-ranged kernel flush which has a completely parallel implementation with flush_tlb_kernel_range(). Call sites that use _it_ are unaffected by the patch here.
Basically, if we're only worried about vmalloc/vfree freeing page tables, then this patch is OK. If the problem is bigger than that, then we need a more comprehensive patch.
I think we are worried about any place that frees page tables.
The two places that come to mind are the remove_memory() code and __change_page_attr().
The remove_memory() gunk is in arch/x86/mm/init_64.c. It has a few sites that do flush_tlb_all(). Now that I'm looking at it, there look to be some races between freeing page tables pages and flushing the TLB. But, basically, if you stick to the sites in there that do flush_tlb_all() after free_pagetable(), you should be good.
As for the __change_page_attr() code, I think the only spot you need to hit is cpa_collapse_large_pages() and maybe the one in __split_large_page() as well.
This is all disturbingly ad-hoc, though. The remove_memory() code needs fixing and I'll probably go try to bring some order to the chaos in the process of fixing it up. But that's a separate problem than this IOMMU fun.
Could we consider splitting flush_tlb_kernel_range() into two different versions:
- one which only flushes the CPU TLB
- one which flushes the CPU paging-structure cache and then notifies the IOMMU to do the same (e.g., in pud_free_pmd_page()/pmd_free_pte_page())?
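A rough sketch of what that split could look like (the second helper's name and placement are hypothetical, purely to illustrate the proposal):

	/* Plain CPU TLB flush: enough when only leaf kernel PTEs change. */
	void flush_tlb_kernel_range(unsigned long start, unsigned long end);

	/*
	 * Hypothetical variant for paths that free kernel page-table pages,
	 * e.g. pud_free_pmd_page()/pmd_free_pte_page(): also flush the CPU
	 * paging-structure caches and the IOMMU's secondary caches.
	 */
	static inline void flush_tlb_kernel_pgtable_range(unsigned long start,
							  unsigned long end)
	{
		flush_tlb_kernel_range(start, end);
		iommu_sva_invalidate_kva_range(start, end);
	}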
B.R. Yu
From: Jason Gunthorpe jgg@nvidia.com Sent: Thursday, July 10, 2025 9:23 PM
On Thu, Jul 10, 2025 at 05:53:16AM -0700, Dave Hansen wrote:
* I'm not sure if the IOMMU will even create an IOTLB entry for a supervisor-permission mapping while doing a user-permission walk.
Early VT-d platforms may cache faulted translation. But they are pretty old and way before SVA was introduced in spec.
It doesn't matter if it does or doesn't, it doesn't change the argument that we don't have to invalidate on PTEs being changed.
but yes, that fact doesn't matter.
From: Lu Baolu baolu.lu@linux.intel.com Sent: Wednesday, July 9, 2025 2:28 PM
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the
I'd remove 'static'
IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, Use-After-Free (UAF) vulnerability condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
this lacks the background that currently the iommu driver is notified only of changes to user VA mappings, so the IOMMU's internal caches may retain stale entries for kernel VA.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
this also needs some words about the fact that new flushes are triggered not just for freeing page tables.
static DEFINE_MUTEX(iommu_sva_lock); +static DEFINE_STATIC_KEY_FALSE(iommu_sva_present); +static LIST_HEAD(iommu_sva_mms); +static DEFINE_SPINLOCK(iommu_mms_lock);
s/iommu_mms_lock/iommu_mm_lock/
Reviewed-by: Kevin Tian kevin.tian@intel.com
On Thu, Jul 10, 2025 at 03:02:07AM +0000, Tian, Kevin wrote:
From: Lu Baolu baolu.lu@linux.intel.com Sent: Wednesday, July 9, 2025 2:28 PM
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the
I'd remove 'static'
IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, Use-After-Free (UAF) vulnerability condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
this lacks of a background that currently the iommu driver is notified only for changes of user VA mappings, so the IOMMU's internal caches may retain stale entries for kernel VA.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
this also needs some words about the fact that new flushes are triggered not just for freeing page tables.
Thank you, Kevin. A question about the background of this issue:
My understanding of the attack scenario is: a malicious user application could initiate DMA to some vmalloc'ed address, causing the paging-structure cache to be loaded and then possibly used after that paging structure is freed (and perhaps allocated to some other user later).

If that is the case, we only need to do the flush when the paging structures are freed. I mean, the IOTLB entries may not be loaded at all when the permission check fails. Did I miss anything? :)
B.R. Yu
From: Yu Zhang zhangyu1@linux.microsoft.com Sent: Thursday, July 10, 2025 4:11 PM
On Thu, Jul 10, 2025 at 03:02:07AM +0000, Tian, Kevin wrote:
From: Lu Baolu baolu.lu@linux.intel.com Sent: Wednesday, July 9, 2025 2:28 PM
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the
I'd remove 'static'
IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, Use-After-Free (UAF) vulnerability condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
this lacks of a background that currently the iommu driver is notified only for changes of user VA mappings, so the IOMMU's internal caches may retain stale entries for kernel VA.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
this also needs some words about the fact that new flushes are triggered not just for freeing page tables.
Thank you, Kevin. A question about the background of this issue:
My understanding of the attacking scenario is, a malicious user application could initiate DMAs to some vmalloced address, causing the paging structure cache being loaded and then possibly being used after that paging structure is freed(may be allocated to some other users later).
If that is the case, only when the paging structures are freed, do we need to do the flush. I mean, the IOTLB entries may not be loaded at all when the permission check failes. Did I miss anything? :)
It's about the paging structure cache instead of IOTLB.
You may look at the discussion in v1 for more background, especially the latest reply from Baolu about a detailed example:
https://lore.kernel.org/linux-iommu/2080aaea-0d6e-418e-8391-ddac9b39c109@lin...
On Thu, Jul 10, 2025 at 08:15:27AM +0000, Tian, Kevin wrote:
From: Yu Zhang zhangyu1@linux.microsoft.com Sent: Thursday, July 10, 2025 4:11 PM
On Thu, Jul 10, 2025 at 03:02:07AM +0000, Tian, Kevin wrote:
From: Lu Baolu baolu.lu@linux.intel.com Sent: Wednesday, July 9, 2025 2:28 PM
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the
I'd remove 'static'
IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, Use-After-Free (UAF) vulnerability condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
this lacks of a background that currently the iommu driver is notified only for changes of user VA mappings, so the IOMMU's internal caches may retain stale entries for kernel VA.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
this also needs some words about the fact that new flushes are triggered not just for freeing page tables.
Thank you, Kevin. A question about the background of this issue:
My understanding of the attacking scenario is, a malicious user application could initiate DMAs to some vmalloced address, causing the paging structure cache being loaded and then possibly being used after that paging structure is freed(may be allocated to some other users later).
If that is the case, only when the paging structures are freed, do we need to do the flush. I mean, the IOTLB entries may not be loaded at all when the permission check failes. Did I miss anything? :)
It's about the paging structure cache instead of IOTLB.
You may look at the discussion in v1 for more background, especially the latest reply from Baolu about a detailed example:
https://lore.kernel.org/linux-iommu/2080aaea-0d6e-418e-8391-ddac9b39c109@lin...
Thank you, Kevin. This really helps.
And by pointing out that "this also needs some words about the fact that new flushes are triggered not just for freeing page tables", do you mean this fix is not an optimal one, because iommu_sva_invalidate_kva_range() is triggered for each unmapping of a vmalloc'ed address?

Do we have any choice, e.g., not to trigger the flush when the page table (or directory, etc.) is not freed?
B.R. Yu
On Wed, Jul 09, 2025 at 02:28:00PM +0800, Lu Baolu wrote:
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, Use-After-Free (UAF) vulnerability condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
I must say I liked the kPTI based idea better. Having to iterate and invalidate an unspecified number of IOMMUs from non-preemptible context seems 'unfortunate'.
Why was this approach chosen over the kPTI one, where we keep a page-table root that simply does not include the kernel bits, and therefore the IOMMU will never see them (change) and we'll never have to invalidate?
@@ -132,8 +136,15 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
 	if (ret)
 		goto out_free_domain;
 	domain->users = 1;
-	list_add(&domain->next, &mm->iommu_mm->sva_domains);
+	if (list_empty(&iommu_mm->sva_domains)) {
+		scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
+			if (list_empty(&iommu_sva_mms))
+				static_branch_enable(&iommu_sva_present);
+			list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
+		}
+	}
+	list_add(&domain->next, &iommu_mm->sva_domains);
 out:
 	refcount_set(&handle->users, 1);
 	mutex_unlock(&iommu_sva_lock);
@@ -175,6 +186,15 @@ void iommu_sva_unbind_device(struct iommu_sva *handle)
 		list_del(&domain->next);
 		iommu_domain_free(domain);
 	}
+
+	if (list_empty(&iommu_mm->sva_domains)) {
+		scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
+			list_del(&iommu_mm->mm_list_elm);
+			if (list_empty(&iommu_sva_mms))
+				static_branch_disable(&iommu_sva_present);
+		}
+	}
+
 	mutex_unlock(&iommu_sva_lock);
 	kfree(handle);
 }
This seems an odd coding style choice; why the extra unneeded indentation? That is, what's wrong with:
	if (list_empty()) {
		guard(spinlock_irqsave)(&iommu_mms_lock);
		list_del();
		if (list_empty())
			static_branch_disable();
	}
@@ -312,3 +332,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,

 	return domain;
 }
+
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end)
+{
+	struct iommu_mm_data *iommu_mm;
+
+	if (!static_branch_unlikely(&iommu_sva_present))
+		return;
+
+	guard(spinlock_irqsave)(&iommu_mms_lock);
+	list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
+		mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
This is absolutely the wrong way to use static_branch. You want them in inline functions guarding the function call, not inside the function call.
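A minimal sketch of the pattern being described here, assuming a header-level wrapper (the declared key plus the __-prefixed out-of-line helper are illustrative names, not part of the posted patch):

	/* include/linux/iommu.h -- illustrative sketch only */
	DECLARE_STATIC_KEY_FALSE(iommu_sva_present);
	void __iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end);

	static inline void iommu_sva_invalidate_kva_range(unsigned long start,
							  unsigned long end)
	{
		/*
		 * The static key is tested at the call site, so the common
		 * "no SVA user" case costs a single patched-out branch and
		 * never enters the out-of-line function.
		 */
		if (static_branch_unlikely(&iommu_sva_present))
			__iommu_sva_invalidate_kva_range(start, end);
	}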
On Thu, Jul 10, 2025 at 03:54:32PM +0200, Peter Zijlstra wrote:
@@ -132,8 +136,15 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1;
- list_add(&domain->next, &mm->iommu_mm->sva_domains);
- if (list_empty(&iommu_mm->sva_domains)) {
scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
if (list_empty(&iommu_sva_mms))
static_branch_enable(&iommu_sva_present);
list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
}
- }
- list_add(&domain->next, &iommu_mm->sva_domains);
out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock); @@ -175,6 +186,15 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) list_del(&domain->next); iommu_domain_free(domain); }
- if (list_empty(&iommu_mm->sva_domains)) {
scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
list_del(&iommu_mm->mm_list_elm);
if (list_empty(&iommu_sva_mms))
static_branch_disable(&iommu_sva_present);
}
- }
- mutex_unlock(&iommu_sva_lock); kfree(handle);
}
This seems an odd coding style choice; why the extra unneeded indentation? That is, what's wrong with:
	if (list_empty()) {
		guard(spinlock_irqsave)(&iommu_mms_lock);
		list_del();
		if (list_empty())
			static_branch_disable();
	}
Well, for one, you can't do static_branch_{en,dis}able() from atomic context...
Was this ever tested?
On 7/10/25 23:53, Peter Zijlstra wrote:
On Thu, Jul 10, 2025 at 03:54:32PM +0200, Peter Zijlstra wrote:
@@ -132,8 +136,15 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1;
- list_add(&domain->next, &mm->iommu_mm->sva_domains);
- if (list_empty(&iommu_mm->sva_domains)) {
scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
if (list_empty(&iommu_sva_mms))
static_branch_enable(&iommu_sva_present);
list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
}
- }
- list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock);
@@ -175,6 +186,15 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) list_del(&domain->next); iommu_domain_free(domain); }
- if (list_empty(&iommu_mm->sva_domains)) {
scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
list_del(&iommu_mm->mm_list_elm);
if (list_empty(&iommu_sva_mms))
static_branch_disable(&iommu_sva_present);
}
- }
- mutex_unlock(&iommu_sva_lock); kfree(handle); }
This seems an odd coding style choice; why the extra unneeded indentation? That is, what's wrong with:
if (list_empty()) { guard(spinlock_irqsave)(&iommu_mms_lock); list_del(); if (list_empty() static_branch_disable(); }
Well, for one, you can't do static_branch_{en,dis}able() from atomic context...
Was this ever tested?
I conducted unit tests for vmalloc()/vfree() scenarios, and Yi performed fuzzing tests. We have not observed any warning messages. Perhaps static_branch_disable() is not triggered in the test cases?
Thanks, baolu
On Fri, Jul 11, 2025 at 11:09:00AM +0800, Baolu Lu wrote:
On 7/10/25 23:53, Peter Zijlstra wrote:
On Thu, Jul 10, 2025 at 03:54:32PM +0200, Peter Zijlstra wrote:
@@ -132,8 +136,15 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1;
- list_add(&domain->next, &mm->iommu_mm->sva_domains);
- if (list_empty(&iommu_mm->sva_domains)) {
scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
if (list_empty(&iommu_sva_mms))
static_branch_enable(&iommu_sva_present);
list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
}
- }
- list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock);
@@ -175,6 +186,15 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) list_del(&domain->next); iommu_domain_free(domain); }
- if (list_empty(&iommu_mm->sva_domains)) {
scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
list_del(&iommu_mm->mm_list_elm);
if (list_empty(&iommu_sva_mms))
static_branch_disable(&iommu_sva_present);
}
- }
- mutex_unlock(&iommu_sva_lock); kfree(handle); }
This seems an odd coding style choice; why the extra unneeded indentation? That is, what's wrong with:
if (list_empty()) { guard(spinlock_irqsave)(&iommu_mms_lock); list_del(); if (list_empty() static_branch_disable(); }
Well, for one, you can't do static_branch_{en,dis}able() from atomic context...
Was this ever tested?
I conducted unit tests for vmalloc()/vfree() scenarios, and Yi performed fuzzing tests. We have not observed any warning messages. Perhaps static_branch_disable() is not triggered in the test cases?
Same with static_branch_enable(). These functions start with cpus_read_lock(), which is percpu_down_read(), which has might_sleep().
Hi Peter Z,
On 7/10/25 21:54, Peter Zijlstra wrote:
On Wed, Jul 09, 2025 at 02:28:00PM +0800, Lu Baolu wrote:
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, Use-After-Free (UAF) vulnerability condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
I must say I liked the kPTI based idea better. Having to iterate and invalidate an unspecified number of IOMMUs from non-preemptible context seems 'unfortunate'.
The cache invalidation path in IOMMU drivers is already critical and operates within a non-preemptible context. This approach is, in fact, already utilized for user-space page table updates since the beginning of SVA support.
Why was this approach chosen over the kPTI one, where we keep a page-table root that simply does not include the kernel bits, and therefore the IOMMU will never see them (change) and we'll never have to invalidate?
The IOMMU subsystem started supporting the SVA feature in 2019, and it has been broadly adopted in various production kernels. The issue described here is fundamentally a software bug related to not maintaining IOMMU cache coherence. Therefore, we need a quick and simple fix to address it, and this patch can be easily backported to production kernels.
While a kPTI-based approach might appear more attractive, I believe some extra work is still required to properly integrate it into the IOMMU subsystem. For instance, kPTI is currently an optional mitigation, enabled via CONFIG_MITIGATION_PAGE_TABLE_ISOLATION and bypassable with the "nopti" kernel parameter. This optionality is not suitable for the IOMMU subsystem, as software must always guarantee IOMMU cache coherence for functional correctness and security.
So, in the short term, let's proceed with a straightforward solution to resolve this issue and ensure the SVA feature functions correctly. For the long term, we can explore optimizations and deeper integration aligned with features like kPTI.
@@ -132,8 +136,15 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm if (ret) goto out_free_domain; domain->users = 1;
- list_add(&domain->next, &mm->iommu_mm->sva_domains);
- if (list_empty(&iommu_mm->sva_domains)) {
scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
if (list_empty(&iommu_sva_mms))
static_branch_enable(&iommu_sva_present);
list_add(&iommu_mm->mm_list_elm, &iommu_sva_mms);
}
- }
- list_add(&domain->next, &iommu_mm->sva_domains); out: refcount_set(&handle->users, 1); mutex_unlock(&iommu_sva_lock);
@@ -175,6 +186,15 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) list_del(&domain->next); iommu_domain_free(domain); }
- if (list_empty(&iommu_mm->sva_domains)) {
scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
list_del(&iommu_mm->mm_list_elm);
if (list_empty(&iommu_sva_mms))
static_branch_disable(&iommu_sva_present);
}
- }
- mutex_unlock(&iommu_sva_lock); kfree(handle); }
This seems an odd coding style choice; why the extra unneeded indentation? That is, what's wrong with:
	if (list_empty()) {
		guard(spinlock_irqsave)(&iommu_mms_lock);
		list_del();
		if (list_empty())
			static_branch_disable();
	}
Perhaps I overlooked or misunderstood something, but my understanding is,
The lock order in this function is:
	mutex_lock(&iommu_sva_lock);
	spin_lock(&iommu_mms_lock);
	spin_unlock(&iommu_mms_lock);
	mutex_unlock(&iommu_sva_lock);
With above change, it is changed to:
	mutex_lock(&iommu_sva_lock);
	spin_lock(&iommu_mms_lock);
	mutex_unlock(&iommu_sva_lock);
	spin_unlock(&iommu_mms_lock);
@@ -312,3 +332,15 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, return domain; }
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) +{
- struct iommu_mm_data *iommu_mm;
- if (!static_branch_unlikely(&iommu_sva_present))
return;
- guard(spinlock_irqsave)(&iommu_mms_lock);
- list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
This is absolutely the wrong way to use static_branch. You want them in inline functions guarding the function call, not inside the function call.
I don't think a static branch is desirable here, as we have no idea how often the condition will switch in real-world scenarios. I will remove it in the next version if there are no objections.
Thanks, baolu
From: Baolu Lu baolu.lu@linux.intel.com Sent: Friday, July 11, 2025 11:00 AM
On 7/10/25 21:54, Peter Zijlstra wrote:
On Wed, Jul 09, 2025 at 02:28:00PM +0800, Lu Baolu wrote:
- if (list_empty(&iommu_mm->sva_domains)) {
scoped_guard(spinlock_irqsave, &iommu_mms_lock) {
list_del(&iommu_mm->mm_list_elm);
if (list_empty(&iommu_sva_mms))
static_branch_disable(&iommu_sva_present);
}
- }
- mutex_unlock(&iommu_sva_lock); kfree(handle); }
This seems an odd coding style choice; why the extra unneeded indentation? That is, what's wrong with:
if (list_empty()) { guard(spinlock_irqsave)(&iommu_mms_lock); list_del(); if (list_empty() static_branch_disable(); }
Perhaps I overlooked or misunderstood something, but my understanding is,
The lock order in this function is:
	mutex_lock(&iommu_sva_lock);
	spin_lock(&iommu_mms_lock);
	spin_unlock(&iommu_mms_lock);
	mutex_unlock(&iommu_sva_lock);
With above change, it is changed to:
	mutex_lock(&iommu_sva_lock);
	spin_lock(&iommu_mms_lock);
	mutex_unlock(&iommu_sva_lock);
	spin_unlock(&iommu_mms_lock);
guard() follows the scope of the variable declaration. Actually, the above is a perfect match for the example in cleanup.h:
 * The lifetime of the lock obtained by the guard() helper follows the
 * scope of automatic variable declaration. Take the following example::
 *
 *	func(...)
 *	{
 *		if (...) {
 *			...
 *			guard(pci_dev)(dev); // pci_dev_lock() invoked here
 *			...
 *		} // <- implied pci_dev_unlock() triggered here
 *	}
 *
 * Observe the lock is held for the remainder of the "if ()" block not
 * the remainder of "func()".
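Applied to iommu_sva_unbind_device(), that scoping rule means the spinlock is still released before the mutex; roughly (a sketch of the suggestion above, setting aside for a moment the separate problem of calling static_branch_disable() under a spinlock):

	mutex_lock(&iommu_sva_lock);
	...
	if (list_empty(&iommu_mm->sva_domains)) {
		guard(spinlock_irqsave)(&iommu_mms_lock);
		list_del(&iommu_mm->mm_list_elm);
		if (list_empty(&iommu_sva_mms))
			static_branch_disable(&iommu_sva_present);
	}	/* <- spinlock released here, at the end of the if-block scope */
	mutex_unlock(&iommu_sva_lock);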
On Fri, Jul 11, 2025 at 11:00:06AM +0800, Baolu Lu wrote:
Hi Peter Z,
On 7/10/25 21:54, Peter Zijlstra wrote:
On Wed, Jul 09, 2025 at 02:28:00PM +0800, Lu Baolu wrote:
The vmalloc() and vfree() functions manage virtually contiguous, but not necessarily physically contiguous, kernel memory regions. When vfree() unmaps such a region, it tears down the associated kernel page table entries and frees the physical pages.
In the IOMMU Shared Virtual Addressing (SVA) context, the IOMMU hardware shares and walks the CPU's page tables. Architectures like x86 share static kernel address mappings across all user page tables, allowing the IOMMU to access the kernel portion of these tables.
Modern IOMMUs often cache page table entries to optimize walk performance, even for intermediate page table levels. If kernel page table mappings are changed (e.g., by vfree()), but the IOMMU's internal caches retain stale entries, Use-After-Free (UAF) vulnerability condition arises. If these freed page table pages are reallocated for a different purpose, potentially by an attacker, the IOMMU could misinterpret the new data as valid page table entries. This allows the IOMMU to walk into attacker-controlled memory, leading to arbitrary physical memory DMA access or privilege escalation.
To mitigate this, introduce a new iommu interface to flush IOMMU caches and fence pending page table walks when kernel page mappings are updated. This interface should be invoked from architecture-specific code that manages combined user and kernel page tables.
I must say I liked the kPTI based idea better. Having to iterate and invalidate an unspecified number of IOMMUs from non-preemptible context seems 'unfortunate'.
The cache invalidation path in IOMMU drivers is already critical and operates within a non-preemptible context. This approach is, in fact, already utilized for user-space page table updates since the beginning of SVA support.
OK, fair enough I suppose. What kind of delays are we talking about here? The fact that you basically have an unbounded list of IOMMUs (although in practice I suppose it is limited by the amount of GPUs and other fancy stuff you can stick in your machine) does slightly worry me.

At some point the low-latency folks are going to come hunting you down. Do you have a plan for how to deal with this; or are we throwing up our hands and saying: the hardware sucks, deal with it?
On Fri, Jul 11, 2025 at 10:32:52AM +0200, Peter Zijlstra wrote:
At some point the low latency folks are going to come hunting you down. Do you have a plan on how to deal with this; or are we throwing up our hands an say, the hardware sucks, deal with it?
If you turn on SVA in a process then all this same invalidation work happens under someone's spinlock even today. If it becomes a latency problem for someone then it seems to be much bigger than just this bit.
Most likely the answer is that SVA and realtime are not compatible ideas. I know I've had this conversation with some of our realtime people and they have said they don't want to touch SVA because it makes the device latencies indeterminate with possible faults.
Jason
On Fri, Jul 11, 2025 at 11:00:06AM +0800, Baolu Lu wrote:
+void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) +{
- struct iommu_mm_data *iommu_mm;
- if (!static_branch_unlikely(&iommu_sva_present))
return;
- guard(spinlock_irqsave)(&iommu_mms_lock);
- list_for_each_entry(iommu_mm, &iommu_sva_mms, mm_list_elm)
mmu_notifier_arch_invalidate_secondary_tlbs(iommu_mm->mm, start, end);
+}
This is absolutely the wrong way to use static_branch. You want them in inline functions guarding the function call, not inside the function call.
I don't think a static branch is desirable here, as we have no idea how often the condition will switch in real-world scenarios. I will remove it in the next version if there are no objections.
The point of the static branch was to make the 99% normal case where SVA has never been used have no-cost in the core MM. I think we should keep that idea
Jason