From: Baolu Lu <baolu.lu@linux.intel.com>
Sent: Monday, August 18, 2025 2:22 PM
On 8/8/25 10:57, Tian, Kevin wrote:
From: Jason Gunthorpe <jgg@nvidia.com>
Sent: Friday, August 8, 2025 3:52 AM
On Thu, Aug 07, 2025 at 10:40:39PM +0800, Baolu Lu wrote:
+static void kernel_pte_work_func(struct work_struct *work)
+{
+	struct page *page, *next;
+
+	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_for_each_entry_safe(page, next, &kernel_pte_work.list, lru) {
+		list_del_init(&page->lru);
Please don't add new usages of lru, we are trying to get rid of this. :(
I think the memory should be struct ptdesc, use that..
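Something like the below, perhaps - a sketch assuming kernel_pte_work keeps the same shape, just queuing ptdescs via pt_list instead of pages via lru (this is essentially where the draft later in this thread ends up):

static void kernel_pte_work_func(struct work_struct *work)
{
	struct ptdesc *ptdesc, *next;

	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);

	guard(spinlock)(&kernel_pte_work.lock);
	list_for_each_entry_safe(ptdesc, next, &kernel_pte_work.list, pt_list) {
		list_del(&ptdesc->pt_list);
		pagetable_dtor_free(ptdesc);
	}
}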
btw, with this change we should also defer the free of the pmd page:
pud_free_pmd_page()
...
	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd_sv[i])) {
			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
			pte_free_kernel(&init_mm, pte);
		}
	}

	free_page((unsigned long)pmd_sv);
Otherwise the risk still exists if the pmd page is repurposed before the pte work is scheduled.
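For reference, pud_free_pmd_page() then frees the pmd page itself synchronously at the end (from arch/x86/mm/pgtable.c, abbreviated from memory):

	pmd_free(&init_mm, pmd);

	return 1;
}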
My understanding is that you were talking about pmd_free(), correct?
yes
It appears that both pmd_free() and pte_free_kernel() call pagetable_dtor_free(). If so, how about the following change?
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index dbddacdca2ce..04f6b5bc212c 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -172,10 +172,8 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad
 #ifndef __HAVE_ARCH_PMD_FREE
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-	struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
-
 	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-	pagetable_dtor_free(ptdesc);
+	pte_free_kernel(mm, (pte_t *)pmd);
 }
 
 #endif
better to rename pte_free_kernel_async() to pagetable_free_kernel_async() and call it directly here, like you did in pte_free_kernel(). Otherwise it's a bit weird to call a pte helper for a pmd.
accordingly, do we need a new helper pmd_free_kernel()? Currently there is no restriction that pmd_free() is only called on kernel entries, e.g. mop_up_one_pmd() (only in PAE and KPTI), destroy_args() under CONFIG_DEBUG_VM_PGTABLE, etc.
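i.e. something like below (pagetable_free_kernel_async() and pmd_free_kernel() are just the hypothetical names suggested above, not existing helpers):

static inline void pmd_free_kernel(struct mm_struct *mm, pmd_t *pmd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pmd);

	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
	if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
		pagetable_free_kernel_async(ptdesc);	/* renamed pte_free_kernel_async() */
	else
		pagetable_dtor_free(ptdesc);
}

with pmd_free() left as-is for the non-kernel callers.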
another observation - pte_free_kernel() is not used in remove_pagetable() and __change_page_attr(). Is it straightforward to put it in those paths, or do we need to duplicate some deferring logic there?
In both of these cases, pages in an array or list require deferred freeing. I don't believe the previous approach, which calls pagetable_dtor_free(), will still work here.
this is the part which I don't quite understand. Is there a guarantee that page tables removed in those paths are not allocated with any pagetable_ctor helpers? Otherwise some state might be broken without a proper pagetable_dtor().
I know little here, so I've likely missed some basic context.
Perhaps a separate list is needed? What about something like the following?
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 76e33bd7c556..2e887463c165 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1013,7 +1013,10 @@ static void __meminit free_pagetable(struct page *page, int order)
 		free_reserved_pages(page, nr_pages);
 #endif
 	} else {
-		free_pages((unsigned long)page_address(page), order);
+		if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE))
+			kernel_pgtable_free_pages_async(page, order);
+		else
+			free_pages((unsigned long)page_address(page),
+				   order);
 	}
 }
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 8834c76f91c9..7e567bdfddce 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -436,9 +436,13 @@ static void cpa_collapse_large_pages(struct cpa_data *cpa)
 
 	flush_tlb_all();
 
-	list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
-		list_del(&ptdesc->pt_list);
-		__free_page(ptdesc_page(ptdesc));
+	if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE)) {
+		kernel_pgtable_free_list_async(&pgtables);
+	} else {
+		list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
+			list_del(&ptdesc->pt_list);
+			__free_page(ptdesc_page(ptdesc));
+		}
 	}
 }
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 4938eff5b482..13bbe1588872 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -48,8 +48,12 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 
 #ifdef CONFIG_ASYNC_PGTABLE_FREE
 void pte_free_kernel_async(struct ptdesc *ptdesc);
+void kernel_pgtable_free_list_async(struct list_head *list);
+void kernel_pgtable_free_pages_async(struct page *pages, int order);
 #else
 static inline void pte_free_kernel_async(struct ptdesc *ptdesc) {}
+static inline void kernel_pgtable_free_list_async(struct list_head *list) {}
+static inline void kernel_pgtable_free_pages_async(struct page *pages, int order) {}
 #endif
 
 /**
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index b9a785dd6b8d..d1d19132bbe8 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -413,6 +413,7 @@ static void kernel_pte_work_func(struct work_struct *work);
 
 struct {
 	struct list_head list;
+	struct list_head pages;
the real difference between the two lists is whether pagetable_dtor_free() or __free_page() is used to free the entries. Then it would be clearer to call them 'pages_dtor' and 'pages'?
 	spinlock_t lock;
 	struct work_struct work;
 } kernel_pte_work = {
@@ -425,17 +426,24 @@ static void kernel_pte_work_func(struct work_struct *work)
 {
 	struct ptdesc *ptdesc, *next;
 	LIST_HEAD(free_list);
+	LIST_HEAD(pages_list);
 
 	iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
 
 	spin_lock(&kernel_pte_work.lock);
 	list_move(&kernel_pte_work.list, &free_list);
+	list_move(&kernel_pte_work.pages, &pages_list);
 	spin_unlock(&kernel_pte_work.lock);
 
 	list_for_each_entry_safe(ptdesc, next, &free_list, pt_list) {
 		list_del(&ptdesc->pt_list);
 		pagetable_dtor_free(ptdesc);
 	}
+
+	list_for_each_entry_safe(ptdesc, next, &pages_list, pt_list) {
+		list_del(&ptdesc->pt_list);
+		__free_page(ptdesc_page(ptdesc));
+	}
 }
 
 void pte_free_kernel_async(struct ptdesc *ptdesc)
@@ -444,4 +452,25 @@ void pte_free_kernel_async(struct ptdesc *ptdesc)
 	list_add(&ptdesc->pt_list, &kernel_pte_work.list);
 	schedule_work(&kernel_pte_work.work);
 }
+
+void kernel_pgtable_free_list_async(struct list_head *list)
+{
+	guard(spinlock)(&kernel_pte_work.lock);
+	list_splice_tail(list, &kernel_pte_work.pages);
+	schedule_work(&kernel_pte_work.work);
+}
+
+void kernel_pgtable_free_pages_async(struct page *pages, int order)
+{
+	unsigned long nr_pages = 1 << order;
+	struct ptdesc *ptdesc;
+	int i;
+
+	guard(spinlock)(&kernel_pte_work.lock);
+	for (i = 0; i < nr_pages; i++) {
+		ptdesc = page_ptdesc(&pages[i]);
+		list_add(&ptdesc->pt_list, &kernel_pte_work.pages);
+	}
there is a side-effect here. Now a contiguous range of pages (order=N) is freed one by one, so the pages are first fed back to the order=0 free list, with the possibility of being split by small-order allocations before they have a chance to be merged back into the order=N free list. The number of available huge pages is then unnecessarily reduced (one way to keep the block whole is sketched after the diff below).
but looking at the code, we probably don't need to handle the order>0 case at all?
now only free_hugepage_table() may free a huge page, called from remove_pmd_table() when a pmd is a *leaf* entry pointing to a vmemmap huge page. So it's not a real page table. I'm not sure why it's piggybacked on a pagetable helper, but it sounds like a case we can ignore in this mitigation...
+
+	schedule_work(&kernel_pte_work.work);
+}
 #endif
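For completeness, if the order>0 case did turn out to need deferring, one way to avoid the splitting side-effect noted above would be to queue only the head page and record the order, then free the block whole from the work function. A rough sketch on top of the draft above (reusing page->private to carry the order; hypothetical, not part of any posted patch):

void kernel_pgtable_free_pages_async(struct page *pages, int order)
{
	struct ptdesc *ptdesc = page_ptdesc(pages);

	/* remember the order so the work func can free the block whole */
	set_page_private(pages, order);

	guard(spinlock)(&kernel_pte_work.lock);
	list_add(&ptdesc->pt_list, &kernel_pte_work.pages);
	schedule_work(&kernel_pte_work.work);
}

and correspondingly in kernel_pte_work_func():

	list_for_each_entry_safe(ptdesc, next, &pages_list, pt_list) {
		struct page *page = ptdesc_page(ptdesc);

		list_del(&ptdesc->pt_list);
		free_pages((unsigned long)page_address(page),
			   page_private(page));
	}

This returns the order=N block to the allocator intact instead of page by page.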