On 8/8/25 10:57, Tian, Kevin wrote:
From: Jason Gunthorpe jgg@nvidia.com Sent: Friday, August 8, 2025 3:52 AM
On Thu, Aug 07, 2025 at 10:40:39PM +0800, Baolu Lu wrote:
+static void kernel_pte_work_func(struct work_struct *work) +{
- struct page *page, *next;
- iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
- guard(spinlock)(&kernel_pte_work.lock);
- list_for_each_entry_safe(page, next, &kernel_pte_work.list, lru) {
list_del_init(&page->lru);
Please don't add new usages of lru, we are trying to get rid of this. :(
I think the memory should be struct ptdesc, use that..
btw with this change we should also defer free of the pmd page:
pud_free_pmd_page() ... for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); pte_free_kernel(&init_mm, pte); } }
free_page((unsigned long)pmd_sv);
Otherwise the risk still exists if the pmd page is repurposed before the pte work is scheduled.
My understanding is that you were talking about pmd_free(), correct? It appears that both pmd_free() and pte_free_kernel() call pagetable_dtor_free(). If so, how about the following change?
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index dbddacdca2ce..04f6b5bc212c 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -172,10 +172,8 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad #ifndef __HAVE_ARCH_PMD_FREE static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { - struct ptdesc *ptdesc = virt_to_ptdesc(pmd); - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pagetable_dtor_free(ptdesc); + pte_free_kernel(mm, (pte_t *)pmd); }
another observation - pte_free_kernel is not used in remove_pagetable () and __change_page_attr(). Is it straightforward to put it in those paths or do we need duplicate some deferring logic there?
In both of these cases, pages in an array or list require deferred freeing. I don't believe the previous approach, which calls pagetable_dtor_free(), will still work here. Perhaps a separate list is needed? What about something like the following?
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 76e33bd7c556..2e887463c165 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1013,7 +1013,10 @@ static void __meminit free_pagetable(struct page *page, int order) free_reserved_pages(page, nr_pages); #endif } else { - free_pages((unsigned long)page_address(page), order); + if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE)) + kernel_pgtable_free_pages_async(page, order); + else + free_pages((unsigned long)page_address(page), order); } }
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 8834c76f91c9..7e567bdfddce 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -436,9 +436,13 @@ static void cpa_collapse_large_pages(struct cpa_data *cpa)
flush_tlb_all();
- list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { - list_del(&ptdesc->pt_list); - __free_page(ptdesc_page(ptdesc)); + if (IS_ENABLED(CONFIG_ASYNC_PGTABLE_FREE)) { + kernel_pgtable_free_list_async(&pgtables); + } else { + list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { + list_del(&ptdesc->pt_list); + __free_page(ptdesc_page(ptdesc)); + } } }
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 4938eff5b482..13bbe1588872 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -48,8 +48,12 @@ static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
#ifdef CONFIG_ASYNC_PGTABLE_FREE void pte_free_kernel_async(struct ptdesc *ptdesc); +void kernel_pgtable_free_list_async(struct list_head *list); +void kernel_pgtable_free_pages_async(struct page *pages, int order); #else static inline void pte_free_kernel_async(struct ptdesc *ptdesc) {} +static inline void kernel_pgtable_free_list_async(struct list_head *list) {} +void kernel_pgtable_free_pages_async(struct page *pages, int order) {} #endif
/** diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index b9a785dd6b8d..d1d19132bbe8 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -413,6 +413,7 @@ static void kernel_pte_work_func(struct work_struct *work);
struct { struct list_head list; + struct list_head pages; spinlock_t lock; struct work_struct work; } kernel_pte_work = { @@ -425,17 +426,24 @@ static void kernel_pte_work_func(struct work_struct *work) { struct ptdesc *ptdesc, *next; LIST_HEAD(free_list); + LIST_HEAD(pages_list);
iommu_sva_invalidate_kva_range(0, TLB_FLUSH_ALL);
spin_lock(&kernel_pte_work.lock); list_move(&kernel_pte_work.list, &free_list); + list_move(&kernel_pte_work.pages, &pages_list); spin_unlock(&kernel_pte_work.lock);
list_for_each_entry_safe(ptdesc, next, &free_list, pt_list) { list_del(&ptdesc->pt_list); pagetable_dtor_free(ptdesc); } + + list_for_each_entry_safe(ptdesc, next, &pages_list, pt_list) { + list_del(&ptdesc->pt_list); + __free_page(ptdesc_page(ptdesc)); + } }
void pte_free_kernel_async(struct ptdesc *ptdesc) @@ -444,4 +452,25 @@ void pte_free_kernel_async(struct ptdesc *ptdesc) list_add(&ptdesc->pt_list, &kernel_pte_work.list); schedule_work(&kernel_pte_work.work); } + +void kernel_pgtable_free_list_async(struct list_head *list) +{ + guard(spinlock)(&kernel_pte_work.lock); + list_splice_tail(list, &kernel_pte_work.pages); + schedule_work(&kernel_pte_work.work); +} + +void kernel_pgtable_free_pages_async(struct page *pages, int order) +{ + unsigned long nr_pages = 1 << order; + struct ptdesc *ptdesc; + int i; + + guard(spinlock)(&kernel_pte_work.lock); + for (i = 0; i < nr_pages; i++) { + ptdesc = page_ptdesc(&pages[i]); + list_add(&ptdesc->pt_list, &kernel_pte_work.pages); + } + schedule_work(&kernel_pte_work.work); +} #endif
Thanks, baolu