During our internal testing, we started observing intermittent boot failures when the machine uses 4-level paging and has a large amount of persistent memory:
BUG: unable to handle page fault for address: ffffe70000000034
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 0 P4D 0
Oops: 0002 [#1] SMP NOPTI
RIP: 0010:__init_single_page+0x9/0x6d
Call Trace:
 <TASK>
 __init_zone_device_page+0x17/0x5d
 memmap_init_zone_device+0x154/0x1bb
 pagemap_range+0x2e0/0x40f
 memremap_pages+0x10b/0x2f0
 devm_memremap_pages+0x1e/0x60
 dev_dax_probe+0xce/0x2ec [device_dax]
 dax_bus_probe+0x6d/0xc9
 [... snip ...]
 </TASK>
It turns out that the kernel panics while initializing the vmemmap (the struct page array) when the vmemmap region spans two PGD entries, because the new PGD entry is installed only in init_mm.pgd and not in the page tables of other tasks.
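For context (background, not part of the patch): on x86-64, the kernel half of a task's PGD is a snapshot of init_mm.pgd taken once, when the task's page tables are allocated. Roughly, as an illustrative sketch loosely modeled on pgd_ctor() / clone_pgd_range() in arch/x86/mm/pgtable.c:

  /* Illustrative sketch only; the real code lives in pgd_ctor(). */
  static void pgd_ctor_sketch(pgd_t *task_pgd)
  {
  	/* Copy the kernel PGD entries that exist right now. */
  	memcpy(task_pgd + KERNEL_PGD_BOUNDARY,
  	       init_mm.pgd + KERNEL_PGD_BOUNDARY,
  	       KERNEL_PGD_PTRS * sizeof(pgd_t));
  }

Any kernel PGD entry installed after this snapshot (e.g. when vmemmap grows across a PGD boundary) exists only in init_mm.pgd until it is explicitly propagated to every task's PGD.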
And looking at __populate_section_memmap():

  if (vmemmap_can_optimize(altmap, pgmap))
          // does not sync top level page tables
          r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
  else
          // sync top level page tables in x86
          r = vmemmap_populate(start, end, nid, altmap);
In the normal path, vmemmap_populate() in arch/x86/mm/init_64.c synchronizes the top-level page tables (see commit 9b861528a801 ("x86-64, mem: Update all PGDs for direct mapping and vmemmap mapping changes")) so that all tasks in the system can see the new vmemmap area.
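Simplified, the normal path looks like this (paraphrased from arch/x86/mm/init_64.c; details vary across kernel versions):

  int __meminit vmemmap_populate(unsigned long start, unsigned long end,
  			       int node, struct vmem_altmap *altmap)
  {
  	int err;

  	/* ... populate the vmemmap range with huge or base pages ... */
  	err = vmemmap_populate_hugepages(start, end, node, altmap);
  	/* Propagate new top-level entries to all per-task page tables. */
  	if (!err)
  		sync_global_pgds(start, end - 1);
  	return err;
  }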
However, when vmemmap_can_optimize() returns true, the optimized path skips synchronization of top-level page tables. This is because vmemmap_populate_compound_pages() is implemented in core MM code, which does not handle synchronization of the top-level page tables. Instead, the core MM has historically relied on each architecture to perform this synchronization manually.
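Roughly, the optimized call chain is (all in mm/sparse-vmemmap.c, with no arch hook and therefore no sync):

  vmemmap_populate_compound_pages()
    -> vmemmap_populate_address()
         -> vmemmap_pgd_populate()  /* pgd_populate(&init_mm, ...) */
         -> vmemmap_p4d_populate()  /* p4d_populate(&init_mm, ...) */
         -> ...

Nothing in this chain updates the PGDs of tasks other than init_mm.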
It turns out that the current approach of relying on each arch to handle the page table sync manually is fragile, because 1) it's easy to forget to sync the top level page table, and 2) it's easy to overlook that the kernel must not access the vmemmap / direct mapping area before the sync.
As suggested by Dave Hansen, define x86_64 versions of {pgd,p4d}_populate_kernel() and arch_sync_kernel_pagetables(), and explicitly perform top-level page table synchronization in {pgd,p4d}_populate_kernel(). Top-level page tables are synchronized in pgd_populate_kernel() for 5-level paging and in p4d_populate_kernel() for 4-level paging.
arch_sync_kernel_pagetables(addr) synchronizes the top-level page table entry for the given address. It calls sync_kernel_pagetables_{l4,l5}() depending on the number of page table levels, and installs the entry in all page tables in the system so that it is visible to all tasks.
Note that sync_kernel_pagetables_{l4,l5}() are simply versions of sync_global_pgds_{l4,l5}() that synchronize only the single top-level page table entry covering the specified address, instead of all entries corresponding to a range. No functional difference between sync_global_pgds_*() and sync_kernel_pagetables_*() is intended beyond that.
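With these helpers in place, core MM callers can switch from {pgd,p4d}_populate(&init_mm, ...) to the new variants so the sync happens at population time. For example, a sketch of how mm/sparse-vmemmap.c's vmemmap_p4d_populate() could be converted (a follow-up change, not part of this patch):

  p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr,
  				       int node)
  {
  	p4d_t *p4d = p4d_offset(pgd, addr);

  	if (p4d_none(*p4d)) {
  		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);

  		if (!p)
  			return NULL;
  		/* Was p4d_populate(&init_mm, p4d, p); now also syncs. */
  		p4d_populate_kernel(addr, p4d, p);
  	}
  	return p4d;
  }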
This also fixes a crash in vmemmap_set_pmd() caused by accessing vmemmap before sync_global_pgds() [1]:
BUG: unable to handle page fault for address: ffffeb3ff1200000
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 0 P4D 0
Oops: Oops: 0002 [#1] PREEMPT SMP NOPTI
Tainted: [W]=WARN
RIP: 0010:vmemmap_set_pmd+0xff/0x230
 <TASK>
 vmemmap_populate_hugepages+0x176/0x180
 vmemmap_populate+0x34/0x80
 __populate_section_memmap+0x41/0x90
 sparse_add_section+0x121/0x3e0
 __add_pages+0xba/0x150
 add_pages+0x1d/0x70
 memremap_pages+0x3dc/0x810
 devm_memremap_pages+0x1c/0x60
 xe_devm_add+0x8b/0x100 [xe]
 xe_tile_init_noalloc+0x6a/0x70 [xe]
 xe_device_probe+0x48c/0x740 [xe]
 [... snip ...]
Cc: stable@vger.kernel.org
Fixes: 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps")
Fixes: faf1c0008a33 ("x86/vmemmap: optimize for consecutive sections in partial populated PMDs")
Closes: https://lore.kernel.org/linux-mm/20250311114420.240341-1-gwan-gyeong.mun@int... [1]
Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
---
 arch/x86/include/asm/pgalloc.h | 22 ++++++++++
 arch/x86/mm/init_64.c          | 80 ++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c88691b15f3c..d66f2db54b16 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -10,6 +10,7 @@
 #define __HAVE_ARCH_PTE_ALLOC_ONE
 #define __HAVE_ARCH_PGD_FREE
+#define __HAVE_ARCH_SYNC_KERNEL_PGTABLE
 #include <asm-generic/pgalloc.h>
 static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
@@ -114,6 +115,17 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
 	set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
 }
+void arch_sync_kernel_pagetables(unsigned long addr);
+
+static inline void p4d_populate_kernel(unsigned long addr,
+				       p4d_t *p4d, pud_t *pud)
+{
+	paravirt_alloc_pud(&init_mm, __pa(pud) >> PAGE_SHIFT);
+	set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
+	if (!pgtable_l5_enabled())
+		arch_sync_kernel_pagetables(addr);
+}
+
 static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
 {
 	paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
@@ -137,6 +149,16 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
 	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
 }
+static inline void pgd_populate_kernel(unsigned long addr,
+				       pgd_t *pgd, p4d_t *p4d)
+{
+	if (!pgtable_l5_enabled())
+		return;
+	paravirt_alloc_p4d(&init_mm, __pa(p4d) >> PAGE_SHIFT);
+	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
+	arch_sync_kernel_pagetables(addr);
+}
+
 static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
 {
 	if (!pgtable_l5_enabled())
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fdb6cab524f0..cbddbef434d5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -223,6 +223,86 @@ static void sync_global_pgds(unsigned long start, unsigned long end)
 		sync_global_pgds_l4(start, end);
 }
+static void sync_kernel_pagetables_l4(unsigned long addr)
+{
+	pgd_t *pgd_ref = pgd_offset_k(addr);
+	const p4d_t *p4d_ref;
+	struct page *page;
+
+	VM_WARN_ON_ONCE(pgtable_l5_enabled());
+	/*
+	 * With folded p4d, pgd_none() is always false, we need to
+	 * handle synchronization on p4d level.
+	 */
+	MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
+	p4d_ref = p4d_offset(pgd_ref, addr);
+
+	if (p4d_none(*p4d_ref))
+		return;
+
+	spin_lock(&pgd_lock);
+	list_for_each_entry(page, &pgd_list, lru) {
+		pgd_t *pgd;
+		p4d_t *p4d;
+		spinlock_t *pgt_lock;
+
+		pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+		p4d = p4d_offset(pgd, addr);
+		/* the pgt_lock only for Xen */
+		pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+		spin_lock(pgt_lock);
+
+		if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
+			BUG_ON(p4d_pgtable(*p4d)
+			       != p4d_pgtable(*p4d_ref));
+
+		if (p4d_none(*p4d))
+			set_p4d(p4d, *p4d_ref);
+
+		spin_unlock(pgt_lock);
+	}
+	spin_unlock(&pgd_lock);
+}
+
+static void sync_kernel_pagetables_l5(unsigned long addr)
+{
+	const pgd_t *pgd_ref = pgd_offset_k(addr);
+	struct page *page;
+
+	VM_WARN_ON_ONCE(!pgtable_l5_enabled());
+
+	if (pgd_none(*pgd_ref))
+		return;
+
+	spin_lock(&pgd_lock);
+	list_for_each_entry(page, &pgd_list, lru) {
+		pgd_t *pgd;
+		spinlock_t *pgt_lock;
+
+		pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+		/* the pgt_lock only for Xen */
+		pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+		spin_lock(pgt_lock);
+
+		if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
+			BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+
+		if (pgd_none(*pgd))
+			set_pgd(pgd, *pgd_ref);
+
+		spin_unlock(pgt_lock);
+	}
+	spin_unlock(&pgd_lock);
+}
+
+void arch_sync_kernel_pagetables(unsigned long addr)
+{
+	if (pgtable_l5_enabled())
+		sync_kernel_pagetables_l5(addr);
+	else
+		sync_kernel_pagetables_l4(addr);
+}
+
 /*
  * NOTE: This function is marked __ref because it calls __init function
  * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.