5.15-stable review patch. If anyone has any objections, please let me know.
------------------
From: Jann Horn <jannh@google.com>
commit 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 upstream.
Currently, __split_vma() triggers hugetlb page table unsharing through vm_ops->may_split(). This happens before the VMA lock and rmap locks are taken - which is too early, it allows racing VMA-locked page faults in our process and racing rmap walks from other processes to cause page tables to be shared again before we actually perform the split.
Fix it by explicitly calling into the hugetlb unshare logic from __split_vma() in the same place where THP splitting also happens. At that point, both the VMA and the rmap(s) are write-locked.
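In this 5.15 backport the split goes through __vma_adjust() in mm/mmap.c rather than __split_vma(); as a rough sketch of where the new hook sits (condensed from the mm/mmap.c hunk below, not a literal excerpt):

        vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
        if (is_vm_hugetlb_page(orig_vma)) {
                /* unshare PMDs straddling both prospective split boundaries */
                hugetlb_split(orig_vma, start);
                hugetlb_split(orig_vma, end);
        }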
An annoying detail is that we can now call into the helper hugetlb_unshare_pmds() from two different locking contexts:
1. from hugetlb_split(), holding:
    - mmap lock (exclusively)
    - VMA lock
    - file rmap lock (exclusively)
2. hugetlb_unshare_all_pmds(), which I think is designed to be able to
   call us with only the mmap lock held (in shared mode), but currently
   only runs while holding mmap lock (exclusively) and VMA lock
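To cope with both contexts, hugetlb_unshare_pmds() grows a take_locks argument; a condensed sketch of that dispatch (taken from the mm/hugetlb.c hunk below, simplified):

        if (take_locks) {
                i_mmap_lock_write(vma->vm_file->f_mapping);
        } else {
                /* caller (hugetlb_split()) already holds the file rmap lock */
                i_mmap_assert_write_locked(vma->vm_file->f_mapping);
        }
        /* ... walk the range PUD_SIZE-wise and unshare ... */
        if (take_locks)
                i_mmap_unlock_write(vma->vm_file->f_mapping);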
Backporting note: This commit fixes a racy protection that was introduced in commit b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that commit claimed to fix an issue introduced in 5.13, but it should actually also go all the way back.
[jannh@google.com: v2]
  Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a...
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a...
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a...
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>  [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[stable backport: code got moved around, VMA splitting is in __vma_adjust, hugetlb lock wasn't used back then]
Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/hugetlb.h |    3 ++
 mm/hugetlb.c            |   56 ++++++++++++++++++++++++++++++++++++------------
 mm/mmap.c               |    8 ++++++
 3 files changed, 53 insertions(+), 14 deletions(-)
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -213,6 +213,7 @@ unsigned long hugetlb_change_protection(
 
 bool is_hugetlb_entry_migration(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -409,6 +410,8 @@ static inline vm_fault_t hugetlb_fault(s
 
 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
 
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 /*
  * hugepages at page global directory. If arch support
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -83,7 +83,7 @@ struct mutex *hugetlb_fault_mutex_table
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-                unsigned long start, unsigned long end);
+                unsigned long start, unsigned long end, bool take_locks);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -4165,26 +4165,40 @@ static int hugetlb_vm_op_split(struct vm
 {
         if (addr & ~(huge_page_mask(hstate_vma(vma))))
                 return -EINVAL;
+        return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
         /*
          * PMD sharing is only possible for PUD_SIZE-aligned address ranges
          * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
          * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+         * This function is called in the middle of a VMA split operation, with
+         * MM, VMA and rmap all write-locked to prevent concurrent page table
+         * walks (except hardware and gup_fast()).
          */
+        mmap_assert_write_locked(vma->vm_mm);
+        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
         if (addr & ~PUD_MASK) {
-                /*
-                 * hugetlb_vm_op_split is called right before we attempt to
-                 * split the VMA. We will need to unshare PMDs in the old and
-                 * new VMAs, so let's unshare before we split.
-                 */
                 unsigned long floor = addr & PUD_MASK;
                 unsigned long ceil = floor + PUD_SIZE;
 
-                if (floor >= vma->vm_start && ceil <= vma->vm_end)
-                        hugetlb_unshare_pmds(vma, floor, ceil);
+                if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+                        /*
+                         * Locking:
+                         * Use take_locks=false here.
+                         * The file rmap lock is already held.
+                         * The hugetlb VMA lock can't be taken when we already
+                         * hold the file rmap lock, and we don't need it because
+                         * its purpose is to synchronize against concurrent page
+                         * table walks, which are not possible thanks to the
+                         * locks held by our caller.
+                         */
+                        hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+                }
         }
-
-        return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -6369,9 +6383,16 @@ void move_hugetlb_state(struct page *old
         }
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                                 unsigned long start,
-                                unsigned long end)
+                                unsigned long end,
+                                bool take_locks)
 {
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
@@ -6394,7 +6415,11 @@ static void hugetlb_unshare_pmds(struct
         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                 start, end);
         mmu_notifier_invalidate_range_start(&range);
-        i_mmap_lock_write(vma->vm_file->f_mapping);
+        if (take_locks) {
+                i_mmap_lock_write(vma->vm_file->f_mapping);
+        } else {
+                i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+        }
         for (address = start; address < end; address += PUD_SIZE) {
                 unsigned long tmp = address;
 
@@ -6407,7 +6432,9 @@ static void hugetlb_unshare_pmds(struct
                 spin_unlock(ptl);
         }
         flush_hugetlb_tlb_range(vma, start, end);
-        i_mmap_unlock_write(vma->vm_file->f_mapping);
+        if (take_locks) {
+                i_mmap_unlock_write(vma->vm_file->f_mapping);
+        }
         /*
          * No need to call mmu_notifier_invalidate_range(), see
          * Documentation/vm/mmu_notifier.rst.
@@ -6422,7 +6449,8 @@ static void hugetlb_unshare_pmds(struct
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
         hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-                        ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+                        ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+                        /* take_locks = */ true);
 }
 
 #ifdef CONFIG_CMA
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -833,7 +833,15 @@ int __vma_adjust(struct vm_area_struct *
                 }
         }
 again:
+        /*
+         * Get rid of huge pages and shared page tables straddling the split
+         * boundary.
+         */
         vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+        if (is_vm_hugetlb_page(orig_vma)) {
+                hugetlb_split(orig_vma, start);
+                hugetlb_split(orig_vma, end);
+        }
 
         if (file) {
                 mapping = file->f_mapping;