Hello Jann,
On Fri, 20 Jun 2025 23:33:31 +0200, Jann Horn wrote:
Currently, __split_vma() triggers hugetlb page table unsharing through vm_ops->may_split(). This happens before the VMA lock and rmap locks are taken - which is too early, it allows racing VMA-locked page faults in our process and racing rmap walks from other processes to cause page tables to be shared again before we actually perform the split.
Fix it by explicitly calling into the hugetlb unshare logic from __split_vma() in the same place where THP splitting also happens. At that point, both the VMA and the rmap(s) are write-locked.
An annoying detail is that we can now call into the helper hugetlb_unshare_pmds() from two different locking contexts:
- from hugetlb_split(), holding:
- mmap lock (exclusively)
- VMA lock
- file rmap lock (exclusively)
- hugetlb_unshare_all_pmds(), which I think is designed to be able to call us with only the mmap lock held (in shared mode), but currently only runs while holding mmap lock (exclusively) and VMA lock
Backporting note: This commit fixes a racy protection that was introduced in commit b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that commit claimed to fix an issue introduced in 5.13, but it should actually also go all the way back.
[jannh@google.com: v2]
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a...
Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a...
Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a...
Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page")
Signed-off-by: Jann Horn <jannh@google.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org> [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs]
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[stable backport: code got moved around, VMA splitting is in __vma_adjust]
Signed-off-by: Jann Horn <jannh@google.com>
 include/linux/hugetlb.h |  3 +++
 mm/hugetlb.c            | 60 ++++++++++++++++++++++++++++++-----------
 mm/mmap.c               |  8 ++++++
 3 files changed, 55 insertions(+), 16 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index cc555072940f..26f2947c399d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -239,6 +239,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 bool is_hugetlb_entry_migration(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -472,6 +473,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
 
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 /*
  * hugepages at page global directory. If arch support
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 14b9494c58ed..fc5d3d665266 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,7 +95,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
+		unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
@@ -4900,26 +4900,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
 		return -EINVAL;
+	return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
 	/*
 	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
 	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
 	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+	 * This function is called in the middle of a VMA split operation, with
+	 * MM, VMA and rmap all write-locked to prevent concurrent page table
+	 * walks (except hardware and gup_fast()).
 	 */
+	mmap_assert_write_locked(vma->vm_mm);
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
The above i_mmap lock assertion fires on stable kernels from 5.10 through 6.1 inclusive.
------------[ cut here ]------------
WARNING: CPU: 0 PID: 11489 at include/linux/fs.h:503 i_mmap_assert_write_locked include/linux/fs.h:503 [inline]
WARNING: CPU: 0 PID: 11489 at include/linux/fs.h:503 hugetlb_split+0x267/0x300 mm/hugetlb.c:4917
Modules linked in:
CPU: 0 PID: 11489 Comm: syz-executor.4 Not tainted 6.1.142-syzkaller-00296-gfd0df5221577 #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014
RIP: 0010:i_mmap_assert_write_locked include/linux/fs.h:503 [inline]
RIP: 0010:hugetlb_split+0x267/0x300 mm/hugetlb.c:4917
Call Trace:
 <TASK>
 __vma_adjust+0xd73/0x1c10 mm/mmap.c:736
 vma_adjust include/linux/mm.h:2745 [inline]
 __split_vma+0x459/0x540 mm/mmap.c:2385
 do_mas_align_munmap+0x5f2/0xf10 mm/mmap.c:2497
 do_mas_munmap+0x26c/0x2c0 mm/mmap.c:2646
 __mmap_region mm/mmap.c:2694 [inline]
 mmap_region+0x19f/0x1770 mm/mmap.c:2912
 do_mmap+0x84b/0xf20 mm/mmap.c:1432
 vm_mmap_pgoff+0x1af/0x280 mm/util.c:520
 ksys_mmap_pgoff+0x41f/0x5a0 mm/mmap.c:1478
 do_syscall_x64 arch/x86/entry/common.c:51 [inline]
 do_syscall_64+0x35/0x80 arch/x86/entry/common.c:81
 entry_SYSCALL_64_after_hwframe+0x6e/0xd8
RIP: 0033:0x46a269
 </TASK>
Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
The main reason is that those branches lack the following commit:
commit ccf1d78d8b86e28502fa1b575a459a402177def4
Author: Suren Baghdasaryan <surenb@google.com>
Date:   Mon Feb 27 09:36:13 2023 -0800

    mm/mmap: move vma_prepare before vma_adjust_trans_huge

    vma_prepare() acquires all locks required before VMA modifications.
    Move vma_prepare() before vma_adjust_trans_huge() so that VMA is locked
    before any modification.

    Link: https://lkml.kernel.org/r/20230227173632.3292573-15-surenb@google.com
    Signed-off-by: Suren Baghdasaryan <surenb@google.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Thus, in those branches the needed i_mmap lock is only acquired after vma_adjust_trans_huge() and the newly added hugetlb_split() have already run.
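To illustrate, the ordering in 6.1.y's __vma_adjust() is roughly the following (a heavily simplified sketch, not the exact source), which is why the new assertion trips:

	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
	if (is_vm_hugetlb_page(orig_vma)) {
		hugetlb_split(orig_vma, start);	/* asserts the i_mmap rmap lock is held... */
		hugetlb_split(orig_vma, end);
	}

	if (file) {
		mapping = file->f_mapping;
		...
		i_mmap_lock_write(mapping);	/* ...but it is only taken here */
		...
	}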
Please have a look at the straightforward write-up below. It does something similar to ccf1d78d8b86 ("mm/mmap: move vma_prepare before vma_adjust_trans_huge"), but in the context of an old stable branch.
If it looks okay, I'll be glad to prepare it as a formal patch and send it out for 5.10-5.15, too.
against 6.1.y
-------------

diff --git a/mm/mmap.c b/mm/mmap.c
index 0f303dc8425a..941880ed62d7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -543,8 +543,6 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
 	if (mas_preallocate(mas, vma, GFP_KERNEL))
 		goto nomem;
 
-	vma_adjust_trans_huge(vma, start, end, 0);
-
 	if (file) {
 		mapping = file->f_mapping;
 		root = &mapping->i_mmap;
@@ -562,6 +560,8 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
 		vma_interval_tree_remove(vma, root);
 	}
 
+	vma_adjust_trans_huge(vma, start, end, 0);
+
 	vma->vm_start = start;
 	vma->vm_end = end;
 	vma->vm_pgoff = pgoff;
@@ -727,15 +727,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 			return -ENOMEM;
 	}
 
-	/*
-	 * Get rid of huge pages and shared page tables straddling the split
-	 * boundary.
-	 */
-	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
-	if (is_vm_hugetlb_page(orig_vma)) {
-		hugetlb_split(orig_vma, start);
-		hugetlb_split(orig_vma, end);
-	}
 	if (file) {
 		mapping = file->f_mapping;
 		root = &mapping->i_mmap;
@@ -775,6 +766,16 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 			vma_interval_tree_remove(next, root);
 	}
 
+	/*
+	 * Get rid of huge pages and shared page tables straddling the split
+	 * boundary.
+	 */
+	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+	if (is_vm_hugetlb_page(orig_vma)) {
+		hugetlb_split(orig_vma, start);
+		hugetlb_split(orig_vma, end);
+	}
+
 	if (start != vma->vm_start) {
 		if ((vma->vm_start < start) &&
 		    (!insert || (insert->vm_end != start))) {
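With that reordering, the flow in __vma_adjust() becomes (again, a simplified sketch):

	if (file) {
		mapping = file->f_mapping;
		...
		i_mmap_lock_write(mapping);	/* rmap write lock taken first */
		...
	}
	...
	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
	if (is_vm_hugetlb_page(orig_vma)) {
		hugetlb_split(orig_vma, start);	/* assertions are now satisfied */
		hugetlb_split(orig_vma, end);
	}

so hugetlb_split() runs with the i_mmap lock held, matching what the mainline ordering expects.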
--
Fedor
 	if (addr & ~PUD_MASK) {
 		/*
 		 * hugetlb_vm_op_split is called right before we attempt to
 		 * split the VMA. We will need to unshare PMDs in the old and
 		 * new VMAs, so let's unshare before we split.
 		 */
 		unsigned long floor = addr & PUD_MASK;
 		unsigned long ceil = floor + PUD_SIZE;
 
-		if (floor >= vma->vm_start && ceil <= vma->vm_end)
-			hugetlb_unshare_pmds(vma, floor, ceil);
+		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+			/*
+			 * Locking:
+			 * Use take_locks=false here.
+			 * The file rmap lock is already held.
+			 * The hugetlb VMA lock can't be taken when we already
+			 * hold the file rmap lock, and we don't need it because
+			 * its purpose is to synchronize against concurrent page
+			 * table walks, which are not possible thanks to the
+			 * locks held by our caller.
+			 */
+			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+		}
 	}
-	return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7495,9 +7509,16 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
 	}
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 				unsigned long start,
-				unsigned long end)
+				unsigned long end,
+				bool take_locks)
 {
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -7521,8 +7542,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, end);
 	mmu_notifier_invalidate_range_start(&range);
-	hugetlb_vma_lock_write(vma);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (take_locks) {
+		hugetlb_vma_lock_write(vma);
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	}
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = huge_pte_offset(mm, address, sz);
 		if (!ptep)
@@ -7532,8 +7557,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	hugetlb_vma_unlock_write(vma);
+	if (take_locks) {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 	/*
 	 * No need to call mmu_notifier_invalidate_range(), see
 	 * Documentation/mm/mmu_notifier.rst.
@@ -7548,7 +7575,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+			/* take_locks = */ true);
 }
 
 #ifdef CONFIG_CMA
diff --git a/mm/mmap.c b/mm/mmap.c
index ebc3583fa612..0f303dc8425a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -727,7 +727,15 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 			return -ENOMEM;
 	}
 
+	/*
+	 * Get rid of huge pages and shared page tables straddling the split
+	 * boundary.
+	 */
 	vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+	if (is_vm_hugetlb_page(orig_vma)) {
+		hugetlb_split(orig_vma, start);
+		hugetlb_split(orig_vma, end);
+	}
 
 	if (file) {
 		mapping = file->f_mapping;
 		root = &mapping->i_mmap;
--
2.50.0.rc2.701.gf1e915cc24-goog