[PATCH v3] mm/hugetlb: fix a deadlock with pagecache_folio and hugetlb_fault_mutex_table - Linux-stable-mirror

28 May 2025

There is ABBA dead locking scenario happening between hugetlb_fault()
and hugetlb_wp() on the pagecache folio's lock and hugetlb global mutex,
which is reproducible with syzkaller [1]. As below stack traces reveal,
process-1 tries to take the hugetlb global mutex (A3), but with the
pagecache folio's lock hold. Process-2 took the hugetlb global mutex but
tries to take the pagecache folio's lock.
Process-1                               Process-2
=========                               =========
hugetlb_fault
   mutex_lock                  (A1)
   filemap_lock_hugetlb_folio  (B1)
   hugetlb_wp
     alloc_hugetlb_folio       #error
       mutex_unlock            (A2)
                                        hugetlb_fault
                                          mutex_lock                  (A4)
                                          filemap_lock_hugetlb_folio  (B4)
       unmap_ref_private
       mutex_lock              (A3)
Fix it by releasing the pagecache folio's lock at (A2) of process-1 so
that pagecache folio's lock is available to process-2 at (B4), to avoid
the deadlock. In process-1, a new variable is added to track if the
pagecache folio's lock has been released by its child function
hugetlb_wp() to avoid double releases on the lock in hugetlb_fault().
The similar changes are applied to hugetlb_no_page().
Link: https://drive.google.com/file/d/1DVRnIW-vSayU5J1re9Ct_br3jJQU6Vpb/view?usp=d... [1]
Fixes: 40549ba8f8e0 ("hugetlb: use new vma_lock for pmd sharing synchronization")
Cc: stable@vger.kernel.org
Cc: Hugh Dickins hughd@google.com
Cc: Florent Revest revest@google.com
Reviewed-by: Gavin Shan gshan@redhat.com
Signed-off-by: Gavin Guo gavinguo@igalia.com
---
V1 -> V2
Suggested-by Oscar Salvador:
  - Use folio_test_locked to replace the unnecessary parameter passing.
V2 -> V3
- Dropped the approach suggested by Oscar.
- Refine the code and git commit suggested by Gavin Shan.
mm/hugetlb.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6a3cf7935c14..560b9b35262a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6137,7 +6137,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
 static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
-		       struct vm_fault *vmf)
+		       struct vm_fault *vmf,
+		       bool *pagecache_folio_locked)
 {
    struct vm_area_struct *vma = vmf->vma;
    struct mm_struct *mm = vma->vm_mm;
@@ -6234,6 +6235,18 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
    		u32 hash;
folio_put(old_folio);
+			/*
+			 * The pagecache_folio has to be unlocked to avoid
+			 * deadlock and we won't re-lock it in hugetlb_wp(). The
+			 * pagecache_folio could be truncated after being
+			 * unlocked. So its state should not be reliable
+			 * subsequently.
+			 */
+			if (pagecache_folio) {
+				folio_unlock(pagecache_folio);
+				if (pagecache_folio_locked)
+					*pagecache_folio_locked = false;
+			}
    		/*
    		 * Drop hugetlb_fault_mutex and vma_lock before
    		 * unmapping.  unmapping needs to hold vma_lock
@@ -6588,7 +6601,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
    hugetlb_count_add(pages_per_huge_page(h), mm);
    if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
    	/* Optimization, do the COW without a second fault */
-		ret = hugetlb_wp(folio, vmf);
+		ret = hugetlb_wp(folio, vmf, NULL);
    }
spin_unlock(vmf->ptl);
@@ -6660,6 +6673,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    struct hstate *h = hstate_vma(vma);
    struct address_space *mapping;
    int need_wait_lock = 0;
+	bool pagecache_folio_locked = true;
    struct vm_fault vmf = {
    	.vma = vma,
    	.address = address & huge_page_mask(h),
@@ -6814,7 +6828,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
    	if (!huge_pte_write(vmf.orig_pte)) {
-			ret = hugetlb_wp(pagecache_folio, &vmf);
+			ret = hugetlb_wp(pagecache_folio, &vmf,
+					&pagecache_folio_locked);
    		goto out_put_page;
    	} else if (likely(flags & FAULT_FLAG_WRITE)) {
    		vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
@@ -6832,7 +6847,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    spin_unlock(vmf.ptl);
if (pagecache_folio) {
-		folio_unlock(pagecache_folio);
+		if (pagecache_folio_locked)
+			folio_unlock(pagecache_folio);
+
    	folio_put(pagecache_folio);
    }
 out_mutex:
base-commit: 914873bc7df913db988284876c16257e6ab772c6
-- 
2.43.0