On 5/21/25 03:53, Oscar Salvador wrote:
On Tue, May 13, 2025 at 05:34:48PM +0800, Gavin Guo wrote:
The patch fixes a deadlock that can be triggered by an internal syzkaller reproducer [1] and was captured with a bpftrace script [2]; its log [3] shows the following scenario:
Process 1                                   Process 2
hugetlb_fault
  mutex_lock(B)                // take B
  filemap_lock_hugetlb_folio
    filemap_lock_folio
      __filemap_get_folio
        folio_lock(A)          // take A
  hugetlb_wp
    mutex_unlock(B)            // release B
    ...                                     hugetlb_fault
    ...                                       mutex_lock(B)                // take B
    ...                                       filemap_lock_hugetlb_folio
    ...                                         filemap_lock_folio
    ...                                           __filemap_get_folio
    ...                                             folio_lock(A)          // blocked
    unmap_ref_private
    ...
    mutex_lock(B)              // retake and blocked
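
For illustration, the same lock-order inversion can be sketched in userspace (this is not kernel code; lock_A and lock_B are made-up names standing in for the pagecache folio lock and the hugetlb_fault_mutex_table entry):

/*
 * Userspace analogy of the deadlock above.  Thread 1 mirrors Process 1
 * (drops B while still holding A, then tries to retake B); thread 2
 * mirrors Process 2 (takes B, then waits for A).
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_A = PTHREAD_MUTEX_INITIALIZER; /* "folio lock" */
static pthread_mutex_t lock_B = PTHREAD_MUTEX_INITIALIZER; /* "fault mutex" */

static void *process1(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_B);   /* take B */
	pthread_mutex_lock(&lock_A);   /* take A, folio stays locked */
	pthread_mutex_unlock(&lock_B); /* release B, as hugetlb_wp() does */
	/* ... unmap_ref_private() work would happen here ... */
	pthread_mutex_lock(&lock_B);   /* retake B: blocks on thread 2 */
	pthread_mutex_unlock(&lock_B);
	pthread_mutex_unlock(&lock_A);
	return NULL;
}

static void *process2(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock_B);   /* take B once thread 1 dropped it */
	pthread_mutex_lock(&lock_A);   /* blocks: thread 1 still holds A */
	pthread_mutex_unlock(&lock_A);
	pthread_mutex_unlock(&lock_B);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, process1, NULL);
	pthread_create(&t2, NULL, process2, NULL);
	pthread_join(t1, NULL); /* never returns once the cycle forms */
	pthread_join(t2, NULL);
	puts("no deadlock on this run");
	return 0;
}

Whether a given run actually hangs depends on scheduling, just as the kernel deadlock requires the two faults to interleave exactly as shown above.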
Signed-off-by: Gavin Guo <gavinguo@igalia.com>
I think this is more convoluted than it needs to be?
hugetlb_wp() is called from hugetlb_no_page() and hugetlb_fault(). hugetlb_no_page() locks and unlocks the lock itself, which leaves us with hugetlb_fault().
hugetlb_fault() always passes the folio locked to hugetlb_wp(), and the latter only unlocks it when we have a COW from the owner happening and we cannot satisfy the allocation. So, shouldn't checking whether the folio is still locked after returning be enough? What speaks against:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bd8971388236..23b57c5689a4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6228,6 +6228,12 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
 			u32 hash;
 
 			folio_put(old_folio);
+			/*
+			 * The pagecache_folio needs to be unlocked to avoid
+			 * deadlock when the child unmaps the folio.
+			 */
+			if (pagecache_folio)
+				folio_unlock(pagecache_folio);
 			/*
 			 * Drop hugetlb_fault_mutex and vma_lock before
 			 * unmapping.  unmapping needs to hold vma_lock
@@ -6825,7 +6831,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_unlock(vmf.ptl);
 
 	if (pagecache_folio) {
-		folio_unlock(pagecache_folio);
+		/*
+		 * hugetlb_wp() might have already unlocked pagecache_folio, so
+		 * skip it if that is the case.
+		 */
+		if (folio_test_locked(pagecache_folio))
+			folio_unlock(pagecache_folio);
 		folio_put(pagecache_folio);
 	}
 out_mutex:
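
As a side-by-side illustration of the two conventions being compared here, a miniature sketch (purely hypothetical names, not the kernel API): the out-parameter flag used by the patch below versus the caller-side check of the lock state suggested above. Both keep the unlock/put pairing in one place in the caller.

/* Hypothetical miniature of the two conventions; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct folio_stub {
	bool locked;			/* stands in for the folio lock bit */
};

/* Patch's convention: callee reports the unlock through an out-parameter. */
static void callee_with_flag(struct folio_stub *f, bool *unlocked)
{
	if (f) {
		f->locked = false;	/* "folio_unlock()" */
		if (unlocked)
			*unlocked = true;
	}
}

/* Suggested convention: callee just unlocks; caller checks the lock state. */
static void callee_plain(struct folio_stub *f)
{
	if (f)
		f->locked = false;	/* "folio_unlock()" */
}

int main(void)
{
	struct folio_stub folio = { .locked = true };
	bool unlocked = false;

	callee_with_flag(&folio, &unlocked);
	if (!unlocked)			/* flag says the callee did not unlock */
		folio.locked = false;	/* caller-side "folio_unlock()" */

	folio.locked = true;
	callee_plain(&folio);
	if (folio.locked)		/* "folio_test_locked()" */
		folio.locked = false;	/* caller-side "folio_unlock()" */

	printf("final lock state: %d\n", folio.locked);
	return 0;
}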
Really appreciate your review. Good catch! This is elegant. The patch is under testing and I'll send out the v2 patch soon.
---
 mm/hugetlb.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e3e6ac991b9c..ad54a74aa563 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6115,7 +6115,8 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
 static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
-		       struct vm_fault *vmf)
+		       struct vm_fault *vmf,
+		       bool *pagecache_folio_unlocked)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct mm_struct *mm = vma->vm_mm;
@@ -6212,6 +6213,22 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
 			u32 hash;
 
 			folio_put(old_folio);
+			/*
+			 * The pagecache_folio needs to be unlocked to avoid
+			 * deadlock and we won't re-lock it in hugetlb_wp(). The
+			 * pagecache_folio could be truncated after being
+			 * unlocked. So its state should not be relied
+			 * subsequently.
+			 *
+			 * Setting *pagecache_folio_unlocked to true allows the
+			 * caller to handle any necessary logic related to the
+			 * folio's unlocked state.
+			 */
+			if (pagecache_folio) {
+				folio_unlock(pagecache_folio);
+				if (pagecache_folio_unlocked)
+					*pagecache_folio_unlocked = true;
+			}
 			/*
 			 * Drop hugetlb_fault_mutex and vma_lock before
 			 * unmapping.  unmapping needs to hold vma_lock
@@ -6566,7 +6583,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
 		hugetlb_count_add(pages_per_huge_page(h), mm);
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_wp(folio, vmf);
+		ret = hugetlb_wp(folio, vmf, NULL);
 	}
 
 	spin_unlock(vmf->ptl);
@@ -6638,6 +6655,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	struct address_space *mapping;
 	int need_wait_lock = 0;
+	bool pagecache_folio_unlocked = false;
 	struct vm_fault vmf = {
 		.vma = vma,
 		.address = address & huge_page_mask(h),
@@ -6792,7 +6810,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
 		if (!huge_pte_write(vmf.orig_pte)) {
-			ret = hugetlb_wp(pagecache_folio, &vmf);
+			ret = hugetlb_wp(pagecache_folio, &vmf,
+					&pagecache_folio_unlocked);
 			goto out_put_page;
 		} else if (likely(flags & FAULT_FLAG_WRITE)) {
 			vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
@@ -6809,10 +6828,14 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 out_ptl:
 	spin_unlock(vmf.ptl);
 
-	if (pagecache_folio) {
+	/*
+	 * If the pagecache_folio is unlocked in hugetlb_wp(), we skip
+	 * folio_unlock() here.
+	 */
+	if (pagecache_folio && !pagecache_folio_unlocked)
 		folio_unlock(pagecache_folio);
+	if (pagecache_folio)
 		folio_put(pagecache_folio);
-	}
 
 out_mutex:
 	hugetlb_vma_unlock_read(vma);
base-commit: d76bb1ebb5587f66b0f8b8099bfbb44722bc08b3
--
2.43.0