Let's support FAULT_FLAG_UNSHARE to implement GUP-triggered unsharing, preparing for its use in the GUP paths whenever there is a need to unshare a shared anonymous hugetlb page.
We'll make use of it next by setting FAULT_FLAG_UNSHARE whenever we detect that unsharing is necessary; a sketch of a possible caller follows below.
This commit is based on a prototype patch by Andrea.
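For illustration, a GUP-side caller could end up requesting the unshare
roughly as sketched below. The helper name and its placement are
hypothetical; only hugetlb_fault() and FAULT_FLAG_UNSHARE are taken from
this series:

static vm_fault_t gup_hugetlb_unshare(struct vm_area_struct *vma,
                                      unsigned long address)
{
        /*
         * Replace the shared anonymous hugetlb page with an exclusive
         * copy without granting write access: the fault handler keeps
         * the new PTE read-only.
         */
        return hugetlb_fault(vma->vm_mm, vma, address, FAULT_FLAG_UNSHARE);
}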
Co-developed-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 mm/hugetlb.c | 86 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 63 insertions(+), 23 deletions(-)
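Illustrative note (not part of the diff): the early returns in the
reworked retry_avoidcopy block boil down to the following sketch; the
helper itself does not exist in this patch:

static inline bool wp_hugetlb_no_copy_needed(struct page *old_page,
                                             bool unshare)
{
        if (!unshare)
                /* Write fault: an exclusive anon page is simply reused. */
                return page_mapcount(old_page) == 1 && PageAnon(old_page);
        /* Unshare: only shared anonymous pages need a private copy. */
        return !PageAnon(old_page) || page_mapcount(old_page) == 1;
}

When a copy is made, the unshare case additionally maps the new page
read-only via make_huge_pte(vma, new_page, !unshare).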
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a1baa198519a..5f2863b046ef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5130,14 +5130,15 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 /*
- * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * __wp_hugetlb() should be called with page lock of the original hugepage held.
  * Called with hugetlb_fault_mutex_table held and pte_page locked so we
  * cannot race with other handlers or page migration.
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
-static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-		       unsigned long address, pte_t *ptep,
-		       struct page *pagecache_page, spinlock_t *ptl)
+static __always_inline vm_fault_t
+__wp_hugetlb(struct mm_struct *mm, struct vm_area_struct *vma,
+	     unsigned long address, pte_t *ptep, struct page *pagecache_page,
+	     spinlock_t *ptl, bool unshare)
 {
 	pte_t pte;
 	struct hstate *h = hstate_vma(vma);
@@ -5151,11 +5152,21 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	old_page = pte_page(pte);
 
 retry_avoidcopy:
-	/* If no-one else is actually using this page, avoid the copy
-	 * and just make the page writable */
-	if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
-		page_move_anon_rmap(old_page, vma);
-		set_huge_ptep_writable(vma, haddr, ptep);
+	if (!unshare) {
+		/*
+		 * If no-one else is actually using this page, avoid the copy
+		 * and just make the page writable.
+		 */
+		if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
+			page_move_anon_rmap(old_page, vma);
+			set_huge_ptep_writable(vma, haddr, ptep);
+			return 0;
+		}
+	} else if (!PageAnon(old_page) || page_mapcount(old_page) == 1) {
+		/*
+		 * GUP-triggered unsharing only applies to shared anonymous
+		 * pages. If that no longer applies, there is nothing to do.
+		 */
 		return 0;
 	}
@@ -5256,11 +5267,11 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
 		ClearHPageRestoreReserve(new_page);
 
-		/* Break COW */
+		/* Break COW or unshare */
 		huge_ptep_clear_flush(vma, haddr, ptep);
 		mmu_notifier_invalidate_range(mm, range.start, range.end);
 		set_huge_pte_at(mm, haddr, ptep,
-				make_huge_pte(vma, new_page, 1));
+				make_huge_pte(vma, new_page, !unshare));
 		page_remove_rmap(old_page, true);
 		hugepage_add_new_anon_rmap(new_page, vma, haddr);
 		SetHPageMigratable(new_page);
@@ -5270,7 +5281,10 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(&range);
 out_release_all:
-	/* No restore in case of successful pagetable update (Break COW) */
+	/*
+	 * No restore in case of successful pagetable update (Break COW or
+	 * unshare)
+	 */
 	if (new_page != old_page)
 		restore_reserve_on_error(h, vma, haddr, new_page);
 	put_page(new_page);
@@ -5281,6 +5295,23 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
+static vm_fault_t
+wp_hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+	       unsigned long address, pte_t *ptep, struct page *pagecache_page,
+	       spinlock_t *ptl)
+{
+	return __wp_hugetlb(mm, vma, address, ptep, pagecache_page, ptl,
+			    false);
+}
+
+static vm_fault_t
+wp_hugetlb_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+		   unsigned long address, pte_t *ptep,
+		   struct page *pagecache_page, spinlock_t *ptl)
+{
+	return __wp_hugetlb(mm, vma, address, ptep, pagecache_page, ptl, true);
+}
+
 /* Return the pagecache page at a given address within a VMA */
 static struct page *hugetlbfs_pagecache_page(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
@@ -5393,7 +5424,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	/*
 	 * Currently, we are forced to kill the process in the event the
 	 * original mapper has unmapped pages from the child due to a failed
-	 * COW. Warn that such a situation has occurred as it may not be obvious
+	 * COW/unsharing. Warn that such a situation has occurred as it may not
+	 * be obvious.
 	 */
 	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
 		pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
@@ -5519,7 +5551,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	hugetlb_count_add(pages_per_huge_page(h), mm);
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+		ret = wp_hugetlb_cow(mm, vma, address, ptep, page, ptl);
 	}
 
 	spin_unlock(ptl);
@@ -5649,14 +5681,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_mutex;
 
 	/*
-	 * If we are going to COW the mapping later, we examine the pending
-	 * reservations for this page now. This will ensure that any
+	 * If we are going to COW/unshare the mapping later, we examine the
+	 * pending reservations for this page now. This will ensure that any
 	 * allocations necessary to record that reservation occur outside the
 	 * spinlock. For private mappings, we also lookup the pagecache
 	 * page now as it is used to determine if a reservation has been
 	 * consumed.
 	 */
-	if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+	if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+	    !huge_pte_write(entry)) {
 		if (vma_needs_reservation(h, vma, haddr) < 0) {
 			ret = VM_FAULT_OOM;
 			goto out_mutex;
@@ -5671,14 +5704,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptl = huge_pte_lock(h, mm, ptep);
 
-	/* Check for a racing update before calling hugetlb_cow */
+	/*
+	 * Check for a racing update before calling wp_hugetlb_cow /
+	 * wp_hugetlb_unshare
+	 */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
 		goto out_ptl;
 
 	/*
-	 * hugetlb_cow() requires page locks of pte_page(entry) and
-	 * pagecache_page, so here we need take the former one
-	 * when page != pagecache_page or !pagecache_page.
+	 * wp_hugetlb_cow()/wp_hugetlb_unshare() require page locks of
+	 * pte_page(entry) and pagecache_page, so here we need to take the
+	 * former one when page != pagecache_page or !pagecache_page.
 	 */
 	page = pte_page(entry);
 	if (page != pagecache_page)
@@ -5691,11 +5727,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!huge_pte_write(entry)) {
-			ret = hugetlb_cow(mm, vma, address, ptep,
-					  pagecache_page, ptl);
+			ret = wp_hugetlb_cow(mm, vma, address, ptep,
+					     pagecache_page, ptl);
 			goto out_put_page;
 		}
 		entry = huge_pte_mkdirty(entry);
+	} else if (flags & FAULT_FLAG_UNSHARE && !huge_pte_write(entry)) {
+		ret = wp_hugetlb_unshare(mm, vma, address, ptep, pagecache_page,
+					 ptl);
+		goto out_put_page;
 	}
 	entry = pte_mkyoung(entry);
 	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,