Remove shmem-specific code from the UFFDIO_CONTINUE implementation for non-huge pages by calling vm_ops->fault() instead. A new vm_fault flag, FAULT_FLAG_USERFAULT_CONTINUE, is introduced to avoid a recursive call to handle_userfault() when the fault handler is invoked from the userfaultfd code itself.
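For reference, this is how userspace drives UFFDIO_CONTINUE to resolve a minor fault once the page cache is populated. The helper below is an illustrative sketch, not part of this patch; only the UFFDIO_CONTINUE ioctl and struct uffdio_continue are existing UAPI:

	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	/* Hypothetical helper: ask the kernel to install the page-cache
	 * page backing fault_addr into the faulting VMA's page tables. */
	static int resolve_minor_fault(int uffd, unsigned long fault_addr,
				       unsigned long page_size)
	{
		struct uffdio_continue cont = {
			.range = {
				.start = fault_addr & ~(page_size - 1),
				.len = page_size,
			},
			.mode = 0,
		};

		/* With this change the kernel resolves the request through
		 * vm_ops->fault() rather than shmem-specific lookups. */
		return ioctl(uffd, UFFDIO_CONTINUE, &cont);
	}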
Suggested-by: James Houghton <jthoughton@google.com>
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com>
---
 include/linux/mm_types.h |  4 ++++
 mm/hugetlb.c             |  2 +-
 mm/shmem.c               |  9 ++++++---
 mm/userfaultfd.c         | 37 +++++++++++++++++++++++++++----------
 4 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0234f14f2aa6..2f26ee9742bf 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1429,6 +1429,9 @@ enum tlb_flush_reason {
  * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
  *                             We should only access orig_pte if this flag set.
  * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
+ * @FAULT_FLAG_USERFAULT_CONTINUE: The fault handler must not call userfaultfd
+ *                                 minor handler as it is being called by the
+ *                                 userfaultfd code itself.
  *
  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
  * whether we would allow page faults to retry by specifying these two
@@ -1467,6 +1470,7 @@ enum fault_flag {
 	FAULT_FLAG_UNSHARE =		1 << 10,
 	FAULT_FLAG_ORIG_PTE_VALID =	1 << 11,
 	FAULT_FLAG_VMA_LOCK =		1 << 12,
+	FAULT_FLAG_USERFAULT_CONTINUE =	1 << 13,
 };
 
 typedef unsigned int __bitwise zap_flags_t;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 97930d44d460..c004cfdcd4e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6228,7 +6228,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
 	}
 
 	/* Check for page in userfault range. */
-	if (userfaultfd_minor(vma)) {
+	if (userfaultfd_minor(vma) && !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) {
 		folio_unlock(folio);
 		folio_put(folio);
 		/* See comment in userfaultfd_missing() block above */
diff --git a/mm/shmem.c b/mm/shmem.c
index 1ede0800e846..b4159303fe59 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2467,7 +2467,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 	fault_mm = vma ? vma->vm_mm : NULL;
 
 	folio = filemap_get_entry(inode->i_mapping, index);
-	if (folio && vma && userfaultfd_minor(vma)) {
+	if (folio && vma && userfaultfd_minor(vma) &&
+	    !(vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE)) {
 		if (!xa_is_value(folio))
 			folio_put(folio);
 		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
@@ -2727,6 +2728,8 @@ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
 static vm_fault_t shmem_fault(struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vmf->vma->vm_file);
+	enum sgp_type sgp = vmf->flags & FAULT_FLAG_USERFAULT_CONTINUE ?
+			    SGP_NOALLOC : SGP_CACHE;
 	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
 	struct folio *folio = NULL;
 	vm_fault_t ret = 0;
@@ -2743,8 +2746,8 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 	}
 
 	WARN_ON_ONCE(vmf->page != NULL);
-	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
-				  gfp, vmf, &ret);
+	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, sgp, gfp, vmf,
+				  &ret);
 	if (err)
 		return vmf_error(err);
 	if (folio) {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d06453fa8aba..4b3dbc7dac64 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -380,30 +380,47 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
 	return ret;
 }
 
-/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
+/* Handles UFFDIO_CONTINUE for all VMAs */
 static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 				     struct vm_area_struct *dst_vma,
 				     unsigned long dst_addr,
 				     uffd_flags_t flags)
 {
-	struct inode *inode = file_inode(dst_vma->vm_file);
-	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
 	struct folio *folio;
 	struct page *page;
 	int ret;
+	struct vm_fault vmf = {
+		.vma = dst_vma,
+		.address = dst_addr,
+		.flags = FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE |
+			 FAULT_FLAG_USERFAULT_CONTINUE,
+		.pte = NULL,
+		.page = NULL,
+		.pgoff = linear_page_index(dst_vma, dst_addr),
+	};
+
+	if (!dst_vma->vm_ops || !dst_vma->vm_ops->fault)
+		return -EINVAL;
 
-	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
-	/* Our caller expects us to return -EFAULT if we failed to find folio */
-	if (ret == -ENOENT)
+retry:
+	ret = dst_vma->vm_ops->fault(&vmf);
+	if (ret & VM_FAULT_ERROR) {
 		ret = -EFAULT;
-	if (ret)
 		goto out;
-	if (!folio) {
-		ret = -EFAULT;
+	}
+
+	if (ret & VM_FAULT_NOPAGE) {
+		ret = -EAGAIN;
 		goto out;
 	}
 
-	page = folio_file_page(folio, pgoff);
+	if (ret & VM_FAULT_RETRY)
+		goto retry;
+
+	page = vmf.page;
+	folio = page_folio(page);
+	BUG_ON(!folio);
+
 	if (PageHWPoison(page)) {
 		ret = -EIO;
 		goto out_release;