This series, currently based on 6.2, is divided into two parts:
- Commits 1-4 refactor userfaultfd ioctl code without behavior changes, with the main goal of improving consistency and reducing the number of function args. - Commit 5 adds UFFDIO_CONTINUE_MODE_WP.
The refactors are sorted by increasing controversial-ness, the idea being we could drop some of the refactors if they are deemed not worth it.
Changelog: v1->v2: refactor before adding the new flag, to avoid perpetuating messiness
Axel Rasmussen (5): mm: userfaultfd: rename functions for clarity + consistency mm: userfaultfd: don't pass around both mm and vma mm: userfaultfd: combine 'mode' and 'wp_copy' arguments mm: userfaultfd: don't separate addr + len arguments mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs
fs/userfaultfd.c | 121 +++++------- include/linux/hugetlb.h | 28 ++- include/linux/shmem_fs.h | 8 +- include/linux/userfaultfd_k.h | 57 +++--- include/uapi/linux/userfaultfd.h | 7 + mm/hugetlb.c | 35 ++-- mm/shmem.c | 13 +- mm/userfaultfd.c | 236 +++++++++++------------ tools/testing/selftests/vm/userfaultfd.c | 4 + 9 files changed, 245 insertions(+), 264 deletions(-)
-- 2.39.2.637.g21b0678d19-goog
The basic problem is, over time we've added new userfaultfd ioctls, and we've refactored the code so functions which used to handle only one case are now re-used to deal with several cases. While this happened, we didn't bother to rename the functions.
Similarly, as we added new functions, we cargo-culted pieces of the now-inconsistent naming scheme, so those functions too ended up with names that don't make a lot of sense.
A key point here is, "copy" in most userfaultfd code refers specifically to UFFDIO_COPY, where we allocate a new page and copy its contents from userspace. There are many functions with "copy" in the name that don't actually do this (at least in some cases).
So, rename things into a consistent scheme. The high level idea is that the call stack for userfaultfd ioctls becomes:
userfaultfd_ioctl -> userfaultfd_(particular ioctl) -> mfill_atomic_(particular kind of fill operation) -> mfill_atomic /* loops over pages in range */ -> mfill_atomic_pte /* deals with single pages */ -> mfill_atomic_pte_(particular kind of fill operation) -> mfill_atomic_install_pte
There are of course some special cases (shmem, hugetlb), but this is the general structure which all function names now adhere to.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- fs/userfaultfd.c | 18 +++---- include/linux/hugetlb.h | 30 +++++------ include/linux/userfaultfd_k.h | 18 +++---- mm/hugetlb.c | 20 +++---- mm/userfaultfd.c | 98 +++++++++++++++++------------------ 5 files changed, 92 insertions(+), 92 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cc694846617a..c08a26ae77d6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1752,9 +1752,9 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (mmget_not_zero(ctx->mm)) { - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing, - uffdio_copy.mode); + ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, &ctx->mmap_changing, + uffdio_copy.mode); mmput(ctx->mm); } else { return -ESRCH; @@ -1804,9 +1804,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, goto out;
if (mmget_not_zero(ctx->mm)) { - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1914,9 +1914,9 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out;
if (mmget_not_zero(ctx->mm)) { - ret = mcopy_continue(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len, - &ctx->mmap_changing); + ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9ab9d3105d5c..3c389b74e02d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -157,13 +157,13 @@ unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD -int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy); +int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + enum mcopy_atomic_mode mode, + struct page **pagep, + bool wp_copy); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, @@ -355,14 +355,14 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, }
#ifdef CONFIG_USERFAULTFD -static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, - pte_t *dst_pte, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy) +static inline int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + enum mcopy_atomic_mode mode, + struct page **pagep, + bool wp_copy) { BUG(); return 0; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 9df0b9a762cc..6c5ad5d4aa06 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -61,15 +61,15 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, unsigned long dst_addr, struct page *page, bool newly_allocated, bool wp_copy);
-extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, __u64 mode); -extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long len, - atomic_t *mmap_changing); -extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing); +extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long src_start, unsigned long len, + atomic_t *mmap_changing, __u64 mode); +extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len, + atomic_t *mmap_changing); +extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long len, atomic_t *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bdbfeb6fb393..915a390442e7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6159,17 +6159,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
#ifdef CONFIG_USERFAULTFD /* - * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with - * modifications for huge pages. + * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte + * with modifications for hugetlb pages. */ -int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, - pte_t *dst_pte, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy) +int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + enum mcopy_atomic_mode mode, + struct page **pagep, + bool wp_copy) { bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct hstate *h = hstate_vma(dst_vma); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0499907b6f1a..3980e1b7b7f8 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -141,13 +141,13 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, return ret; }
-static int mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep, - bool wp_copy) +static int mfill_atomic_pte_copy(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep, + bool wp_copy) { void *page_kaddr; int ret; @@ -218,10 +218,10 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out; }
-static int mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) +static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) { pte_t _dst_pte, *dst_pte; spinlock_t *ptl; @@ -254,11 +254,11 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, }
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ -static int mcontinue_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - bool wp_copy) +static int mfill_atomic_pte_continue(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + bool wp_copy) { struct inode *inode = file_inode(dst_vma->vm_file); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); @@ -321,10 +321,10 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE /* - * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is + * mfill_atomic processing for HUGETLB vmas. Note that this routine is * called with mmap_lock held, it will release mmap_lock before returning. */ -static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, +static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, @@ -425,7 +425,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, goto out_unlock; }
- err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, + err = hugetlb_mfill_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, src_addr, mode, &page, wp_copy);
@@ -477,7 +477,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, } #else /* !CONFIG_HUGETLB_PAGE */ /* fail at build time if gcc attempts to use this */ -extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, +extern ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, @@ -498,8 +498,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, ssize_t err;
if (mode == MCOPY_ATOMIC_CONTINUE) { - return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - wp_copy); + return mfill_atomic_pte_continue(dst_mm, dst_pmd, dst_vma, + dst_addr, wp_copy); }
/* @@ -514,11 +514,11 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, */ if (!(dst_vma->vm_flags & VM_SHARED)) { if (mode == MCOPY_ATOMIC_NORMAL) - err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, page, - wp_copy); + err = mfill_atomic_pte_copy(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, page, + wp_copy); else - err = mfill_zeropage_pte(dst_mm, dst_pmd, + err = mfill_atomic_pte_zeropage(dst_mm, dst_pmd, dst_vma, dst_addr); } else { err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, @@ -530,13 +530,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, return err; }
-static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long src_start, - unsigned long len, - enum mcopy_atomic_mode mcopy_mode, - atomic_t *mmap_changing, - __u64 mode) +static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + enum mcopy_atomic_mode mcopy_mode, + atomic_t *mmap_changing, + __u64 mode) { struct vm_area_struct *dst_vma; ssize_t err; @@ -602,9 +602,9 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) - return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, - src_start, len, mcopy_mode, - wp_copy); + return mfill_atomic_hugetlb(dst_mm, dst_vma, dst_start, + src_start, len, mcopy_mode, + wp_copy);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; @@ -702,26 +702,26 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, return copied ? copied : err; }
-ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, __u64 mode) +ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long src_start, unsigned long len, + atomic_t *mmap_changing, __u64 mode) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, - MCOPY_ATOMIC_NORMAL, mmap_changing, mode); + return mfill_atomic(dst_mm, dst_start, src_start, len, + MCOPY_ATOMIC_NORMAL, mmap_changing, mode); }
-ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) +ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, atomic_t *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, - mmap_changing, 0); + return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, + mmap_changing, 0); }
-ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) +ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, atomic_t *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, - mmap_changing, 0); + return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, + mmap_changing, 0); }
void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
Quite a few userfaultfd functions took both mm and vma pointers as arguments. Since the mm is trivially accessible via vma->vm_mm, there's no reason to pass both; it just needlessly extends the already long argument list.
Get rid of the mm pointer, where possible, to shorten the argument list.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- fs/userfaultfd.c | 2 +- include/linux/hugetlb.h | 5 ++- include/linux/shmem_fs.h | 4 +-- include/linux/userfaultfd_k.h | 4 +-- mm/hugetlb.c | 9 +++-- mm/shmem.c | 7 ++-- mm/userfaultfd.c | 66 ++++++++++++++++------------------- 7 files changed, 45 insertions(+), 52 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index c08a26ae77d6..a95f6aaef76b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1637,7 +1637,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
/* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) - uffd_wp_range(mm, vma, start, vma_end - start, false); + uffd_wp_range(vma, start, vma_end - start, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, start, vma_end, new_flags, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3c389b74e02d..d3fc104aab78 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -157,7 +157,7 @@ unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); #ifdef CONFIG_USERFAULTFD -int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, +int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, @@ -355,8 +355,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, }
#ifdef CONFIG_USERFAULTFD -static inline int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, - pte_t *dst_pte, +static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d500ea967dc7..2a0b1dc0460f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -149,14 +149,14 @@ extern void shmem_uncharge(struct inode *inode, long pages);
#ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, +extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, bool zeropage, bool wp_copy, struct page **pagep); #else /* !CONFIG_SHMEM */ -#define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ +#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 6c5ad5d4aa06..c6c23408d300 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -56,7 +56,7 @@ enum mcopy_atomic_mode { MCOPY_ATOMIC_CONTINUE, };
-extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, +extern int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, bool newly_allocated, bool wp_copy); @@ -73,7 +73,7 @@ extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); -extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, +extern void uffd_wp_range(struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp);
/* mm helpers */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 915a390442e7..0afd2ed8ad39 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6162,8 +6162,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. */ -int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, - pte_t *dst_pte, +int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, @@ -6282,7 +6281,7 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, page_in_pagecache = true; }
- ptl = huge_pte_lock(h, dst_mm, dst_pte); + ptl = huge_pte_lock(h, dst_vma->vm_mm, dst_pte);
ret = -EIO; if (PageHWPoison(page)) @@ -6324,9 +6323,9 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, if (wp_copy) _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
- set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_huge_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
- hugetlb_count_add(pages_per_huge_page(h), dst_mm); + hugetlb_count_add(pages_per_huge_page(h), dst_vma->vm_mm);
/* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); diff --git a/mm/shmem.c b/mm/shmem.c index 41f82c5a5e28..cc03c61190eb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2398,8 +2398,7 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block }
#ifdef CONFIG_USERFAULTFD -int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, +int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, @@ -2489,11 +2488,11 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, goto out_release;
ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_mm); + gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm); if (ret) goto out_release;
- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, &folio->page, true, wp_copy); if (ret) goto out_delete_from_cache; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3980e1b7b7f8..4bf5c97c665a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -55,7 +55,7 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem * and anon, and for both shared and private VMAs. */ -int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, +int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, bool newly_allocated, bool wp_copy) @@ -93,7 +93,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, */ _dst_pte = pte_wrprotect(_dst_pte);
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
if (vma_is_shmem(dst_vma)) { /* serialize against truncate with the page table lock */ @@ -129,9 +129,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, * Must happen after rmap, as mm_counter() checks mapping (via * PageAnon()), which is set by __page_set_anon_rmap(). */ - inc_mm_counter(dst_mm, mm_counter(page)); + inc_mm_counter(dst_vma->vm_mm, mm_counter(page));
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); @@ -141,8 +141,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, return ret; }
-static int mfill_atomic_pte_copy(struct mm_struct *dst_mm, - pmd_t *dst_pmd, +static int mfill_atomic_pte_copy(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, @@ -204,10 +203,10 @@ static int mfill_atomic_pte_copy(struct mm_struct *dst_mm, __SetPageUptodate(page);
ret = -ENOMEM; - if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL)) + if (mem_cgroup_charge(page_folio(page), dst_vma->vm_mm, GFP_KERNEL)) goto out_release;
- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, page, true, wp_copy); if (ret) goto out_release; @@ -218,8 +217,7 @@ static int mfill_atomic_pte_copy(struct mm_struct *dst_mm, goto out; }
-static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm, - pmd_t *dst_pmd, +static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr) { @@ -231,7 +229,7 @@ static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,
_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); - dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); if (dst_vma->vm_file) { /* the shmem MAP_PRIVATE case requires checking the i_size */ inode = dst_vma->vm_file->f_inode; @@ -244,7 +242,7 @@ static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm, ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_unlock; - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; @@ -254,8 +252,7 @@ static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm, }
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ -static int mfill_atomic_pte_continue(struct mm_struct *dst_mm, - pmd_t *dst_pmd, +static int mfill_atomic_pte_continue(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, bool wp_copy) @@ -283,7 +280,7 @@ static int mfill_atomic_pte_continue(struct mm_struct *dst_mm, goto out_release; }
- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, page, false, wp_copy); if (ret) goto out_release; @@ -324,7 +321,7 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) * mfill_atomic processing for HUGETLB vmas. Note that this routine is * called with mmap_lock held, it will release mmap_lock before returning. */ -static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm, +static __always_inline ssize_t mfill_atomic_hugetlb( struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, @@ -332,6 +329,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm, enum mcopy_atomic_mode mode, bool wp_copy) { + struct mm_struct *dst_mm = dst_vma->vm_mm; int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; @@ -425,7 +423,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm, goto out_unlock; }
- err = hugetlb_mfill_atomic_pte(dst_mm, dst_pte, dst_vma, + err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, src_addr, mode, &page, wp_copy);
@@ -477,17 +475,15 @@ static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm, } #else /* !CONFIG_HUGETLB_PAGE */ /* fail at build time if gcc attempts to use this */ -extern ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm, - struct vm_area_struct *dst_vma, - unsigned long dst_start, - unsigned long src_start, - unsigned long len, - enum mcopy_atomic_mode mode, - bool wp_copy); +extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + enum mcopy_atomic_mode mode, + bool wp_copy); #endif /* CONFIG_HUGETLB_PAGE */
-static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, +static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, @@ -498,7 +494,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, ssize_t err;
if (mode == MCOPY_ATOMIC_CONTINUE) { - return mfill_atomic_pte_continue(dst_mm, dst_pmd, dst_vma, + return mfill_atomic_pte_continue(dst_pmd, dst_vma, dst_addr, wp_copy); }
@@ -514,14 +510,14 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, */ if (!(dst_vma->vm_flags & VM_SHARED)) { if (mode == MCOPY_ATOMIC_NORMAL) - err = mfill_atomic_pte_copy(dst_mm, dst_pmd, dst_vma, + err = mfill_atomic_pte_copy(dst_pmd, dst_vma, dst_addr, src_addr, page, wp_copy); else - err = mfill_atomic_pte_zeropage(dst_mm, dst_pmd, + err = mfill_atomic_pte_zeropage(dst_pmd, dst_vma, dst_addr); } else { - err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, mode != MCOPY_ATOMIC_NORMAL, wp_copy, page); @@ -602,7 +598,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(dst_mm, dst_vma, dst_start, + return mfill_atomic_hugetlb(dst_vma, dst_start, src_start, len, mcopy_mode, wp_copy);
@@ -655,7 +651,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd));
- err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, &page, mcopy_mode, wp_copy); cond_resched();
@@ -724,7 +720,7 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, mmap_changing, 0); }
-void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, +void uffd_wp_range(struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { struct mmu_gather tlb; @@ -735,7 +731,7 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, else newprot = vm_get_page_prot(dst_vma->vm_flags);
- tlb_gather_mmu(&tlb, dst_mm); + tlb_gather_mmu(&tlb, dst_vma->vm_mm); change_protection(&tlb, dst_vma, start, start + len, newprot, enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); tlb_finish_mmu(&tlb); @@ -786,7 +782,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, goto out_unlock; }
- uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + uffd_wp_range(dst_vma, start, len, enable_wp);
err = 0; out_unlock:
Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy' argument. In future commits we plan to plumb the flags through to more places, so we'd be proliferating the very long argument list even further.
Let's take the time to simplify the argument list. Combine the two arguments into one - and generalize, so when we add more flags in the future, it doesn't imply more function arguments.
Since the modes (copy, zeropage, continue) are mutually exclusive, store them as an integer value (0, 1, 2) in the low bits. Place combine-able flag bits in the high bits.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- fs/userfaultfd.c | 5 ++- include/linux/hugetlb.h | 11 ++--- include/linux/shmem_fs.h | 4 +- include/linux/userfaultfd_k.h | 30 +++++++------- mm/hugetlb.c | 14 ++++--- mm/shmem.c | 6 +-- mm/userfaultfd.c | 76 ++++++++++++++++------------------- 7 files changed, 70 insertions(+), 76 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index a95f6aaef76b..2db15a5e3224 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1725,6 +1725,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, struct uffdio_copy uffdio_copy; struct uffdio_copy __user *user_uffdio_copy; struct userfaultfd_wake_range range; + int flags = 0;
user_uffdio_copy = (struct uffdio_copy __user *) arg;
@@ -1751,10 +1752,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; + if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) + flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, uffdio_copy.len, &ctx->mmap_changing, - uffdio_copy.mode); + flags); mmput(ctx->mm); } else { return -ESRCH; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d3fc104aab78..1e66a75b4da4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -12,7 +12,6 @@ #include <linux/kref.h> #include <linux/pgtable.h> #include <linux/gfp.h> -#include <linux/userfaultfd_k.h>
struct ctl_table; struct user_struct; @@ -161,9 +160,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy); + int mode_flags, + struct page **pagep); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, @@ -359,9 +357,8 @@ static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy) + int mode_flags, + struct page **pagep) { BUG(); return 0; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 2a0b1dc0460f..6bbb243716f3 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -153,11 +153,11 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - bool zeropage, bool wp_copy, + int mode_flags, struct page **pagep); #else /* !CONFIG_SHMEM */ #define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ - src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; }) + src_addr, mode_flags, pagep) ({ BUG(); 0; }) #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c6c23408d300..185024128e0f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -40,30 +40,28 @@ extern int sysctl_unprivileged_userfaultfd;
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
-/* - * The mode of operation for __mcopy_atomic and its helpers. - * - * This is almost an implementation detail (mcopy_atomic below doesn't take this - * as a parameter), but it's exposed here because memory-kind-specific - * implementations (e.g. hugetlbfs) need to know the mode of operation. - */ -enum mcopy_atomic_mode { - /* A normal copy_from_user into the destination range. */ - MCOPY_ATOMIC_NORMAL, - /* Don't copy; map the destination range to the zero page. */ - MCOPY_ATOMIC_ZEROPAGE, - /* Just install pte(s) with the existing page(s) in the page cache. */ - MCOPY_ATOMIC_CONTINUE, +/* Mutually exclusive modes of operation. */ +enum mfill_atomic_mode { + MFILL_ATOMIC_COPY, + MFILL_ATOMIC_ZEROPAGE, + MFILL_ATOMIC_CONTINUE, + NR_MFILL_ATOMIC_MODES, };
+#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1) +#define MFILL_ATOMIC_MODE_MASK (BIT(MFILL_ATOMIC_MODE_BITS) - 1) + +/* Flags controlling behavior. */ +#define MFILL_ATOMIC_WP BIT(MFILL_ATOMIC_MODE_BITS + 0) + extern int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, - bool newly_allocated, bool wp_copy); + bool newly_allocated, int mode_flags);
extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, __u64 mode); + atomic_t *mmap_changing, int flags); extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0afd2ed8ad39..7fc4f529b4d7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,6 +34,7 @@ #include <linux/nospec.h> #include <linux/delayacct.h> #include <linux/memory.h> +#include <linux/userfaultfd_k.h>
#include <asm/page.h> #include <asm/pgalloc.h> @@ -6166,11 +6167,12 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - enum mcopy_atomic_mode mode, - struct page **pagep, - bool wp_copy) + int mode_flags, + struct page **pagep) { - bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); + int mode = mode_flags & MFILL_ATOMIC_MODE_MASK; + bool is_continue = (mode == MFILL_ATOMIC_CONTINUE); + bool wp_enabled = (mode_flags & MFILL_ATOMIC_WP); struct hstate *h = hstate_vma(dst_vma); struct address_space *mapping = dst_vma->vm_file->f_mapping; pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); @@ -6305,7 +6307,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ - if (wp_copy || (is_continue && !vm_shared)) + if (wp_enabled || (is_continue && !vm_shared)) writable = 0; else writable = dst_vma->vm_flags & VM_WRITE; @@ -6320,7 +6322,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte);
- if (wp_copy) + if (wp_enabled) _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
set_huge_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); diff --git a/mm/shmem.c b/mm/shmem.c index cc03c61190eb..98c9c1f08389 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2402,7 +2402,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - bool zeropage, bool wp_copy, + int mode_flags, struct page **pagep) { struct inode *inode = file_inode(dst_vma->vm_file); @@ -2434,7 +2434,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (!folio) goto out_unacct_blocks;
- if (!zeropage) { /* COPY */ + if ((mode_flags & MFILL_ATOMIC_MODE_MASK) == MFILL_ATOMIC_COPY) { page_kaddr = kmap_local_folio(folio, 0); /* * The read mmap_lock is held here. Despite the @@ -2493,7 +2493,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, goto out_release;
ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, wp_copy); + &folio->page, true, mode_flags); if (ret) goto out_delete_from_cache;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 4bf5c97c665a..7882e4c60f60 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -58,7 +58,7 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, int mfill_atomic_install_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, struct page *page, - bool newly_allocated, bool wp_copy) + bool newly_allocated, int mode_flags) { int ret; pte_t _dst_pte, *dst_pte; @@ -79,7 +79,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, * Always mark a PTE as write-protected when needed, regardless of * VM_WRITE, which the user might change. */ - if (wp_copy) { + if (mode_flags & MFILL_ATOMIC_WP) { _dst_pte = pte_mkuffd_wp(_dst_pte); writable = false; } @@ -145,8 +145,8 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - struct page **pagep, - bool wp_copy) + int mode_flags, + struct page **pagep) { void *page_kaddr; int ret; @@ -207,7 +207,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, goto out_release;
ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - page, true, wp_copy); + page, true, mode_flags); if (ret) goto out_release; out: @@ -255,7 +255,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, static int mfill_atomic_pte_continue(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, - bool wp_copy) + int mode_flags) { struct inode *inode = file_inode(dst_vma->vm_file); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); @@ -281,7 +281,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, }
ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - page, false, wp_copy); + page, false, mode_flags); if (ret) goto out_release;
@@ -326,9 +326,9 @@ static __always_inline ssize_t mfill_atomic_hugetlb( unsigned long dst_start, unsigned long src_start, unsigned long len, - enum mcopy_atomic_mode mode, - bool wp_copy) + int mode_flags) { + int mode = mode_flags & MFILL_ATOMIC_MODE_MASK; struct mm_struct *dst_mm = dst_vma->vm_mm; int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; @@ -347,7 +347,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ - if (mode == MCOPY_ATOMIC_ZEROPAGE) { + if (mode == MFILL_ATOMIC_ZEROPAGE) { mmap_read_unlock(dst_mm); return -EINVAL; } @@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( goto out_unlock; }
- if (mode != MCOPY_ATOMIC_CONTINUE && + if (mode != MFILL_ATOMIC_CONTINUE && !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { err = -EEXIST; hugetlb_vma_unlock_read(dst_vma); @@ -423,9 +423,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb( goto out_unlock; }
- err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, - dst_addr, src_addr, mode, &page, - wp_copy); + err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, + src_addr, mode_flags, &page);
hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -479,23 +478,22 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, - enum mcopy_atomic_mode mode, - bool wp_copy); + int mode_flags); #endif /* CONFIG_HUGETLB_PAGE */
static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - struct page **page, - enum mcopy_atomic_mode mode, - bool wp_copy) + struct page **pagep, + int mode_flags) { + int mode = mode_flags & MFILL_ATOMIC_MODE_MASK; ssize_t err;
- if (mode == MCOPY_ATOMIC_CONTINUE) { + if (mode == MFILL_ATOMIC_CONTINUE) { return mfill_atomic_pte_continue(dst_pmd, dst_vma, - dst_addr, wp_copy); + dst_addr, mode_flags); }
/* @@ -509,18 +507,17 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, * and not in the radix tree. */ if (!(dst_vma->vm_flags & VM_SHARED)) { - if (mode == MCOPY_ATOMIC_NORMAL) + if (mode == MFILL_ATOMIC_COPY) err = mfill_atomic_pte_copy(dst_pmd, dst_vma, - dst_addr, src_addr, page, - wp_copy); + dst_addr, src_addr, + mode_flags, pagep); else err = mfill_atomic_pte_zeropage(dst_pmd, dst_vma, dst_addr); } else { err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, - mode != MCOPY_ATOMIC_NORMAL, - wp_copy, page); + mode_flags, pagep); }
return err; @@ -530,9 +527,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - enum mcopy_atomic_mode mcopy_mode, atomic_t *mmap_changing, - __u64 mode) + int mode_flags) { struct vm_area_struct *dst_vma; ssize_t err; @@ -540,7 +536,6 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, unsigned long src_addr, dst_addr; long copied; struct page *page; - bool wp_copy;
/* * Sanitize the command parameters: @@ -590,8 +585,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, * validate 'mode' now that we know the dst_vma: don't allow * a wrprotect copy if the userfaultfd didn't register as WP. */ - wp_copy = mode & UFFDIO_COPY_MODE_WP; - if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP)) + if ((mode_flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) goto out_unlock;
/* @@ -599,12 +593,12 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, */ if (is_vm_hugetlb_page(dst_vma)) return mfill_atomic_hugetlb(dst_vma, dst_start, - src_start, len, mcopy_mode, - wp_copy); + src_start, len, mode_flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; - if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE) + if (!vma_is_shmem(dst_vma) && + (mode_flags & MFILL_ATOMIC_MODE_MASK) == MFILL_ATOMIC_CONTINUE) goto out_unlock;
/* @@ -652,7 +646,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, - src_addr, &page, mcopy_mode, wp_copy); + src_addr, &page, mode_flags); cond_resched();
if (unlikely(err == -ENOENT)) { @@ -700,24 +694,24 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, __u64 mode) + atomic_t *mmap_changing, int flags) { return mfill_atomic(dst_mm, dst_start, src_start, len, - MCOPY_ATOMIC_NORMAL, mmap_changing, mode); + mmap_changing, flags | MFILL_ATOMIC_COPY); }
ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len, atomic_t *mmap_changing) { - return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, - mmap_changing, 0); + return mfill_atomic(dst_mm, start, 0, len, + mmap_changing, MFILL_ATOMIC_ZEROPAGE); }
ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, unsigned long len, atomic_t *mmap_changing) { - return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, - mmap_changing, 0); + return mfill_atomic(dst_mm, start, 0, len, + mmap_changing, MFILL_ATOMIC_CONTINUE); }
void uffd_wp_range(struct vm_area_struct *dst_vma,
On Wed, Feb 22, 2023 at 4:58 PM Axel Rasmussen axelrasmussen@google.com wrote:
Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy' argument. In future commits we plan to plumb the flags through to more places, so we'd be proliferating the very long argument list even further.
Let's take the time to simplify the argument list. Combine the two arguments into one - and generalize, so when we add more flags in the future, it doesn't imply more function arguments.
Since the modes (copy, zeropage, continue) are mutually exclusive, store them as an integer value (0, 1, 2) in the low bits. Place combine-able flag bits in the high bits.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com
Acked-by: James Houghton jthoughton@google.com
I don't see anything wrong with this patch. Thanks, Axel.
On Feb 22, 2023, at 4:57 PM, Axel Rasmussen axelrasmussen@google.com wrote:
Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy' argument. In future commits we plan to plumb the flags through to more places, so we'd be proliferating the very long argument list even further.
Let's take the time to simplify the argument list. Combine the two arguments into one - and generalize, so when we add more flags in the future, it doesn't imply more function arguments.
Since the modes (copy, zeropage, continue) are mutually exclusive, store them as an integer value (0, 1, 2) in the low bits. Place combine-able flag bits in the high bits.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com
Hi Axel,
I sent a patch a long time ago called “userfaultfd: introduce uffd_flags”. For some reason it does not appear on lore, but you were a recipient.
It was pretty similar, but one thing that I preferred in my version is that it defined a different type to avoid confusion. Since “flags” is a very generic name, perhaps you’d like to adapt this approach.
On Thu, Feb 23, 2023 at 10:51 AM Nadav Amit namit@vmware.com wrote:
On Feb 22, 2023, at 4:57 PM, Axel Rasmussen axelrasmussen@google.com wrote:
Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy' argument. In future commits we plan to plumb the flags through to more places, so we'd be proliferating the very long argument list even further.
Let's take the time to simplify the argument list. Combine the two arguments into one - and generalize, so when we add more flags in the future, it doesn't imply more function arguments.
Since the modes (copy, zeropage, continue) are mutually exclusive, store them as an integer value (0, 1, 2) in the low bits. Place combine-able flag bits in the high bits.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com
Hi Axel,
I sent a patch a long time ago called “userfaultfd: introduce uffd_flags”. For some reason it does not appear on lore, but you were a recipient.
It was pretty similar, but one thing that I preferred in my version is that it defined a different type to avoid confusion. Since “flags” is a very generic name, perhaps you’d like to adapt this approach.
Oh! I must have missed it, or just forgot. I'll take a look and see about making some improvements. I'll also be sure to credit you in the commit message.
On Feb 23, 2023, at 11:03 AM, Axel Rasmussen axelrasmussen@google.com wrote:
Oh! I must have missed it, or just forgot. I'll take a look and see about making some improvements. I'll also be sure to credit you in the commit message.
It was not about credit. I just wanted to refer you to what I had in mind, and David did spend some time on reviewing my patch...
;-)
We have a lot of functions which take an address + length pair, currently passed as separate arguments. However, in our userspace API we already have struct uffdio_range, which is exactly this pair, and this is what we get from userspace when ioctls are called.
Instead of splitting the struct up into two separate arguments, just plumb the struct through to the functions which use it (once we get to the mfill_atomic_pte level, we're dealing with single (huge)pages, so we don't need both parts).
Relatedly, for waking, just re-use this existing structure instead of defining a new "struct uffdio_wake_range".
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- fs/userfaultfd.c | 108 ++++++++++++++-------------------- include/linux/userfaultfd_k.h | 17 +++--- mm/userfaultfd.c | 91 ++++++++++++++-------------- 3 files changed, 97 insertions(+), 119 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 2db15a5e3224..f43d8b0ae47b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -95,11 +95,6 @@ struct userfaultfd_wait_queue { bool waken; };
-struct userfaultfd_wake_range { - unsigned long start; - unsigned long len; -}; - /* internal indication that UFFD_API ioctl was successfully executed */ #define UFFD_FEATURE_INITIALIZED (1u << 31)
@@ -126,7 +121,7 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { - struct userfaultfd_wake_range *range = key; + struct uffdio_range *range = key; int ret; struct userfaultfd_wait_queue *uwq; unsigned long start, len; @@ -871,7 +866,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma, *prev; /* len == 0 means wake all */ - struct userfaultfd_wake_range range = { .len = 0, }; + struct uffdio_range range = {0}; unsigned long new_flags; MA_STATE(mas, &mm->mm_mt, 0, 0);
@@ -1217,7 +1212,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, }
static void __wake_userfault(struct userfaultfd_ctx *ctx, - struct userfaultfd_wake_range *range) + struct uffdio_range *range) { spin_lock_irq(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ @@ -1230,7 +1225,7 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx, }
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, - struct userfaultfd_wake_range *range) + struct uffdio_range *range) { unsigned seq; bool need_wakeup; @@ -1261,21 +1256,21 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, }
static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) + const struct uffdio_range *range) { __u64 task_size = mm->task_size;
- if (start & ~PAGE_MASK) + if (range->start & ~PAGE_MASK) return -EINVAL; - if (len & ~PAGE_MASK) + if (range->len & ~PAGE_MASK) return -EINVAL; - if (!len) + if (!range->len) return -EINVAL; - if (start < mmap_min_addr) + if (range->start < mmap_min_addr) return -EINVAL; - if (start >= task_size) + if (range->start >= task_size) return -EINVAL; - if (len > task_size - start) + if (range->len > task_size - range->start) return -EINVAL; return 0; } @@ -1322,8 +1317,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vm_flags |= VM_UFFD_MINOR; }
- ret = validate_range(mm, uffdio_register.range.start, - uffdio_register.range.len); + ret = validate_range(mm, &uffdio_register.range); if (ret) goto out;
@@ -1539,11 +1533,11 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) goto out;
- ret = validate_range(mm, uffdio_unregister.start, - uffdio_unregister.len); + ret = validate_range(mm, &uffdio_unregister); if (ret) goto out;
+ /* Get rid of start + end in favor of range *? */ start = uffdio_unregister.start; end = start + uffdio_unregister.len;
@@ -1605,6 +1599,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
ret = 0; do { + struct uffdio_range range; + cond_resched();
BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); @@ -1622,6 +1618,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, start = vma->vm_start; vma_end = min(end, vma->vm_end);
+ range.start = start; + range.len = vma_end - start; if (userfaultfd_missing(vma)) { /* * Wake any concurrent pending userfault while @@ -1629,15 +1627,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * permanently and it avoids userland to call * UFFDIO_WAKE explicitly. */ - struct userfaultfd_wake_range range; - range.start = start; - range.len = vma_end - start; wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); }
/* Reset ptes for the whole vma range if wr-protected */ if (userfaultfd_wp(vma)) - uffd_wp_range(vma, start, vma_end - start, false); + uffd_wp_range(vma, &range, false);
new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, start, vma_end, new_flags, @@ -1691,27 +1686,23 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, { int ret; struct uffdio_range uffdio_wake; - struct userfaultfd_wake_range range; const void __user *buf = (void __user *)arg;
ret = -EFAULT; if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) goto out;
- ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + ret = validate_range(ctx->mm, &uffdio_wake); if (ret) goto out;
- range.start = uffdio_wake.start; - range.len = uffdio_wake.len; - /* * len == 0 means wake all and we don't want to wake all here, * so check it again to be sure. */ - VM_BUG_ON(!range.len); + VM_BUG_ON(!uffdio_wake.len);
- wake_userfault(ctx, &range); + wake_userfault(ctx, &uffdio_wake); ret = 0;
out: @@ -1724,7 +1715,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, __s64 ret; struct uffdio_copy uffdio_copy; struct uffdio_copy __user *user_uffdio_copy; - struct userfaultfd_wake_range range; + struct uffdio_range range; int flags = 0;
user_uffdio_copy = (struct uffdio_copy __user *) arg; @@ -1739,7 +1730,9 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out;
- ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + range.start = uffdio_copy.dst; + range.len = uffdio_copy.len; + ret = validate_range(ctx->mm, &range); if (ret) goto out; /* @@ -1755,9 +1748,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) flags |= MFILL_ATOMIC_WP; if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing, - flags); + ret = mfill_atomic_copy(ctx->mm, uffdio_copy.src, &range, + &ctx->mmap_changing, flags); mmput(ctx->mm); } else { return -ESRCH; @@ -1769,10 +1761,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, BUG_ON(!ret); /* len == 0 would wake all */ range.len = ret; - if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { - range.start = uffdio_copy.dst; + if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) wake_userfault(ctx, &range); - } ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; out: return ret; @@ -1784,7 +1774,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, __s64 ret; struct uffdio_zeropage uffdio_zeropage; struct uffdio_zeropage __user *user_uffdio_zeropage; - struct userfaultfd_wake_range range; + struct uffdio_range range;
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
@@ -1798,8 +1788,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, sizeof(uffdio_zeropage)-sizeof(__s64))) goto out;
- ret = validate_range(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + range = uffdio_zeropage.range; + ret = validate_range(ctx->mm, &range); if (ret) goto out; ret = -EINVAL; @@ -1807,8 +1797,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, goto out;
if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len, + ret = mfill_atomic_zeropage(ctx->mm, &uffdio_zeropage.range, &ctx->mmap_changing); mmput(ctx->mm); } else { @@ -1822,7 +1811,6 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, BUG_ON(!ret); range.len = ret; if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { - range.start = uffdio_zeropage.range.start; wake_userfault(ctx, &range); } ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; @@ -1836,7 +1824,6 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, int ret; struct uffdio_writeprotect uffdio_wp; struct uffdio_writeprotect __user *user_uffdio_wp; - struct userfaultfd_wake_range range; bool mode_wp, mode_dontwake;
if (atomic_read(&ctx->mmap_changing)) @@ -1848,8 +1835,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, sizeof(struct uffdio_writeprotect))) return -EFAULT;
- ret = validate_range(ctx->mm, uffdio_wp.range.start, - uffdio_wp.range.len); + ret = validate_range(ctx->mm, &uffdio_wp.range); if (ret) return ret;
@@ -1864,9 +1850,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, return -EINVAL;
if (mmget_not_zero(ctx->mm)) { - ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, - uffdio_wp.range.len, mode_wp, - &ctx->mmap_changing); + ret = mwriteprotect_range(ctx->mm, &uffdio_wp.range, + mode_wp, &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1875,11 +1860,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, if (ret) return ret;
- if (!mode_wp && !mode_dontwake) { - range.start = uffdio_wp.range.start; - range.len = uffdio_wp.range.len; - wake_userfault(ctx, &range); - } + if (!mode_wp && !mode_dontwake) + wake_userfault(ctx, &uffdio_wp.range); return ret; }
@@ -1888,7 +1870,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) __s64 ret; struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; - struct userfaultfd_wake_range range; + struct uffdio_range range;
user_uffdio_continue = (struct uffdio_continue __user *)arg;
@@ -1902,23 +1884,20 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) sizeof(uffdio_continue) - (sizeof(__s64)))) goto out;
- ret = validate_range(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len); + range = uffdio_continue.range; + ret = validate_range(ctx->mm, &range); if (ret) goto out;
ret = -EINVAL; /* double check for wraparound just in case. */ - if (uffdio_continue.range.start + uffdio_continue.range.len <= - uffdio_continue.range.start) { + if (range.start + range.len <= range.start) goto out; - } if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) goto out;
if (mmget_not_zero(ctx->mm)) { - ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len, + ret = mfill_atomic_continue(ctx->mm, &range, &ctx->mmap_changing); mmput(ctx->mm); } else { @@ -1934,7 +1913,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) BUG_ON(!ret); range.len = ret; if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { - range.start = uffdio_continue.range.start; wake_userfault(ctx, &range); } ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 185024128e0f..3cf87f019db3 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -59,20 +59,21 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd, unsigned long dst_addr, struct page *page, bool newly_allocated, int mode_flags);
-extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len, +extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long src_start, + const struct uffdio_range *dst, atomic_t *mmap_changing, int flags); extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long len, + const struct uffdio_range *dst, atomic_t *mmap_changing); -extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing); +extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, + const struct uffdio_range *dst, + atomic_t *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, - unsigned long start, unsigned long len, + const struct uffdio_range *range, bool enable_wp, atomic_t *mmap_changing); extern void uffd_wp_range(struct vm_area_struct *vma, - unsigned long start, unsigned long len, bool enable_wp); + const struct uffdio_range *range, + bool enable_wp);
/* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7882e4c60f60..ebbba2b9f687 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -21,8 +21,7 @@
static __always_inline struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long len) + const struct uffdio_range *dst) { /* * Make sure that the dst range is both valid and fully within a @@ -30,12 +29,12 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, */ struct vm_area_struct *dst_vma;
- dst_vma = find_vma(dst_mm, dst_start); + dst_vma = find_vma(dst_mm, dst->start); if (!dst_vma) return NULL;
- if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) + if (dst->start < dst_vma->vm_start || + dst->start + dst->len > dst_vma->vm_end) return NULL;
/* @@ -323,9 +322,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) */ static __always_inline ssize_t mfill_atomic_hugetlb( struct vm_area_struct *dst_vma, - unsigned long dst_start, unsigned long src_start, - unsigned long len, + const struct uffdio_range *dst, int mode_flags) { int mode = mode_flags & MFILL_ATOMIC_MODE_MASK; @@ -353,7 +351,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( }
src_addr = src_start; - dst_addr = dst_start; + dst_addr = dst->start; copied = 0; page = NULL; vma_hpagesize = vma_kernel_pagesize(dst_vma); @@ -362,7 +360,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * Validate alignment based on huge page size */ err = -EINVAL; - if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) + if (dst->start & (vma_hpagesize - 1) || dst->len & (vma_hpagesize - 1)) goto out_unlock;
retry: @@ -372,7 +370,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( */ if (!dst_vma) { err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); + dst_vma = find_dst_vma(dst_mm, dst); if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) goto out_unlock;
@@ -392,8 +390,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb( goto out_unlock; }
- while (src_addr < src_start + len) { - BUG_ON(dst_addr >= dst_start + len); + while (src_addr < src_start + dst->len) { + BUG_ON(dst_addr >= dst->start + dst->len);
/* * Serialize via vma_lock and hugetlb_fault_mutex. @@ -475,9 +473,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb( #else /* !CONFIG_HUGETLB_PAGE */ /* fail at build time if gcc attempts to use this */ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma, - unsigned long dst_start, unsigned long src_start, - unsigned long len, + struct uffdio_range dst, int mode_flags); #endif /* CONFIG_HUGETLB_PAGE */
@@ -524,9 +521,8 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, }
static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, - unsigned long dst_start, unsigned long src_start, - unsigned long len, + const struct uffdio_range *dst, atomic_t *mmap_changing, int mode_flags) { @@ -540,15 +536,15 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, /* * Sanitize the command parameters: */ - BUG_ON(dst_start & ~PAGE_MASK); - BUG_ON(len & ~PAGE_MASK); + BUG_ON(dst->start & ~PAGE_MASK); + BUG_ON(dst->len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */ - BUG_ON(src_start + len <= src_start); - BUG_ON(dst_start + len <= dst_start); + BUG_ON(src_start + dst->len <= src_start); + BUG_ON(dst->start + dst->len <= dst->start);
src_addr = src_start; - dst_addr = dst_start; + dst_addr = dst->start; copied = 0; page = NULL; retry: @@ -568,7 +564,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, * both valid and fully within a single existing vma. */ err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, dst_start, len); + dst_vma = find_dst_vma(dst_mm, dst); if (!dst_vma) goto out_unlock;
@@ -592,8 +588,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(dst_vma, dst_start, - src_start, len, mode_flags); + return mfill_atomic_hugetlb(dst_vma, src_start, dst, + mode_flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; @@ -611,10 +607,10 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, unlikely(anon_vma_prepare(dst_vma))) goto out_unlock;
- while (src_addr < src_start + len) { + while (src_addr < src_start + dst->len) { pmd_t dst_pmdval;
- BUG_ON(dst_addr >= dst_start + len); + BUG_ON(dst_addr >= dst->start + dst->len);
dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); if (unlikely(!dst_pmd)) { @@ -692,30 +688,32 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, return copied ? copied : err; }
-ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len, +ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long src_start, + const struct uffdio_range *dst, atomic_t *mmap_changing, int flags) { - return mfill_atomic(dst_mm, dst_start, src_start, len, + return mfill_atomic(dst_mm, src_start, dst, mmap_changing, flags | MFILL_ATOMIC_COPY); }
-ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) +ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, + const struct uffdio_range *dst, + atomic_t *mmap_changing) { - return mfill_atomic(dst_mm, start, 0, len, + return mfill_atomic(dst_mm, 0, dst, mmap_changing, MFILL_ATOMIC_ZEROPAGE); }
-ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) +ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, + const struct uffdio_range *dst, + atomic_t *mmap_changing) { - return mfill_atomic(dst_mm, start, 0, len, + return mfill_atomic(dst_mm, 0, dst, mmap_changing, MFILL_ATOMIC_CONTINUE); }
void uffd_wp_range(struct vm_area_struct *dst_vma, - unsigned long start, unsigned long len, bool enable_wp) + const struct uffdio_range *range, bool enable_wp) { struct mmu_gather tlb; pgprot_t newprot; @@ -726,14 +724,15 @@ void uffd_wp_range(struct vm_area_struct *dst_vma, newprot = vm_get_page_prot(dst_vma->vm_flags);
tlb_gather_mmu(&tlb, dst_vma->vm_mm); - change_protection(&tlb, dst_vma, start, start + len, newprot, + change_protection(&tlb, dst_vma, range->start, + range->start + range->len, newprot, enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); tlb_finish_mmu(&tlb); }
-int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool enable_wp, - atomic_t *mmap_changing) +int mwriteprotect_range(struct mm_struct *dst_mm, + const struct uffdio_range *dst, + bool enable_wp, atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; unsigned long page_mask; @@ -742,11 +741,11 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, /* * Sanitize the command parameters: */ - BUG_ON(start & ~PAGE_MASK); - BUG_ON(len & ~PAGE_MASK); + BUG_ON(dst->start & ~PAGE_MASK); + BUG_ON(dst->len & ~PAGE_MASK);
/* Does the address range wrap, or is the span zero-sized? */ - BUG_ON(start + len <= start); + BUG_ON(dst->start + dst->len <= dst->start);
mmap_read_lock(dst_mm);
@@ -760,7 +759,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, goto out_unlock;
err = -ENOENT; - dst_vma = find_dst_vma(dst_mm, start, len); + dst_vma = find_dst_vma(dst_mm, dst);
if (!dst_vma) goto out_unlock; @@ -772,11 +771,11 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, if (is_vm_hugetlb_page(dst_vma)) { err = -EINVAL; page_mask = vma_kernel_pagesize(dst_vma) - 1; - if ((start & page_mask) || (len & page_mask)) + if ((dst->start & page_mask) || (dst->len & page_mask)) goto out_unlock; }
- uffd_wp_range(dst_vma, start, len, enable_wp); + uffd_wp_range(dst_vma, dst, enable_wp);
err = 0; out_unlock:
UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new PTE to resolve a missing fault, one can install a write-protected one. This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in combination.
So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing but for *minor* faults.
Update the selftest to do some very basic exercising of the new flag.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- fs/userfaultfd.c | 8 ++++++-- include/linux/userfaultfd_k.h | 2 +- include/uapi/linux/userfaultfd.h | 7 +++++++ mm/userfaultfd.c | 5 +++-- tools/testing/selftests/vm/userfaultfd.c | 4 ++++ 5 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f43d8b0ae47b..5ce4502737a1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1871,6 +1871,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) struct uffdio_continue uffdio_continue; struct uffdio_continue __user *user_uffdio_continue; struct uffdio_range range; + int flags = 0;
user_uffdio_continue = (struct uffdio_continue __user *)arg;
@@ -1893,12 +1894,15 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) /* double check for wraparound just in case. */ if (range.start + range.len <= range.start) goto out; - if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | + UFFDIO_CONTINUE_MODE_WP)) goto out; + if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP) + flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_continue(ctx->mm, &range, - &ctx->mmap_changing); + &ctx->mmap_changing, flags); mmput(ctx->mm); } else { return -ESRCH; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 3cf87f019db3..1e1229430f56 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -67,7 +67,7 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, atomic_t *mmap_changing); extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, const struct uffdio_range *dst, - atomic_t *mmap_changing); + atomic_t *mmap_changing, int flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, const struct uffdio_range *range, bool enable_wp, atomic_t *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 005e5e306266..14059a0861bf 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -297,6 +297,13 @@ struct uffdio_writeprotect { struct uffdio_continue { struct uffdio_range range; #define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + /* + * UFFDIO_CONTINUE_MODE_WP will map the page write protected on + * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the + * write protected ioctl is implemented for the range + * according to the uffdio_register.ioctls. + */ +#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1) __u64 mode;
/* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ebbba2b9f687..03f388aaa405 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -706,10 +706,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, const struct uffdio_range *dst, - atomic_t *mmap_changing) + atomic_t *mmap_changing, + int flags) { return mfill_atomic(dst_mm, 0, dst, - mmap_changing, MFILL_ATOMIC_CONTINUE); + mmap_changing, flags | MFILL_ATOMIC_CONTINUE); }
void uffd_wp_range(struct vm_area_struct *dst_vma, diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7f22844ed704..41c1f9abc481 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len) req.range.start = start; req.range.len = len; req.mode = 0; + if (test_uffdio_wp) + req.mode |= UFFDIO_CONTINUE_MODE_WP;
if (ioctl(ufd, UFFDIO_CONTINUE, &req)) err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, @@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void) uffdio_register.range.start = (unsigned long)area_dst_alias; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) err("register failure");
linux-kselftest-mirror@lists.linaro.org