Base ====
This series is based on top of my series which adds minor fault handling for hugetlbfs [1]. (And, therefore, it is based on 5.12-rc1 and Peter Xu's series for disabling huge pmd sharing as well.)
[1] https://lore.kernel.org/linux-fsdevel/20210301222728.176417-1-axelrasmussen@...
Changelog =========
v1->v2: - For UFFDIO_CONTINUE, don't mess with page flags. Just use find_lock_page to get a locked page from the page cache, instead of doing __SetPageLocked. This fixes a VM_BUG_ON v1 hit when handling minor faults for THP-backed shmem (a tmpfs mounted with huge=always).
Overview ========
See my original series linked above for a detailed overview of minor fault handling in general. The feature in this series works exactly like the hugetlbfs version (from userspace's perspective).
I'm sending this as a separate series because:
- The original minor fault handling series has a full set of R-Bs, and seems close to being merged. So, it seems reasonable to start looking at this next step, which extends the basic functionality.
- shmem is different enough that this series may require some additional work before it's ready, and I don't want to delay the original series unnecessarily by bundling them together.
Use Case ========
In some cases it is useful to have VM memory backed by tmpfs instead of hugetlbfs. So, this feature will be used to support the same VM live migration use case described in my original series.
Additionally, Android folks (Lokesh Gidra <lokeshgidra@google.com>) hope to optimize the Android Runtime garbage collector using this feature:
"The plan is to use userfaultfd for concurrently compacting the heap. With this feature, the heap can be shared-mapped at another location where the GC-thread(s) could continue the compaction operation without the need to invoke userfault ioctl(UFFDIO_COPY) each time. OTOH, if and when Java threads get faults on the heap, UFFDIO_CONTINUE can be used to resume execution. Furthermore, this feature enables updating references in the 'non-moving' portion of the heap efficiently. Without this feature, unnecessary page copying (ioctl(UFFDIO_COPY)) would be required."
Axel Rasmussen (5): userfaultfd: support minor fault handling for shmem userfaultfd/selftests: use memfd_create for shmem test type userfaultfd/selftests: create alias mappings in the shmem test userfaultfd/selftests: reinitialize test context in each test userfaultfd/selftests: exercise minor fault handling shmem support
fs/userfaultfd.c | 6 +- include/linux/shmem_fs.h | 26 +- include/uapi/linux/userfaultfd.h | 4 +- mm/memory.c | 8 +- mm/shmem.c | 92 +++---- mm/userfaultfd.c | 27 +- tools/testing/selftests/vm/userfaultfd.c | 322 +++++++++++++++-------- 7 files changed, 295 insertions(+), 190 deletions(-)
-- 2.30.1.766.gb4fecdf3b7-goog
Modify the userfaultfd register API to allow registering shmem VMAs in minor mode. Modify the shmem mcopy implementation to support UFFDIO_CONTINUE in order to resolve such faults.
Combine the shmem mcopy handler functions into a single shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how the hugetlbfs implementation is structured, and lets us remove a good chunk of boilerplate.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- fs/userfaultfd.c | 6 +-- include/linux/shmem_fs.h | 26 ++++----- include/uapi/linux/userfaultfd.h | 4 +- mm/memory.c | 8 +-- mm/shmem.c | 92 +++++++++++++++----------------- mm/userfaultfd.c | 27 +++++----- 6 files changed, 79 insertions(+), 84 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 14f92285d04f..9f3b8684cf3c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, }
if (vm_flags & VM_UFFD_MINOR) { - /* FIXME: Add minor fault interception for shmem. */ - if (!is_vm_hugetlb_page(vma)) + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) return false; }
@@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; + uffdio_api.features &= + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d82b6f396588..f0919c3722e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -9,6 +9,7 @@ #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> +#include <linux/userfaultfd_k.h>
/* inode in-kernel data */
@@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages);
+#ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep); -extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr); -#else -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ - src_addr, pagep) ({ BUG(); 0; }) -#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \ - dst_addr) ({ BUG(); 0; }) -#endif +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep); +#else /* !CONFIG_SHMEM */ +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ + src_addr, mode, pagep) ({ BUG(); 0; }) +#endif /* CONFIG_SHMEM */ +#endif /* CONFIG_USERFAULTFD */
#endif diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index bafbeb1a2624..47d9790d863d 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -31,7 +31,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ - UFFD_FEATURE_MINOR_HUGETLBFS) + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -196,6 +197,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) __u64 features;
__u64 ioctls; diff --git a/mm/memory.c b/mm/memory.c index c8e357627318..a1e5ff55027e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(vmf); - if (ret) - return ret; + if (likely(!userfaultfd_minor(vmf->vma))) { + ret = do_fault_around(vmf); + if (ret) + return ret; + } }
ret = __do_fault(vmf); diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..6f81259fabb3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt; #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> -#include <linux/userfaultfd_k.h> #include <linux/rmap.h> #include <linux/uuid.h>
@@ -1785,8 +1784,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vmf and fault_type are only supplied by shmem_fault: - * otherwise they are NULL. + * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they + * are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, @@ -1830,6 +1829,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return error; }
+ if (page && vma && userfaultfd_minor(vma)) { + unlock_page(page); + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; + } + if (page) hindex = page->index; if (page && sgp == SGP_WRITE) @@ -2354,14 +2359,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; }
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - bool zeropage, - struct page **pagep) +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; @@ -2378,12 +2381,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!shmem_inode_acct_block(inode, 1)) goto out;
- if (!*pagep) { + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, pgoff); + if (!page) + goto out_unacct_blocks; + } else if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_unacct_blocks;
- if (!zeropage) { /* mcopy_atomic */ + if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */ page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, @@ -2397,7 +2405,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* don't free the page */ return -ENOENT; } - } else { /* mfill_zeropage_atomic */ + } else { /* zeropage */ clear_highpage(page); } } else { @@ -2405,10 +2413,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; }
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); - __SetPageLocked(page); - __SetPageSwapBacked(page); - __SetPageUptodate(page); + if (!is_continue) { + VM_BUG_ON(PageSwapBacked(page)); + VM_BUG_ON(PageLocked(page)); + __SetPageLocked(page); + __SetPageSwapBacked(page); + __SetPageUptodate(page); + }
ret = -EFAULT; offset = linear_page_index(dst_vma, dst_addr); @@ -2416,10 +2427,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release;
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_mm); - if (ret) - goto out_release; + /* If page wasn't already in the page cache, add it. */ + if (!is_continue) { + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK, dst_mm); + if (ret) + goto out_release; + }
_dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2446,13 +2460,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!pte_none(*dst_pte)) goto out_release_unlock;
- lru_cache_add(page); + if (!is_continue) { + lru_cache_add(page);
- spin_lock_irq(&info->lock); - info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + spin_lock_irq(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + }
inc_mm_counter(dst_mm, mm_counter_file(page)); page_add_file_rmap(page, false); @@ -2477,28 +2493,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, goto out; }
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) -{ - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, false, pagep); -} - -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) -{ - struct page *page = NULL; - - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, 0, true, &page); -} - #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ce6cb4760d2c..6cd7ab531aec 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, struct page **page, - bool zeropage, + enum mcopy_atomic_mode mode, bool wp_copy) { ssize_t err; @@ -431,22 +431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, * and not in the radix tree. */ if (!(dst_vma->vm_flags & VM_SHARED)) { - if (!zeropage) + switch (mode) { + case MCOPY_ATOMIC_NORMAL: err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page, wp_copy); - else + break; + case MCOPY_ATOMIC_ZEROPAGE: err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); + break; + case MCOPY_ATOMIC_CONTINUE: + err = -EINVAL; + break; + } } else { VM_WARN_ON_ONCE(wp_copy); - if (!zeropage) - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, - dst_vma, dst_addr, - src_addr, page); - else - err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, - dst_vma, dst_addr); + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + src_addr, mode, page); }
return err; @@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy; - bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
/* * Sanitize the command parameters: @@ -530,7 +531,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; - if (mcopy_mode == MCOPY_ATOMIC_CONTINUE) + if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE) goto out_unlock;
/* @@ -578,7 +579,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage, wp_copy); + src_addr, &page, mcopy_mode, wp_copy); cond_resched();
if (unlikely(err == -ENOENT)) {
On 1 Mar 2021, at 19:01, Axel Rasmussen wrote:
Modify the userfaultfd register API to allow registering shmem VMAs in minor mode. Modify the shmem mcopy implementation to support UFFDIO_CONTINUE in order to resolve such faults.
Combine the shmem mcopy handler functions into a single shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how the hugetlbfs implementation is structured, and lets us remove a good chunk of boilerplate.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com
fs/userfaultfd.c | 6 +-- include/linux/shmem_fs.h | 26 ++++----- include/uapi/linux/userfaultfd.h | 4 +- mm/memory.c | 8 +-- mm/shmem.c | 92 +++++++++++++++----------------- mm/userfaultfd.c | 27 +++++----- 6 files changed, 79 insertions(+), 84 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 14f92285d04f..9f3b8684cf3c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, }
if (vm_flags & VM_UFFD_MINOR) {
/* FIXME: Add minor fault interception for shmem. */
if (!is_vm_hugetlb_page(vma))
}if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) return false;
@@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
- uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
- uffdio_api.features &=
~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d82b6f396588..f0919c3722e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -9,6 +9,7 @@ #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> +#include <linux/userfaultfd_k.h>
/* inode in-kernel data */
@@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages);
+#ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
struct page **pagep);
-extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr);
-#else -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
src_addr, pagep) ({ BUG(); 0; })
-#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
dst_addr) ({ BUG(); 0; })
-#endif +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
enum mcopy_atomic_mode mode, struct page **pagep);
+#else /* !CONFIG_SHMEM */ +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
src_addr, mode, pagep) ({ BUG(); 0; })
+#endif /* CONFIG_SHMEM */ +#endif /* CONFIG_USERFAULTFD */
#endif diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index bafbeb1a2624..47d9790d863d 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -31,7 +31,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \
UFFD_FEATURE_MINOR_HUGETLBFS)
UFFD_FEATURE_MINOR_HUGETLBFS | \
UFFD_FEATURE_MINOR_SHMEM)
#define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -196,6 +197,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) __u64 features;
__u64 ioctls; diff --git a/mm/memory.c b/mm/memory.c index c8e357627318..a1e5ff55027e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
ret = do_fault_around(vmf);
if (ret)
return ret;
if (likely(!userfaultfd_minor(vmf->vma))) {
ret = do_fault_around(vmf);
if (ret)
return ret;
}
}
ret = __do_fault(vmf);
diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..6f81259fabb3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt; #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> -#include <linux/userfaultfd_k.h> #include <linux/rmap.h> #include <linux/uuid.h>
@@ -1785,8 +1784,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
- vm. If we swap it in we mark it dirty since we also free the swap
- entry since a page cannot live in both the swap and page cache.
- vmf and fault_type are only supplied by shmem_fault:
- otherwise they are NULL.
- vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they
*/
- are NULL.
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, @@ -1830,6 +1829,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return error; }
- if (page && vma && userfaultfd_minor(vma)) {
unlock_page(page);
*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
return 0;
- }
- if (page) hindex = page->index; if (page && sgp == SGP_WRITE)
@@ -2354,14 +2359,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; }
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
bool zeropage,
struct page **pagep)
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
enum mcopy_atomic_mode mode, struct page **pagep)
{
- bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping;
@@ -2378,12 +2381,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!shmem_inode_acct_block(inode, 1)) goto out;
- if (!*pagep) {
- if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, pgoff);
if (!page)
goto out_unacct_blocks;
- } else if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_unacct_blocks;
if (!zeropage) { /* mcopy_atomic */
if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */ page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
Hi Axel,
The definition of shmem_mcopy_atomic_pte() in mm/shmem.c is not guarded by CONFIG_USERFAULTFD, so it causes compilation errors when CONFIG_USERFAULTFD is not set, because its "enum mcopy_atomic_mode mode" parameter uses a type that is not defined in that configuration.
— Best Regards, Yan Zi
On Tue, Mar 9, 2021 at 11:52 AM Zi Yan ziy@nvidia.com wrote:
On 1 Mar 2021, at 19:01, Axel Rasmussen wrote:
Modify the userfaultfd register API to allow registering shmem VMAs in minor mode. Modify the shmem mcopy implementation to support UFFDIO_CONTINUE in order to resolve such faults.
Combine the shmem mcopy handler functions into a single shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how the hugetlbfs implementation is structured, and lets us remove a good chunk of boilerplate.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com
fs/userfaultfd.c | 6 +-- include/linux/shmem_fs.h | 26 ++++----- include/uapi/linux/userfaultfd.h | 4 +- mm/memory.c | 8 +-- mm/shmem.c | 92 +++++++++++++++----------------- mm/userfaultfd.c | 27 +++++----- 6 files changed, 79 insertions(+), 84 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 14f92285d04f..9f3b8684cf3c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, }
if (vm_flags & VM_UFFD_MINOR) {
/* FIXME: Add minor fault interception for shmem. */
if (!is_vm_hugetlb_page(vma))
if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) return false; }
@@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
uffdio_api.features &=
~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d82b6f396588..f0919c3722e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -9,6 +9,7 @@ #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> +#include <linux/userfaultfd_k.h>
/* inode in-kernel data */
@@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages);
+#ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
struct page **pagep);
-extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr);
-#else -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
src_addr, pagep) ({ BUG(); 0; })
-#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
dst_addr) ({ BUG(); 0; })
-#endif +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
enum mcopy_atomic_mode mode, struct page **pagep);
+#else /* !CONFIG_SHMEM */ +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
src_addr, mode, pagep) ({ BUG(); 0; })
+#endif /* CONFIG_SHMEM */ +#endif /* CONFIG_USERFAULTFD */
#endif diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index bafbeb1a2624..47d9790d863d 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -31,7 +31,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \
UFFD_FEATURE_MINOR_HUGETLBFS)
UFFD_FEATURE_MINOR_HUGETLBFS | \
UFFD_FEATURE_MINOR_SHMEM)
#define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -196,6 +197,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) __u64 features;
__u64 ioctls;
diff --git a/mm/memory.c b/mm/memory.c index c8e357627318..a1e5ff55027e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
ret = do_fault_around(vmf);
if (ret)
return ret;
if (likely(!userfaultfd_minor(vmf->vma))) {
ret = do_fault_around(vmf);
if (ret)
return ret;
} } ret = __do_fault(vmf);
diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..6f81259fabb3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt; #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> -#include <linux/userfaultfd_k.h> #include <linux/rmap.h> #include <linux/uuid.h>
@@ -1785,8 +1784,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
- vm. If we swap it in we mark it dirty since we also free the swap
- entry since a page cannot live in both the swap and page cache.
- vmf and fault_type are only supplied by shmem_fault:
- otherwise they are NULL.
- vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they
*/
- are NULL.
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, @@ -1830,6 +1829,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return error; }
if (page && vma && userfaultfd_minor(vma)) {
unlock_page(page);
*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
return 0;
}
if (page) hindex = page->index; if (page && sgp == SGP_WRITE)
@@ -2354,14 +2359,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; }
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
bool zeropage,
struct page **pagep)
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
enum mcopy_atomic_mode mode, struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping;
@@ -2378,12 +2381,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!shmem_inode_acct_block(inode, 1)) goto out;
if (!*pagep) {
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, pgoff);
if (!page)
goto out_unacct_blocks;
} else if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_unacct_blocks;
if (!zeropage) { /* mcopy_atomic */
if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */ page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
Hi Axel,
The definition of shmem_mcopy_atomic_pte() in mm/shmem.c is not guarded by CONFIG_USERFAULTFD, so it causes compilation errors when CONFIG_USERFAULTFD is not set, because its "enum mcopy_atomic_mode mode" parameter uses a type that is not defined in that configuration.
Ah, my apologies, I guarded it in the header but forgot to do so in shmem.c. I'll send an updated patch today.
— Best Regards, Yan Zi
Modify the userfaultfd register API to allow registering shmem VMAs in minor mode. Modify the shmem mcopy implementation to support UFFDIO_CONTINUE in order to resolve such faults.
Combine the shmem mcopy handler functions into a single shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how the hugetlbfs implementation is structured, and lets us remove a good chunk of boilerplate.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- fs/userfaultfd.c | 6 +- include/linux/shmem_fs.h | 26 ++++----- include/uapi/linux/userfaultfd.h | 4 +- mm/memory.c | 8 ++- mm/shmem.c | 94 +++++++++++++++----------------- mm/userfaultfd.c | 27 ++++----- 6 files changed, 81 insertions(+), 84 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 14f92285d04f..9f3b8684cf3c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, }
if (vm_flags & VM_UFFD_MINOR) { - /* FIXME: Add minor fault interception for shmem. */ - if (!is_vm_hugetlb_page(vma)) + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) return false; }
@@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; + uffdio_api.features &= + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d82b6f396588..f0919c3722e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -9,6 +9,7 @@ #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> +#include <linux/userfaultfd_k.h>
/* inode in-kernel data */
@@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages);
+#ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep); -extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr); -#else -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ - src_addr, pagep) ({ BUG(); 0; }) -#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \ - dst_addr) ({ BUG(); 0; }) -#endif +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep); +#else /* !CONFIG_SHMEM */ +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \ + src_addr, mode, pagep) ({ BUG(); 0; }) +#endif /* CONFIG_SHMEM */ +#endif /* CONFIG_USERFAULTFD */
#endif diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index bafbeb1a2624..47d9790d863d 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -31,7 +31,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \ - UFFD_FEATURE_MINOR_HUGETLBFS) + UFFD_FEATURE_MINOR_HUGETLBFS | \ + UFFD_FEATURE_MINOR_SHMEM) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -196,6 +197,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) __u64 features;
__u64 ioctls; diff --git a/mm/memory.c b/mm/memory.c index c8e357627318..a1e5ff55027e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(vmf); - if (ret) - return ret; + if (likely(!userfaultfd_minor(vmf->vma))) { + ret = do_fault_around(vmf); + if (ret) + return ret; + } }
ret = __do_fault(vmf); diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..ef8c9f5e92fc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt; #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> -#include <linux/userfaultfd_k.h> #include <linux/rmap.h> #include <linux/uuid.h>
@@ -1785,8 +1784,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vmf and fault_type are only supplied by shmem_fault: - * otherwise they are NULL. + * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they + * are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, @@ -1830,6 +1829,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return error; }
+ if (page && vma && userfaultfd_minor(vma)) { + unlock_page(page); + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; + } + if (page) hindex = page->index; if (page && sgp == SGP_WRITE) @@ -2354,14 +2359,13 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; }
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - bool zeropage, - struct page **pagep) +#ifdef CONFIG_USERFAULTFD +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; @@ -2378,12 +2382,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!shmem_inode_acct_block(inode, 1)) goto out;
- if (!*pagep) { + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, pgoff); + if (!page) + goto out_unacct_blocks; + } else if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_unacct_blocks;
- if (!zeropage) { /* mcopy_atomic */ + if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */ page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, @@ -2397,7 +2406,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* don't free the page */ return -ENOENT; } - } else { /* mfill_zeropage_atomic */ + } else { /* zeropage */ clear_highpage(page); } } else { @@ -2405,10 +2414,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; }
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); - __SetPageLocked(page); - __SetPageSwapBacked(page); - __SetPageUptodate(page); + if (!is_continue) { + VM_BUG_ON(PageSwapBacked(page)); + VM_BUG_ON(PageLocked(page)); + __SetPageLocked(page); + __SetPageSwapBacked(page); + __SetPageUptodate(page); + }
ret = -EFAULT; offset = linear_page_index(dst_vma, dst_addr); @@ -2416,10 +2428,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release;
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_mm); - if (ret) - goto out_release; + /* If page wasn't already in the page cache, add it. */ + if (!is_continue) { + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK, dst_mm); + if (ret) + goto out_release; + }
_dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE) @@ -2446,13 +2461,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!pte_none(*dst_pte)) goto out_release_unlock;
- lru_cache_add(page); + if (!is_continue) { + lru_cache_add(page);
- spin_lock_irq(&info->lock); - info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + spin_lock_irq(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + }
inc_mm_counter(dst_mm, mm_counter_file(page)); page_add_file_rmap(page, false); @@ -2476,28 +2493,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_inode_unacct_blocks(inode, 1); goto out; } - -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) -{ - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, false, pagep); -} - -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) -{ - struct page *page = NULL; - - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, 0, true, &page); -} +#endif /* CONFIG_USERFAULTFD */
#ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ce6cb4760d2c..6cd7ab531aec 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, struct page **page, - bool zeropage, + enum mcopy_atomic_mode mode, bool wp_copy) { ssize_t err; @@ -431,22 +431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, * and not in the radix tree. */ if (!(dst_vma->vm_flags & VM_SHARED)) { - if (!zeropage) + switch (mode) { + case MCOPY_ATOMIC_NORMAL: err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page, wp_copy); - else + break; + case MCOPY_ATOMIC_ZEROPAGE: err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); + break; + case MCOPY_ATOMIC_CONTINUE: + err = -EINVAL; + break; + } } else { VM_WARN_ON_ONCE(wp_copy); - if (!zeropage) - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, - dst_vma, dst_addr, - src_addr, page); - else - err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, - dst_vma, dst_addr); + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + src_addr, mode, page); }
return err; @@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy; - bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
/* * Sanitize the command parameters: @@ -530,7 +531,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; - if (mcopy_mode == MCOPY_ATOMIC_CONTINUE) + if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE) goto out_unlock;
/* @@ -578,7 +579,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage, wp_copy); + src_addr, &page, mcopy_mode, wp_copy); cond_resched();
if (unlikely(err == -ENOENT)) { -- 2.30.1.766.gb4fecdf3b7-goog
+Stephen to CC, as an FYI.
On Tue, Mar 9, 2021 at 2:58 PM Axel Rasmussen axelrasmussen@google.com wrote:
Modify the userfaultfd register API to allow registering shmem VMAs in minor mode. Modify the shmem mcopy implementation to support UFFDIO_CONTINUE in order to resolve such faults.
Combine the shmem mcopy handler functions into a single shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how the hugetlbfs implementation is structured, and lets us remove a good chunk of boilerplate.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com
fs/userfaultfd.c | 6 +- include/linux/shmem_fs.h | 26 ++++----- include/uapi/linux/userfaultfd.h | 4 +- mm/memory.c | 8 ++- mm/shmem.c | 94 +++++++++++++++----------------- mm/userfaultfd.c | 27 ++++----- 6 files changed, 81 insertions(+), 84 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 14f92285d04f..9f3b8684cf3c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, }
if (vm_flags & VM_UFFD_MINOR) {
/* FIXME: Add minor fault interception for shmem. */
if (!is_vm_hugetlb_page(vma))
if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma))) return false; }
@@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
uffdio_api.features &=
~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index d82b6f396588..f0919c3722e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -9,6 +9,7 @@ #include <linux/percpu_counter.h> #include <linux/xattr.h> #include <linux/fs_parser.h> +#include <linux/userfaultfd_k.h>
/* inode in-kernel data */
@@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages);
+#ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
struct page **pagep);
-extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr);
-#else -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
src_addr, pagep) ({ BUG(); 0; })
-#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
dst_addr) ({ BUG(); 0; })
-#endif +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
enum mcopy_atomic_mode mode, struct page **pagep);
+#else /* !CONFIG_SHMEM */ +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
src_addr, mode, pagep) ({ BUG(); 0; })
+#endif /* CONFIG_SHMEM */ +#endif /* CONFIG_USERFAULTFD */
#endif diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index bafbeb1a2624..47d9790d863d 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -31,7 +31,8 @@ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ UFFD_FEATURE_THREAD_ID | \
UFFD_FEATURE_MINOR_HUGETLBFS)
UFFD_FEATURE_MINOR_HUGETLBFS | \
UFFD_FEATURE_MINOR_SHMEM)
#define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -196,6 +197,7 @@ struct uffdio_api { #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) +#define UFFD_FEATURE_MINOR_SHMEM (1<<10) __u64 features;
__u64 ioctls;
diff --git a/mm/memory.c b/mm/memory.c index c8e357627318..a1e5ff55027e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
ret = do_fault_around(vmf);
if (ret)
return ret;
if (likely(!userfaultfd_minor(vmf->vma))) {
ret = do_fault_around(vmf);
if (ret)
return ret;
} } ret = __do_fault(vmf);
diff --git a/mm/shmem.c b/mm/shmem.c index b2db4ed0fbc7..ef8c9f5e92fc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt; #include <linux/syscalls.h> #include <linux/fcntl.h> #include <uapi/linux/memfd.h> -#include <linux/userfaultfd_k.h> #include <linux/rmap.h> #include <linux/uuid.h>
@@ -1785,8 +1784,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
- vm. If we swap it in we mark it dirty since we also free the swap
- entry since a page cannot live in both the swap and page cache.
- vmf and fault_type are only supplied by shmem_fault:
- otherwise they are NULL.
- vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they
*/
- are NULL.
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, @@ -1830,6 +1829,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return error; }
if (page && vma && userfaultfd_minor(vma)) {
unlock_page(page);
*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
return 0;
}
if (page) hindex = page->index; if (page && sgp == SGP_WRITE)
@@ -2354,14 +2359,13 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; }
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
bool zeropage,
struct page **pagep)
+#ifdef CONFIG_USERFAULTFD +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
enum mcopy_atomic_mode mode, struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping;
@@ -2378,12 +2382,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!shmem_inode_acct_block(inode, 1)) goto out;
if (!*pagep) {
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, pgoff);
if (!page)
goto out_unacct_blocks;
} else if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) goto out_unacct_blocks;
if (!zeropage) { /* mcopy_atomic */
if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */ page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
@@ -2397,7 +2406,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, /* don't free the page */ return -ENOENT; }
} else { /* mfill_zeropage_atomic */
} else { /* zeropage */ clear_highpage(page); } } else {
@@ -2405,10 +2414,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, *pagep = NULL; }
VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
__SetPageLocked(page);
__SetPageSwapBacked(page);
__SetPageUptodate(page);
if (!is_continue) {
VM_BUG_ON(PageSwapBacked(page));
VM_BUG_ON(PageLocked(page));
__SetPageLocked(page);
__SetPageSwapBacked(page);
__SetPageUptodate(page);
} ret = -EFAULT; offset = linear_page_index(dst_vma, dst_addr);
@@ -2416,10 +2428,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (unlikely(offset >= max_off)) goto out_release;
ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
goto out_release;
/* If page wasn't already in the page cache, add it. */
if (!is_continue) {
ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
goto out_release;
} _dst_pte = mk_pte(page, dst_vma->vm_page_prot); if (dst_vma->vm_flags & VM_WRITE)
@@ -2446,13 +2461,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!pte_none(*dst_pte)) goto out_release_unlock;
lru_cache_add(page);
if (!is_continue) {
lru_cache_add(page);
spin_lock_irq(&info->lock);
info->alloced++;
inode->i_blocks += BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
spin_lock_irq(&info->lock);
info->alloced++;
inode->i_blocks += BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
} inc_mm_counter(dst_mm, mm_counter_file(page)); page_add_file_rmap(page, false);
@@ -2476,28 +2493,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_inode_unacct_blocks(inode, 1); goto out; }
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
struct page **pagep)
-{
return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr, false, pagep);
-}
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr)
-{
struct page *page = NULL;
return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, 0, true, &page);
-} +#endif /* CONFIG_USERFAULTFD */
#ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ce6cb4760d2c..6cd7ab531aec 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, struct page **page,
bool zeropage,
enum mcopy_atomic_mode mode, bool wp_copy)
{ ssize_t err; @@ -431,22 +431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, * and not in the radix tree. */ if (!(dst_vma->vm_flags & VM_SHARED)) {
if (!zeropage)
switch (mode) {
case MCOPY_ATOMIC_NORMAL: err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page, wp_copy);
else
break;
case MCOPY_ATOMIC_ZEROPAGE: err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr);
break;
case MCOPY_ATOMIC_CONTINUE:
err = -EINVAL;
break;
} } else { VM_WARN_ON_ONCE(wp_copy);
if (!zeropage)
err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
dst_vma, dst_addr,
src_addr, page);
else
err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
dst_vma, dst_addr);
err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
src_addr, mode, page); } return err;
@@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy;
bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE); /* * Sanitize the command parameters:
@@ -530,7 +531,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock;
if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE) goto out_unlock; /*
@@ -578,7 +579,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
src_addr, &page, zeropage, wp_copy);
src_addr, &page, mcopy_mode, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) {
-- 2.30.1.766.gb4fecdf3b7-goog
This is a preparatory commit. In the future, we want to be able to set up alias mappings for area_src and area_dst in the shmem test, like we do in the hugetlb_shared test. With a VMA obtained via mmap(MAP_ANONYMOUS | MAP_SHARED), it isn't clear how to do this.
So, mmap() with an fd, so we can create alias mappings. Use memfd_create instead of actually passing in a tmpfs path like hugetlb does, since it's more convenient / simpler to run, and works just as well.
Future commits will:
1. Set up the alias mappings. 2. Extend our tests to actually take advantage of this, to test new userfaultfd behavior being introduced in this series.
Also, a small fix in the area we're changing: when the hugetlb setup fails in main(), pass in the right argv[] so we actually print out the hugetlb file path.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- tools/testing/selftests/vm/userfaultfd.c | 35 ++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-)
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index f5ab5e0312e7..859398efb4fe 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -85,6 +85,7 @@ static bool test_uffdio_wp = false; static bool test_uffdio_minor = false;
static bool map_shared; +static int shm_fd; static int huge_fd; static char *huge_fd_off0; static unsigned long long *count_verify; @@ -297,12 +298,20 @@ static int shmem_release_pages(char *rel_area)
static void shmem_allocate_area(void **alloc_area) { + unsigned long offset = + alloc_area == (void **)&area_src ? 0 : nr_pages * page_size; + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_SHARED, -1, 0); + MAP_SHARED, shm_fd, offset); if (*alloc_area == MAP_FAILED) { - fprintf(stderr, "shared memory mmap failed\n"); - *alloc_area = NULL; + perror("mmap of memfd failed"); + goto fail; } + + return; + +fail: + *alloc_area = NULL; }
struct uffd_test_ops { @@ -1672,15 +1681,31 @@ int main(int argc, char **argv) usage(); huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); if (huge_fd < 0) { - fprintf(stderr, "Open of %s failed", argv[3]); + fprintf(stderr, "Open of %s failed", argv[4]); perror("open"); exit(1); } if (ftruncate(huge_fd, 0)) { - fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]); + fprintf(stderr, "ftruncate %s to size 0 failed", argv[4]); perror("ftruncate"); exit(1); } + } else if (test_type == TEST_SHMEM) { + shm_fd = memfd_create(argv[0], 0); + if (shm_fd < 0) { + perror("memfd_create"); + exit(1); + } + if (ftruncate(shm_fd, nr_pages * page_size * 2)) { + perror("ftruncate"); + exit(1); + } + if (fallocate(shm_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + nr_pages * page_size * 2)) { + perror("fallocate"); + exit(1); + } } printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", nr_pages, nr_pages_per_cpu);
Previously, we just allocated two shm areas: area_src and area_dst. With this commit, change this so we also allocate area_src_alias and area_dst_alias.
area_*_alias and area_* (respectively) point to the same underlying physical pages, but are different VMAs. In a future commit in this series, we'll leverage this setup to exercise minor fault handling support for shmem, just like we do in the hugetlb_shared test.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- tools/testing/selftests/vm/userfaultfd.c | 29 +++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-)
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 859398efb4fe..4a18590fe0f8 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -298,8 +298,9 @@ static int shmem_release_pages(char *rel_area)
static void shmem_allocate_area(void **alloc_area) { - unsigned long offset = - alloc_area == (void **)&area_src ? 0 : nr_pages * page_size; + void *area_alias = NULL; + bool is_src = alloc_area == (void **)&area_src; + unsigned long offset = is_src ? 0 : nr_pages * page_size;
*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, offset); @@ -308,12 +309,34 @@ static void shmem_allocate_area(void **alloc_area) goto fail; }
+ area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, shm_fd, offset); + if (area_alias == MAP_FAILED) { + perror("mmap of memfd alias failed"); + goto fail_munmap; + } + + if (is_src) + area_src_alias = area_alias; + else + area_dst_alias = area_alias; + return;
+fail_munmap: + if (munmap(*alloc_area, nr_pages * page_size) < 0) { + perror("munmap of memfd failed\n"); + exit(1); + } fail: *alloc_area = NULL; }
+static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + *start = (unsigned long)area_dst_alias + offset; +} + struct uffd_test_ops { unsigned long expected_ioctls; void (*allocate_area)(void **alloc_area); @@ -341,7 +364,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = { .expected_ioctls = SHMEM_EXPECTED_IOCTLS, .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, - .alias_mapping = noop_alias_mapping, + .alias_mapping = shmem_alias_mapping, };
static struct uffd_test_ops hugetlb_uffd_test_ops = {
Currently, the context (fds, mmap-ed areas, etc.) is global. Each test mutates this state in some way, in some cases really "clobbering it" (e.g., the events test mremap-ing area_dst over the top of area_src, or the minor faults tests overwriting the count_verify values in the test areas). We run the tests in a particular order, each test is careful to make the right assumptions about its starting state, etc.
But, this is fragile. It's better for a test's success or failure to not depend on what some other prior test case did to the global state.
To that end, clear and reinitialize the test context at the start of each test case, so whatever prior test cases did doesn't affect future tests.
This is particularly relevant to this series because the events test's mremap of area_dst screws up assumptions the minor fault test was relying on. This wasn't a problem for hugetlb, as we don't mremap in that case.
Signed-off-by: Axel Rasmussen axelrasmussen@google.com --- tools/testing/selftests/vm/userfaultfd.c | 249 ++++++++++++++--------- 1 file changed, 151 insertions(+), 98 deletions(-)
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 4a18590fe0f8..5183ddb3080d 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -89,7 +89,8 @@ static int shm_fd; static int huge_fd; static char *huge_fd_off0; static unsigned long long *count_verify; -static int uffd, uffd_flags, finished, *pipefd; +static int uffd = -1; +static int uffd_flags, finished, *pipefd; static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; static char *zeropage; pthread_attr_t attr; @@ -376,6 +377,146 @@ static struct uffd_test_ops hugetlb_uffd_test_ops = {
static struct uffd_test_ops *uffd_test_ops;
+static int userfaultfd_open(uint64_t *features) +{ + struct uffdio_api uffdio_api; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + fprintf(stderr, + "userfaultfd syscall not available in this kernel\n"); + return 1; + } + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = *features; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { + fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to " + "run with either root or ptrace capability.\n"); + return 1; + } + if (uffdio_api.api != UFFD_API) { + fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n", + (uint64_t)uffdio_api.api); + return 1; + } + + *features = uffdio_api.features; + return 0; +} + +static int uffd_test_ctx_init_ext(uint64_t *features) +{ + unsigned long nr, cpu; + + uffd_test_ops->allocate_area((void **)&area_src); + if (!area_src) + return 1; + uffd_test_ops->allocate_area((void **)&area_dst); + if (!area_dst) + return 1; + + if (uffd_test_ops->release_pages(area_src)) + return 1; + + if (uffd_test_ops->release_pages(area_dst)) + return 1; + + if (userfaultfd_open(features)) + return 1; + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) { + perror("count_verify"); + return 1; + } + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + /* + * In the transition between 255 to 256, powerpc will + * read out of order in my_bcmp and see both bytes as + * zero, so leave a placeholder below always non-zero + * after the count, to avoid my_bcmp to trigger false + * positives. 
+ */ + *(area_count(area_src, nr) + 1) = 1; + } + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) { + perror("pipefd"); + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) { + perror("pipe"); + return 1; + } + } + + return 0; +} + +static inline int uffd_test_ctx_init(uint64_t features) +{ + return uffd_test_ctx_init_ext(&features); +} + +static inline int munmap_area(void **area) +{ + if (*area) { + if (munmap(*area, nr_pages * page_size)) { + perror("munmap"); + return 1; + } + } + + *area = NULL; + return 0; +} + +static int uffd_test_ctx_clear(void) +{ + int ret = 0; + size_t i; + + if (pipefd) { + for (i = 0; i < nr_cpus * 2; ++i) { + if (close(pipefd[i])) { + perror("close pipefd"); + ret = 1; + } + } + free(pipefd); + pipefd = NULL; + } + + if (count_verify) { + free(count_verify); + count_verify = NULL; + } + + if (uffd != -1) { + if (close(uffd)) { + perror("close uffd"); + ret = 1; + } + uffd = -1; + } + + huge_fd_off0 = NULL; + ret |= munmap_area((void **)&area_src); + ret |= munmap_area((void **)&area_src_alias); + ret |= munmap_area((void **)&area_dst); + ret |= munmap_area((void **)&area_dst_alias); + + return ret; +} + static int my_bcmp(char *str1, char *str2, size_t n) { unsigned long i; @@ -859,40 +1000,6 @@ static int stress(struct uffd_stats *uffd_stats) return 0; }
-static int userfaultfd_open_ext(uint64_t *features) -{ - struct uffdio_api uffdio_api; - - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd < 0) { - fprintf(stderr, - "userfaultfd syscall not available in this kernel\n"); - return 1; - } - uffd_flags = fcntl(uffd, F_GETFD, NULL); - - uffdio_api.api = UFFD_API; - uffdio_api.features = *features; - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { - fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to " - "run with either root or ptrace capability.\n"); - return 1; - } - if (uffdio_api.api != UFFD_API) { - fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n", - (uint64_t)uffdio_api.api); - return 1; - } - - *features = uffdio_api.features; - return 0; -} - -static int userfaultfd_open(uint64_t features) -{ - return userfaultfd_open_ext(&features); -} - sigjmp_buf jbuf, *sigbuf;
static void sighndl(int sig, siginfo_t *siginfo, void *ptr) @@ -1010,6 +1117,8 @@ static int faulting_process(int signal_test) perror("mremap"); exit(1); } + /* Reset area_src since we just clobbered it */ + area_src = NULL;
for (; nr < nr_pages; nr++) { count = *area_count(area_dst, nr); @@ -1113,11 +1222,9 @@ static int userfaultfd_zeropage_test(void) printf("testing UFFDIO_ZEROPAGE: "); fflush(stdout);
- if (uffd_test_ops->release_pages(area_dst)) + if (uffd_test_ctx_clear() || uffd_test_ctx_init(0)) return 1;
- if (userfaultfd_open(0)) - return 1; uffdio_register.range.start = (unsigned long) area_dst; uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; @@ -1143,7 +1250,6 @@ static int userfaultfd_zeropage_test(void) } }
- close(uffd); printf("done.\n"); return 0; } @@ -1161,13 +1267,11 @@ static int userfaultfd_events_test(void) printf("testing events (fork, remap, remove): "); fflush(stdout);
- if (uffd_test_ops->release_pages(area_dst)) - return 1; - features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | UFFD_FEATURE_EVENT_REMOVE; - if (userfaultfd_open(features)) + if (uffd_test_ctx_clear() || uffd_test_ctx_init(features)) return 1; + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
uffdio_register.range.start = (unsigned long) area_dst; @@ -1213,8 +1317,6 @@ static int userfaultfd_events_test(void) if (pthread_join(uffd_mon, NULL)) return 1;
- close(uffd); - uffd_stats_report(&stats, 1);
return stats.missing_faults != nr_pages; @@ -1234,12 +1336,10 @@ static int userfaultfd_sig_test(void) printf("testing signal delivery: "); fflush(stdout);
- if (uffd_test_ops->release_pages(area_dst)) - return 1; - features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; - if (userfaultfd_open(features)) + if (uffd_test_ctx_clear() || uffd_test_ctx_init(features)) return 1; + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
uffdio_register.range.start = (unsigned long) area_dst; @@ -1297,7 +1397,6 @@ static int userfaultfd_sig_test(void) if (userfaults) fprintf(stderr, "Signal test failed, userfaults: %ld\n", userfaults); - close(uffd); return userfaults != 0; }
@@ -1319,10 +1418,7 @@ static int userfaultfd_minor_test(void) printf("testing minor faults: "); fflush(stdout);
- if (uffd_test_ops->release_pages(area_dst)) - return 1; - - if (userfaultfd_open_ext(&features)) + if (uffd_test_ctx_clear() || uffd_test_ctx_init_ext(&features)) return 1; /* If kernel reports the feature isn't supported, skip the test. */ if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) { @@ -1390,8 +1486,6 @@ static int userfaultfd_minor_test(void) if (pthread_join(uffd_mon, NULL)) return 1;
- close(uffd); - uffd_stats_report(&stats, 1);
return stats.missing_faults != 0 || stats.minor_faults != nr_pages; @@ -1403,52 +1497,12 @@ static int userfaultfd_stress(void) char *tmp_area; unsigned long nr; struct uffdio_register uffdio_register; - unsigned long cpu; int err; struct uffd_stats uffd_stats[nr_cpus];
- uffd_test_ops->allocate_area((void **)&area_src); - if (!area_src) - return 1; - uffd_test_ops->allocate_area((void **)&area_dst); - if (!area_dst) - return 1; - - if (userfaultfd_open(0)) + if (uffd_test_ctx_init(0)) return 1;
- count_verify = malloc(nr_pages * sizeof(unsigned long long)); - if (!count_verify) { - perror("count_verify"); - return 1; - } - - for (nr = 0; nr < nr_pages; nr++) { - *area_mutex(area_src, nr) = (pthread_mutex_t) - PTHREAD_MUTEX_INITIALIZER; - count_verify[nr] = *area_count(area_src, nr) = 1; - /* - * In the transition between 255 to 256, powerpc will - * read out of order in my_bcmp and see both bytes as - * zero, so leave a placeholder below always non-zero - * after the count, to avoid my_bcmp to trigger false - * positives. - */ - *(area_count(area_src, nr) + 1) = 1; - } - - pipefd = malloc(sizeof(int) * nr_cpus * 2); - if (!pipefd) { - perror("pipefd"); - return 1; - } - for (cpu = 0; cpu < nr_cpus; cpu++) { - if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) { - perror("pipe"); - return 1; - } - } - if (posix_memalign(&area, page_size, page_size)) { fprintf(stderr, "out of memory\n"); return 1; @@ -1593,7 +1647,6 @@ static int userfaultfd_stress(void) if (err) return err;
- close(uffd); return userfaultfd_zeropage_test() || userfaultfd_sig_test() || userfaultfd_events_test() || userfaultfd_minor_test(); }
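Enable test_uffdio_minor for test_type == TEST_SHMEM, and modify the test slightly so that it requests, and then checks for, the feature flag that matches the test type (UFFD_FEATURE_MINOR_HUGETLBFS for hugetlb, UFFD_FEATURE_MINOR_SHMEM for shmem).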
Signed-off-by: Axel Rasmussen <axelrasmussen@google.com> --- tools/testing/selftests/vm/userfaultfd.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 5183ddb3080d..f31e9a4edc55 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -1410,7 +1410,7 @@ static int userfaultfd_minor_test(void) void *expected_page; char c; struct uffd_stats stats = { 0 }; - uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS; + uint64_t req_features, features_out;
if (!test_uffdio_minor) return 0; @@ -1418,10 +1418,18 @@ static int userfaultfd_minor_test(void) printf("testing minor faults: "); fflush(stdout);
- if (uffd_test_ctx_clear() || uffd_test_ctx_init_ext(&features)) + if (test_type == TEST_HUGETLB) + req_features = UFFD_FEATURE_MINOR_HUGETLBFS; + else if (test_type == TEST_SHMEM) + req_features = UFFD_FEATURE_MINOR_SHMEM; + else + return 1; + + features_out = req_features; + if (uffd_test_ctx_clear() || uffd_test_ctx_init_ext(&features_out)) return 1; - /* If kernel reports the feature isn't supported, skip the test. */ - if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) { + /* If kernel reports required features aren't supported, skip test. */ + if ((features_out & req_features) != req_features) { printf("skipping test due to lack of feature support\n"); fflush(stdout); return 0; @@ -1431,7 +1439,7 @@ static int userfaultfd_minor_test(void) uffdio_register.range.len = nr_pages * page_size; uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { - fprintf(stderr, "register failure\n"); + perror("register failure"); exit(1); }
@@ -1695,6 +1703,7 @@ static void set_test_type(const char *type) map_shared = true; test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; + test_uffdio_minor = true; } else { fprintf(stderr, "Unknown test type: %s\n", type); exit(1); }
linux-kselftest-mirror@lists.linaro.org