This series cleans up and fixes break_ksm(). In summary, we no longer use fake write faults to break COW but instead FAULT_FLAG_UNSHARE. Further, we move away from using follow_page() [which we can hopefully remove completely at some point] and use the new walk_page_range_vma() instead.
Fortunately, we can get rid of VM_FAULT_WRITE and FOLL_MIGRATION in common code now.
Add a selftest to measure MADV_UNMERGEABLE performance. In my setup (AMD Ryzen 9 3900X), running the KSM selftest to test unmerge performance on 2 GiB (taskset 0x8 ./ksm_tests -D -s 2048), this results in a performance degradation of ~8% -- 9% (old: ~5250 MiB/s, new: ~4800 MiB/s). I don't think we particularly care for now, but it's good to be aware of the implication.
Cc: Andrew Morton akpm@linux-foundation.org
Cc: Shuah Khan shuah@kernel.org
Cc: Hugh Dickins hughd@google.com
Cc: Vlastimil Babka vbabka@suse.cz
Cc: Peter Xu peterx@redhat.com
Cc: Andrea Arcangeli aarcange@redhat.com
Cc: "Matthew Wilcox (Oracle)" willy@infradead.org
Cc: Jason Gunthorpe jgg@nvidia.com
Cc: John Hubbard jhubbard@nvidia.com
David Hildenbrand (7):
  selftests/vm: add test to measure MADV_UNMERGEABLE performance
  mm/ksm: simplify break_ksm() to not rely on VM_FAULT_WRITE
  mm: remove VM_FAULT_WRITE
  mm/ksm: fix KSM COW breaking with userfaultfd-wp via FAULT_FLAG_UNSHARE
  mm/pagewalk: add walk_page_range_vma()
  mm/ksm: convert break_ksm() to use walk_page_range_vma()
  mm/gup: remove FOLL_MIGRATION
 include/linux/mm.h                     |   1 -
 include/linux/mm_types.h               |   3 -
 include/linux/pagewalk.h               |   3 +
 mm/gup.c                               |  55 ++-----------
 mm/huge_memory.c                       |   2 +-
 mm/ksm.c                               | 103 +++++++++++++++++++------
 mm/memory.c                            |   9 +--
 mm/pagewalk.c                          |  27 +++++++
 tools/testing/selftests/vm/ksm_tests.c |  76 +++++++++++++++++-
 9 files changed, 192 insertions(+), 87 deletions(-)
Let's add a test to measure performance of KSM breaking not triggered via COW, but triggered by disabling KSM on an area filled with KSM pages via MADV_UNMERGEABLE.
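For illustration, here is a minimal standalone sketch of what the new -D mode times; this is not the selftest code itself, and the buffer size, the sleep to let ksmd merge, and the use of CLOCK_MONOTONIC are assumptions. It merges an identical-content area via MADV_MERGEABLE and then times MADV_UNMERGEABLE, which is where break_ksm() gets exercised (KSM must already be running via /sys/kernel/mm/ksm/run):

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/mman.h>

#define SIZE (128 * 1024 * 1024ul)

int main(void)
{
        struct timespec start, end;
        char *map;
        double secs;

        map = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (map == MAP_FAILED)
                return 1;

        /* Fill with identical content and let KSM merge it. */
        memset(map, 0xff, SIZE);
        if (madvise(map, SIZE, MADV_MERGEABLE))
                return 1;
        sleep(10);      /* crude: wait for ksmd to merge the area */

        /* Time the unmerge; this ends up in break_ksm() for each page. */
        clock_gettime(CLOCK_MONOTONIC, &start);
        if (madvise(map, SIZE, MADV_UNMERGEABLE))
                return 1;
        clock_gettime(CLOCK_MONOTONIC, &end);

        secs = (end.tv_sec - start.tv_sec) +
               (end.tv_nsec - start.tv_nsec) / 1e9;
        printf("Unmerged %lu MiB in %.3f s (%.1f MiB/s)\n",
               SIZE / (1024 * 1024), secs, SIZE / (1024 * 1024) / secs);
        return 0;
}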
Signed-off-by: David Hildenbrand david@redhat.com
---
 tools/testing/selftests/vm/ksm_tests.c | 76 +++++++++++++++++++++++++-
 1 file changed, 74 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index f5e4e0bbd081..353eee96aeba 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -40,6 +40,7 @@ enum ksm_test_name { CHECK_KSM_NUMA_MERGE, KSM_MERGE_TIME, KSM_MERGE_TIME_HUGE_PAGES, + KSM_UNMERGE_TIME, KSM_COW_TIME };
@@ -108,7 +109,10 @@ static void print_help(void) " -P evaluate merging time and speed.\n" " For this test, the size of duplicated memory area (in MiB)\n" " must be provided using -s option\n" - " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -D evaluate unmerging time and speed when disabling KSM.\n" " For this test, the size of duplicated memory area (in MiB)\n" " must be provided using -s option\n" " -C evaluate the time required to break COW of merged pages.\n\n"); @@ -188,6 +192,16 @@ static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, return 0; }
+static int ksm_unmerge_pages(void *addr, size_t size, + struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_UNMERGEABLE)) { + perror("madvise"); + return 1; + } + return 0; +} + static bool assert_ksm_pages_count(long dupl_page_count) { unsigned long max_page_sharing, pages_sharing, pages_shared; @@ -560,6 +574,53 @@ static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) return KSFT_FAIL; }
+static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) { void *map_ptr; @@ -644,7 +705,7 @@ int main(int argc, char *argv[]) bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; long size_MB = 0;
- while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCH")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -701,6 +762,9 @@ int main(int argc, char *argv[]) case 'H': test_name = KSM_MERGE_TIME_HUGE_PAGES; break; + case 'D': + test_name = KSM_UNMERGE_TIME; + break; case 'C': test_name = KSM_COW_TIME; break; @@ -762,6 +826,14 @@ int main(int argc, char *argv[]) ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; + case KSM_UNMERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; case KSM_COW_TIME: ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, page_size);
On Fri, Sep 30, 2022 at 04:19:25PM +0200, David Hildenbrand wrote:
Let's add a test to measure performance of KSM breaking not triggered via COW, but triggered by disabling KSM on an area filled with KSM pages via MADV_UNMERGEABLE.
Signed-off-by: David Hildenbrand david@redhat.com
Acked-by: Peter Xu peterx@redhat.com
Now that GUP no longer requires VM_FAULT_WRITE, break_ksm() is the sole remaining user of VM_FAULT_WRITE. As we also want to stop triggering a fake write fault and instead use FAULT_FLAG_UNSHARE -- similar to GUP-triggered unsharing when taking a R/O pin on a shared anonymous page (including KSM pages) -- let's stop relying on VM_FAULT_WRITE.
Let's rework break_ksm() to not rely on the return value of handle_mm_fault() anymore to figure out whether COW-breaking was successful. Simply perform another follow_page() lookup to verify the result.
While this makes break_ksm() slightly less efficient, we can simplify handle_mm_fault() a little and easily switch to FAULT_FLAG_UNSHARE without introducing similar KSM-specific behavior for FAULT_FLAG_UNSHARE.
In my setup (AMD Ryzen 9 3900X), running the KSM selftest to test unmerge performance on 2 GiB (taskset 0x8 ./ksm_tests -D -s 2048), this results in a performance degradation of ~4% -- 5% (old: ~5250 MiB/s, new: ~5010 MiB/s).
I don't think that we particularly care about that performance drop when unmerging. If it ever turns out to be an actual performance issue, we can think about a better alternative for FAULT_FLAG_UNSHARE -- let's just keep it simple for now.
Signed-off-by: David Hildenbrand david@redhat.com
---
 mm/ksm.c | 25 +++++++++++++-------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 0cd2f4b62334..e8d987fb379e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -473,26 +473,27 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 	vm_fault_t ret = 0;
 
 	do {
+		bool ksm_page = false;
+
 		cond_resched();
 		page = follow_page(vma, addr,
 				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
 		if (IS_ERR_OR_NULL(page))
 			break;
 		if (PageKsm(page))
-			ret = handle_mm_fault(vma, addr,
-					      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
-					      NULL);
-		else
-			ret = VM_FAULT_WRITE;
+			ksm_page = true;
 		put_page(page);
-	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
+
+		if (!ksm_page)
+			return 0;
+		ret = handle_mm_fault(vma, addr,
+				      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+				      NULL);
+	} while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
 	/*
-	 * We must loop because handle_mm_fault() may back out if there's
-	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
-	 *
-	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
-	 * COW has been broken, even if the vma does not permit VM_WRITE;
-	 * but note that a concurrent fault might break PageKsm for us.
+	 * We must loop until we no longer find a KSM page because
+	 * handle_mm_fault() may back out if there's any difficulty e.g. if
+	 * pte accessed bit gets updated concurrently.
 	 *
 	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
 	 * backing file, which also invalidates anonymous pages: that's
On Fri, Sep 30, 2022 at 04:19:26PM +0200, David Hildenbrand wrote:
I don't think that we particularly care about that performance drop when unmerging. If it ever turns out to be an actual performance issue, we can think about a better alternative for FAULT_FLAG_UNSHARE -- let's just keep it simple for now.
It'll be nice to hear from real ksm users for sure. But to me this makes sense, and the patch itself looks good to me, I think that also means:
Acked-by: Peter Xu peterx@redhat.com
All users -- GUP and KSM -- are gone, let's just remove it.
Signed-off-by: David Hildenbrand david@redhat.com
---
 include/linux/mm_types.h | 3 ---
 mm/huge_memory.c         | 2 +-
 mm/memory.c              | 9 ++++-----
 3 files changed, 5 insertions(+), 9 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8f30f262431c..6a1375dcb4ac 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -807,7 +807,6 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_OOM: Out Of Memory * @VM_FAULT_SIGBUS: Bad access * @VM_FAULT_MAJOR: Page read from storage - * @VM_FAULT_WRITE: Special case for get_user_pages * @VM_FAULT_HWPOISON: Hit poisoned small page * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded * in upper bits @@ -828,7 +827,6 @@ enum vm_fault_reason { VM_FAULT_OOM = (__force vm_fault_t)0x000001, VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, - VM_FAULT_WRITE = (__force vm_fault_t)0x000008, VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, @@ -854,7 +852,6 @@ enum vm_fault_reason { { VM_FAULT_OOM, "OOM" }, \ { VM_FAULT_SIGBUS, "SIGBUS" }, \ { VM_FAULT_MAJOR, "MAJOR" }, \ - { VM_FAULT_WRITE, "WRITE" }, \ { VM_FAULT_HWPOISON, "HWPOISON" }, \ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 84bf1d5f6b7e..b351c1d4f858 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,7 +1376,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); - return VM_FAULT_WRITE; + return 0; }
unlock_fallback: diff --git a/mm/memory.c b/mm/memory.c index e49faa0a1f9a..6e2f47d05f2b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3240,7 +3240,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) }
delayacct_wpcopy_end(); - return (page_copied && !unshare) ? VM_FAULT_WRITE : 0; + return 0; oom_free_new: put_page(new_page); oom: @@ -3304,14 +3304,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) return finish_mkwrite_fault(vmf); } wp_page_reuse(vmf); - return VM_FAULT_WRITE; + return 0; }
static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - vm_fault_t ret = VM_FAULT_WRITE; + vm_fault_t ret = 0;
get_page(vmf->page);
@@ -3462,7 +3462,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) return 0; } wp_page_reuse(vmf); - return VM_FAULT_WRITE; + return 0; } else if (unshare) { /* No anonymous page -> nothing to do. */ pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -3960,7 +3960,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; - ret |= VM_FAULT_WRITE; } rmap_flags |= RMAP_EXCLUSIVE; }
On Fri, Sep 30, 2022 at 04:19:27PM +0200, David Hildenbrand wrote:
All users -- GUP and KSM -- are gone, let's just remove it.
Signed-off-by: David Hildenbrand david@redhat.com
Acked-by: Peter Xu peterx@redhat.com
Let's stop breaking COW via a fake write fault and let's use FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake write fault, such as mapping the PTE writable and marking the pte dirty/softdirty.
Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault() will fail with VM_FAULT_SIGBUS and will simply return in break_ksm() with 0. The warning in dmesg indicates this wrong handling:
[ 230.096368] FAULT_FLAG_ALLOW_RETRY missing 881
[ 230.100822] CPU: 1 PID: 1643 Comm: ksm-uffd-wp [...]
[ 230.110124] Hardware name: [...]
[ 230.117775] Call Trace:
[ 230.120227]  <TASK>
[ 230.122334]  dump_stack_lvl+0x44/0x5c
[ 230.126010]  handle_userfault.cold+0x14/0x19
[ 230.130281]  ? tlb_finish_mmu+0x65/0x170
[ 230.134207]  ? uffd_wp_range+0x65/0xa0
[ 230.137959]  ? _raw_spin_unlock+0x15/0x30
[ 230.141972]  ? do_wp_page+0x50/0x590
[ 230.145551]  __handle_mm_fault+0x9f5/0xf50
[ 230.149652]  ? mmput+0x1f/0x40
[ 230.152712]  handle_mm_fault+0xb9/0x2a0
[ 230.156550]  break_ksm+0x141/0x180
[ 230.159964]  unmerge_ksm_pages+0x60/0x90
[ 230.163890]  ksm_madvise+0x3c/0xb0
[ 230.167295]  do_madvise.part.0+0x10c/0xeb0
[ 230.171396]  ? do_syscall_64+0x67/0x80
[ 230.175157]  __x64_sys_madvise+0x5a/0x70
[ 230.179082]  do_syscall_64+0x58/0x80
[ 230.182661]  ? do_syscall_64+0x67/0x80
[ 230.186413]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
--------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

#define MMAP_SIZE (2 * 1024 * 1024u)

static char *map;
int uffd;

static int setup_uffd(void)
{
        struct uffdio_api uffdio_api;
        struct uffdio_register uffdio_register;
        struct uffdio_writeprotect uffd_writeprotect;
        struct uffdio_range uffd_range;

        uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
        if (uffd < 0) {
                fprintf(stderr, "syscall() failed: %d\n", errno);
                return -errno;
        }

        uffdio_api.api = UFFD_API;
        uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
        if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
                fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
                return -errno;
        }

        if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
                fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
                return -ENOSYS;
        }

        /* Register UFFD-WP */
        uffdio_register.range.start = (unsigned long) map;
        uffdio_register.range.len = MMAP_SIZE;
        uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
        if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
                fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
                return -errno;
        }

        /* Writeprotect the range. */
        uffd_writeprotect.range.start = (unsigned long) map;
        uffd_writeprotect.range.len = MMAP_SIZE;
        uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
        if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
                fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
                return -errno;
        }

        return 0;
}

int main(int argc, char **argv)
{
        int ksm_fd, ret;

        ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR);
        if (ksm_fd < 0) {
                fprintf(stderr, "KSM not available\n");
                return -errno;
        }

        map = mmap(NULL, MMAP_SIZE, PROT_READ|PROT_WRITE,
                   MAP_PRIVATE|MAP_ANON, -1, 0);
        if (map == MAP_FAILED) {
                fprintf(stderr, "mmap() failed\n");
                return -errno;
        }
        ret = madvise(map, MMAP_SIZE, MADV_NOHUGEPAGE);
        if (ret) {
                fprintf(stderr, "MADV_NOHUGEPAGE failed\n");
                return -errno;
        }

        /* Fill with same value and trigger merging. */
        memset(map, 0xff, MMAP_SIZE);
        ret = madvise(map, MMAP_SIZE, MADV_MERGEABLE);
        if (ret) {
                fprintf(stderr, "MADV_MERGEABLE failed\n");
                return -errno;
        }

        /*
         * Run KSM to trigger merging and wait a bit until merging should be
         * done.
         */
        if (write(ksm_fd, "1", 1) != 1) {
                fprintf(stderr, "Running KSM failed\n");
        }
        sleep(10);

        /* Write-protect the range with UFFD. */
        if (setup_uffd())
                return 1;

        /* Trigger unsharing. */
        ret = madvise(map, MMAP_SIZE, MADV_UNMERGEABLE);
        if (ret) {
                fprintf(stderr, "MADV_UNMERGEABLE failed\n");
                return -errno;
        }

        return 0;
}
--------------------------------------------------------------------------
Consequently, we will no longer trigger a fake write fault and break COW without any such side-effects.
This is primarily a fix for KSM+userfaultfd-wp, however, the fake write fault was always questionable. As this fix is not easy to backport and it's not very critical, let's not cc stable.
Fixes: 529b930b87d9 ("userfaultfd: wp: hook userfault handler to write protection fault")
Signed-off-by: David Hildenbrand david@redhat.com
---
 mm/ksm.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index e8d987fb379e..4d7bcf7da7c3 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -453,17 +453,15 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
 }
 
 /*
- * We use break_ksm to break COW on a ksm page: it's a stripped down
+ * We use break_ksm to break COW on a ksm page by triggering unsharing,
+ * such that the ksm page will get replaced by an exclusive anonymous page.
  *
- *	if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1)
- *		put_page(page);
- *
- * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
  * in case the application has unmapped and remapped mm,addr meanwhile.
  * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
  * mmap of /dev/mem, where we would not want to touch it.
  *
- * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
  * of the process that owns 'vma'. We also do not want to enforce
  * protection keys here anyway.
  */
@@ -487,7 +485,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 		if (!ksm_page)
 			return 0;
 		ret = handle_mm_fault(vma, addr,
-				      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+				      FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
 				      NULL);
 	} while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
 	/*
On Fri, 30 Sep 2022 16:19:28 +0200 David Hildenbrand david@redhat.com wrote:
Let's stop breaking COW via a fake write fault and let's use FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake write fault, such as mapping the PTE writable and marking the pte dirty/softdirty.
Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault() will fail with VM_FAULT_SIGBUS and will simply return in break_ksm() with 0. The warning in dmesg indicates this wrong handling:
We're at -rc7. I'd prefer to avoid merging larger patchsets at this time.
Is there some minimal fix for 6.0 and -stable? Or is the problem non-serious enough to only fix it in 6.1 and later?
On 01.10.22 00:27, Andrew Morton wrote:
On Fri, 30 Sep 2022 16:19:28 +0200 David Hildenbrand david@redhat.com wrote:
Let's stop breaking COW via a fake write fault and let's use FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake write fault, such as mapping the PTE writable and marking the pte dirty/softdirty.
Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault() will fail with VM_FAULT_SIGBUS and will simply return in break_ksm() with 0. The warning in dmesg indicates this wrong handling:
We're at -rc7. I'd prefer to avoid merging larger patchsets at this time.
Yes, this is 6.1 material.
Is there some minimal fix for 6.0 and -stable? Or is the problem non-serious enough to only fix it in 6.1 and later?
See the end of this lengthy patch description:
"This is primarily a fix for KSM+userfaultfd-wp, however, the fake write fault was always questionable. As this fix is not easy to backport and it's not very critical, let's not cc stable."
This can wait, thanks!
On Fri, Sep 30, 2022 at 04:19:28PM +0200, David Hildenbrand wrote:
Let's stop breaking COW via a fake write fault and let's use FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake write fault, such as mapping the PTE writable and marking the pte dirty/softdirty.
Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault() will fail with VM_FAULT_SIGBUS and will simply return in break_ksm() with 0. The warning in dmesg indicates this wrong handling:
[ 230.096368] FAULT_FLAG_ALLOW_RETRY missing 881 [ 230.100822] CPU: 1 PID: 1643 Comm: ksm-uffd-wp [...] [ 230.110124] Hardware name: [...] [ 230.117775] Call Trace: [ 230.120227] <TASK> [ 230.122334] dump_stack_lvl+0x44/0x5c [ 230.126010] handle_userfault.cold+0x14/0x19 [ 230.130281] ? tlb_finish_mmu+0x65/0x170 [ 230.134207] ? uffd_wp_range+0x65/0xa0 [ 230.137959] ? _raw_spin_unlock+0x15/0x30 [ 230.141972] ? do_wp_page+0x50/0x590 [ 230.145551] __handle_mm_fault+0x9f5/0xf50 [ 230.149652] ? mmput+0x1f/0x40 [ 230.152712] handle_mm_fault+0xb9/0x2a0 [ 230.156550] break_ksm+0x141/0x180 [ 230.159964] unmerge_ksm_pages+0x60/0x90 [ 230.163890] ksm_madvise+0x3c/0xb0 [ 230.167295] do_madvise.part.0+0x10c/0xeb0 [ 230.171396] ? do_syscall_64+0x67/0x80 [ 230.175157] __x64_sys_madvise+0x5a/0x70 [ 230.179082] do_syscall_64+0x58/0x80 [ 230.182661] ? do_syscall_64+0x67/0x80 [ 230.186413] entry_SYSCALL_64_after_hwframe+0x63/0xcd
Since it's already there, worth adding the test into ksm_test.c?
Consequently, we will no longer trigger a fake write fault and break COW without any such side-effects.
This is primarily a fix for KSM+userfaultfd-wp, however, the fake write fault was always questionable. As this fix is not easy to backport and it's not very critical, let's not cc stable.
A patch to cc most of the stable would probably need to still go with the old write approach but attaching ALLOW_RETRY. But I agree maybe that may not need to bother, or a report should have arrived earlier.. The unshare approach looks much cleaner indeed.
Fixes: 529b930b87d9 ("userfaultfd: wp: hook userfault handler to write protection fault") Signed-off-by: David Hildenbrand david@redhat.com
Acked-by: Peter Xu peterx@redhat.com
On 05.10.22 22:35, Peter Xu wrote:
On Fri, Sep 30, 2022 at 04:19:28PM +0200, David Hildenbrand wrote:
Let's stop breaking COW via a fake write fault and let's use FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake write fault, such as mapping the PTE writable and marking the pte dirty/softdirty.
Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault() will fail with VM_FAULT_SIGBUS and will simply return in break_ksm() with 0. The warning in dmesg indicates this wrong handling:
[ 230.096368] FAULT_FLAG_ALLOW_RETRY missing 881 [ 230.100822] CPU: 1 PID: 1643 Comm: ksm-uffd-wp [...] [ 230.110124] Hardware name: [...] [ 230.117775] Call Trace: [ 230.120227] <TASK> [ 230.122334] dump_stack_lvl+0x44/0x5c [ 230.126010] handle_userfault.cold+0x14/0x19 [ 230.130281] ? tlb_finish_mmu+0x65/0x170 [ 230.134207] ? uffd_wp_range+0x65/0xa0 [ 230.137959] ? _raw_spin_unlock+0x15/0x30 [ 230.141972] ? do_wp_page+0x50/0x590 [ 230.145551] __handle_mm_fault+0x9f5/0xf50 [ 230.149652] ? mmput+0x1f/0x40 [ 230.152712] handle_mm_fault+0xb9/0x2a0 [ 230.156550] break_ksm+0x141/0x180 [ 230.159964] unmerge_ksm_pages+0x60/0x90 [ 230.163890] ksm_madvise+0x3c/0xb0 [ 230.167295] do_madvise.part.0+0x10c/0xeb0 [ 230.171396] ? do_syscall_64+0x67/0x80 [ 230.175157] __x64_sys_madvise+0x5a/0x70 [ 230.179082] do_syscall_64+0x58/0x80 [ 230.182661] ? do_syscall_64+0x67/0x80 [ 230.186413] entry_SYSCALL_64_after_hwframe+0x63/0xcd
Since it's already there, worth adding the test into ksm_test.c?
Yes, I can give it a try. What I dislike about ksm_test is that it's a mixture of benchmarks and test cases that have to be explicitly triggered by parameters. It's not a simple "run all available test cases" test as we know it. So maybe something separate (or having it as part of the uffd tests) makes more sense.
Consequently, we will no longer trigger a fake write fault and break COW without any such side-effects.
This is primarily a fix for KSM+userfaultfd-wp, however, the fake write fault was always questionable. As this fix is not easy to backport and it's not very critical, let's not cc stable.
A patch to cc most of the stable would probably need to still go with the old write approach but attaching ALLOW_RETRY. But I agree maybe that may not need to bother, or a report should have arrived earlier.. The unshare approach looks much cleaner indeed.
A fix without FAULT_FLAG_UNSHARE is not straightforward. We really don't want to notify user space about write events here (because there is none). And there is no way around the uffd handling in WP code without that.
FAULT_FLAG_ALLOW_RETRY would rely on userfaultfd triggering and having to resolve the WP event.
Fixes: 529b930b87d9 ("userfaultfd: wp: hook userfault handler to write protection fault") Signed-off-by: David Hildenbrand david@redhat.com
Acked-by: Peter Xu peterx@redhat.com
Thanks!
On Thu, Oct 06, 2022 at 11:29:29AM +0200, David Hildenbrand wrote:
On 05.10.22 22:35, Peter Xu wrote:
On Fri, Sep 30, 2022 at 04:19:28PM +0200, David Hildenbrand wrote:
Let's stop breaking COW via a fake write fault and let's use FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake write fault, such as mapping the PTE writable and marking the pte dirty/softdirty.
Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault() will fail with VM_FAULT_SIGBUS and will simply return in break_ksm() with 0. The warning in dmesg indicates this wrong handling:
[ 230.096368] FAULT_FLAG_ALLOW_RETRY missing 881 [ 230.100822] CPU: 1 PID: 1643 Comm: ksm-uffd-wp [...] [ 230.110124] Hardware name: [...] [ 230.117775] Call Trace: [ 230.120227] <TASK> [ 230.122334] dump_stack_lvl+0x44/0x5c [ 230.126010] handle_userfault.cold+0x14/0x19 [ 230.130281] ? tlb_finish_mmu+0x65/0x170 [ 230.134207] ? uffd_wp_range+0x65/0xa0 [ 230.137959] ? _raw_spin_unlock+0x15/0x30 [ 230.141972] ? do_wp_page+0x50/0x590 [ 230.145551] __handle_mm_fault+0x9f5/0xf50 [ 230.149652] ? mmput+0x1f/0x40 [ 230.152712] handle_mm_fault+0xb9/0x2a0 [ 230.156550] break_ksm+0x141/0x180 [ 230.159964] unmerge_ksm_pages+0x60/0x90 [ 230.163890] ksm_madvise+0x3c/0xb0 [ 230.167295] do_madvise.part.0+0x10c/0xeb0 [ 230.171396] ? do_syscall_64+0x67/0x80 [ 230.175157] __x64_sys_madvise+0x5a/0x70 [ 230.179082] do_syscall_64+0x58/0x80 [ 230.182661] ? do_syscall_64+0x67/0x80 [ 230.186413] entry_SYSCALL_64_after_hwframe+0x63/0xcd
Since it's already there, worth adding the test into ksm_test.c?
Yes, I can give it a try. What I dislike about ksm_test is that it's a mixture of benchmarks and test cases that have to be explicitly triggered by parameters. It's not a simple "run all available test cases" test as we know it. So maybe something separate (or having it as part of the uffd tests) makes more sense.
We can add an entry into run_vmtests.sh. That's also what current ksm_test does.
Yes adding into uffd test would work too, but I do have a plan that we should move functional tests out of userfaultfd.c, leaving that with the stress test only. Not really a big deal, though.
Consequently, we will no longer trigger a fake write fault and break COW without any such side-effects.
This is primarily a fix for KSM+userfaultfd-wp, however, the fake write fault was always questionable. As this fix is not easy to backport and it's not very critical, let's not cc stable.
A patch to cc most of the stable would probably need to still go with the old write approach but attaching ALLOW_RETRY. But I agree maybe that may not need to bother, or a report should have arrived earlier.. The unshare approach looks much cleaner indeed.
A fix without FAULT_FLAG_UNSHARE is not straightforward. We really don't want to notify user space about write events here (because there is none). And there is no way around the uffd handling in WP code without that.
FAULT_FLAG_ALLOW_RETRY would rely on userfaultfd triggering and having to resolve the WP event.
Right it'll be very much a false positive, but the userspace should be fine with it e.g. for live snapshot we need to copy page earlier; it still won't stop the process from running along the way. But I agree that's not ideal.
Hi Peter,
sorry for replying so late, I only managed now to get back to this patch set.
Yes, I can give it a try. What I dislike about ksm_test is that it's a mixture of benchmarks and test cases that have to be explicitly triggered by parameters. It's not a simple "run all available test cases" test as we know it. So maybe something separate (or having it as part of the uffd tests) makes more sense.
We can add an entry into run_vmtests.sh. That's also what current ksm_test does.
Right, but I kind-of don't like that way at all and would much rather do it cleaner...
Yes adding into uffd test would work too, but I do have a plan that we should move functional tests out of userfaultfd.c, leaving that with the stress test only. Not really a big deal, though.
... similar to what you want to do with userfaultfd.c
So maybe I'll just add a new test for ksm functional tests.
Consequently, we will no longer trigger a fake write fault and break COW without any such side-effects.
This is primarily a fix for KSM+userfaultfd-wp, however, the fake write fault was always questionable. As this fix is not easy to backport and it's not very critical, let's not cc stable.
A patch to cc most of the stable would probably need to still go with the old write approach but attaching ALLOW_RETRY. But I agree maybe that may not need to bother, or a report should have arrived earlier.. The unshare approach looks much cleaner indeed.
A fix without FAULT_FLAG_UNSHARE is not straightforward. We really don't want to notify user space about write events here (because there is none). And there is no way around the uffd handling in WP code without that.
FAULT_FLAG_ALLOW_RETRY would rely on userfaultfd triggering and having to resolve the WP event.
Right it'll be very much a false positive, but the userspace should be fine with it e.g. for live snapshot we need to copy page earlier; it still won't stop the process from running along the way. But I agree that's not ideal.
At least the test case at hand will wait indefinitely, because there is no handler that would allow break_cow to make progress (well, we don't expect write events in the test :) ).
Anyhow, I don't think messing with that for stable kernels is worth the pain / complexity / possible issues.
Let's add walk_page_range_vma(), which is similar to walk_page_vma(), however, is only interested in a subset of the VMA range.
To be used in KSM code to stop using follow_page() next.
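For illustration, a hedged usage sketch (the caller and the pmd_entry callback are made-up names, not part of this patch) of walking a sub-range of a known VMA with the new helper; as in the other page walkers, the mmap_lock of the target mm must be held:

#include <linux/mm.h>
#include <linux/pagewalk.h>

static int my_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
			struct mm_walk *walk)
{
	/* Inspect the page table range [addr, next) within walk->vma. */
	return 0;
}

static const struct mm_walk_ops my_ops = {
	.pmd_entry	= my_pmd_entry,
};

static int walk_some_range(struct vm_area_struct *vma, unsigned long start,
			   unsigned long end)
{
	/* start/end must lie within [vma->vm_start, vma->vm_end). */
	return walk_page_range_vma(vma, start, end, &my_ops, NULL);
}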
Signed-off-by: David Hildenbrand david@redhat.com
---
 include/linux/pagewalk.h |  3 +++
 mm/pagewalk.c            | 27 +++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index f3fafb731ffd..2f8f6cc980b4 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -99,6 +99,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private); int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 131b2b335b2c..757c075da231 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -514,6 +514,33 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, return __walk_page_range(start, end, &walk); }
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .mm = vma->vm_mm, + .vma = vma, + .private = private, + }; + int err; + + if (start >= end || !walk.mm) + return -EINVAL; + if (start < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + mmap_assert_locked(walk.mm); + + err = walk_page_test(start, end, &walk); + if (err > 0) + return 0; + if (err < 0) + return err; + return __walk_page_range(start, end, &walk); +} + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) {
On Fri, Sep 30, 2022 at 04:19:29PM +0200, David Hildenbrand wrote:
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private)
+{
- struct mm_walk walk = {
.ops = ops,
.mm = vma->vm_mm,
.vma = vma,
.private = private,
- };
- int err;
- if (start >= end || !walk.mm)
return -EINVAL;
- if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- mmap_assert_locked(walk.mm);
- err = walk_page_test(start, end, &walk);
According to test_walk():
* @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means * "do page table walk over the current vma", returning * a negative value means "abort current page table walk * right now" and returning 1 means "skip the current vma"
Since this helper has vma passed in, not sure whether this is needed at all?
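For illustration only, a hedged sketch (made-up callback and ops names) of what .test_walk is normally for -- letting a multi-VMA range walk skip VMAs it doesn't care about, which is indeed moot when the caller already passes the exact VMA:

#include <linux/mm.h>
#include <linux/pagewalk.h>

static int my_test_walk(unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	/* Skip e.g. special mappings; return 0 to walk this VMA's tables. */
	if (walk->vma->vm_flags & VM_PFNMAP)
		return 1;
	return 0;
}

static const struct mm_walk_ops my_ops = {
	.test_walk	= my_test_walk,
	/* .pmd_entry etc. as needed */
};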
walk_page_vma_range() sounds slightly better to me as it does look more like an extension of walk_page_vma(), rather than sister version of walk_page_range_novma() (which works for "no backing VMA" case). But no strong opinion.
On 05.10.22 22:42, Peter Xu wrote:
On Fri, Sep 30, 2022 at 04:19:29PM +0200, David Hildenbrand wrote:
+int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
unsigned long end, const struct mm_walk_ops *ops,
void *private)
+{
- struct mm_walk walk = {
.ops = ops,
.mm = vma->vm_mm,
.vma = vma,
.private = private,
- };
- int err;
- if (start >= end || !walk.mm)
return -EINVAL;
- if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- mmap_assert_locked(walk.mm);
- err = walk_page_test(start, end, &walk);
According to test_walk():
- @test_walk: caller specific callback function to determine whether
we walk over the current vma or not. Returning 0 means
"do page table walk over the current vma", returning
a negative value means "abort current page table walk
right now" and returning 1 means "skip the current vma"
Since this helper has vma passed in, not sure whether this is needed at all?
I kept it because walk_page_vma() similarly has it -- walk_page_vma() walks the whole VMA range.
I do agree that it's kind of weird to have it like that. None of the walk_page_vma() users actually use it, so we can just get rid of it there as well. Might make the walk slightly faster.
walk_page_vma_range() sounds slightly better to me as it does look more like an extension of walk_page_vma(), rather than sister version of walk_page_range_novma() (which works for "no backing VMA" case). But no strong opinion.
I matched that to walk_page_range_novma(). Now we have
	walk_page_range
	walk_page_vma
	walk_page_range_novma
	walk_page_range_vma
FOLL_MIGRATION exists only for the purpose of break_ksm(), and actually, there is not even the need to wait for the migration to finish, we only want to know if we're dealing with a KSM page.
Using follow_page() just to identify a KSM page overcomplicates GUP code. Let's use walk_page_range_vma() instead, because we don't actually care about the page itself, we only need to know a single property -- no need to even grab a reference on the page.
In my setup (AMD Ryzen 9 3900X), running the KSM selftest to test unmerge performance on 2 GiB (taskset 0x8 ./ksm_tests -D -s 2048), this results in a performance degradation of ~4% (old: ~5010 MiB/s, new: ~4800 MiB/s). I don't think we particularly care for now.
Signed-off-by: David Hildenbrand david@redhat.com
---
 mm/ksm.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 62 insertions(+), 8 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c index 4d7bcf7da7c3..814c1a37c323 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -39,6 +39,7 @@ #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/pagewalk.h>
#include <asm/tlbflush.h> #include "internal.h" @@ -452,6 +453,60 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; }
+int break_ksm_pud_entry(pud_t *pud, unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + /* We only care about page tables to walk to a single base page. */ + if (pud_leaf(*pud) || !pud_present(*pud)) + return 1; + return 0; +} + +int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + bool *ksm_page = walk->private; + struct page *page = NULL; + pte_t *pte, ptent; + spinlock_t *ptl; + + /* We only care about page tables to walk to a single base page. */ + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + return 1; + + /* + * We only lookup a single page (a) no need to iterate; and (b) + * always return 1 to exit immediately and not iterate in the caller. + */ + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + ptent = *pte; + + if (pte_none(ptent)) + return 1; + if (!pte_present(ptent)) { + swp_entry_t entry = pte_to_swp_entry(ptent); + + /* + * We only care about migration of KSM pages. As KSM pages + * remain KSM pages until freed, no need to wait here for + * migration to end to identify such. + */ + if (is_migration_entry(entry)) + page = pfn_swap_entry_to_page(entry); + } else { + page = vm_normal_page(walk->vma, addr, ptent); + } + if (page && PageKsm(page)) + *ksm_page = true; + pte_unmap_unlock(pte, ptl); + return 1; +} + +static const struct mm_walk_ops break_ksm_ops = { + .pud_entry = break_ksm_pud_entry, + .pmd_entry = break_ksm_pmd_entry, +}; + /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. @@ -467,20 +522,19 @@ static inline bool ksm_test_exit(struct mm_struct *mm) */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { - struct page *page; vm_fault_t ret = 0;
+ if (WARN_ON_ONCE(!IS_ALIGNED(addr, PAGE_SIZE))) + return -EINVAL; + do { bool ksm_page = false;
cond_resched(); - page = follow_page(vma, addr, - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page)) - break; - if (PageKsm(page)) - ksm_page = true; - put_page(page); + ret = walk_page_range_vma(vma, addr, addr + PAGE_SIZE, + &break_ksm_ops, &ksm_page); + if (WARN_ON_ONCE(ret < 0)) + return ret;
if (!ksm_page) return 0;
On Fri, Sep 30, 2022 at 04:19:30PM +0200, David Hildenbrand wrote:
FOLL_MIGRATION exists only for the purpose of break_ksm(), and actually, there is not even the need to wait for the migration to finish, we only want to know if we're dealing with a KSM page.
Using follow_page() just to identify a KSM page overcomplicates GUP code. Let's use walk_page_range_vma() instead, because we don't actually care about the page itself, we only need to know a single property -- no need to even grab a reference on the page.
In my setup (AMD Ryzen 9 3900X), running the KSM selftest to test unmerge performance on 2 GiB (taskset 0x8 ./ksm_tests -D -s 2048), this results in a performance degradation of ~4% (old: ~5010 MiB/s, new: ~4800 MiB/s). I don't think we particularly care for now.
Signed-off-by: David Hildenbrand david@redhat.com
[...]
+int break_ksm_pud_entry(pud_t *pud, unsigned long addr, unsigned long next,
struct mm_walk *walk)
+{
- /* We only care about page tables to walk to a single base page. */
- if (pud_leaf(*pud) || !pud_present(*pud))
return 1;
- return 0;
+}
Is this needed? I thought the pgtable walker handles this already.
[...]
static int break_ksm(struct vm_area_struct *vma, unsigned long addr) {
- struct page *page; vm_fault_t ret = 0;
- if (WARN_ON_ONCE(!IS_ALIGNED(addr, PAGE_SIZE)))
return -EINVAL;
- do { bool ksm_page = false;
cond_resched();
page = follow_page(vma, addr,
FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ksm_page = true;
put_page(page);
ret = walk_page_range_vma(vma, addr, addr + PAGE_SIZE,
&break_ksm_ops, &ksm_page);
if (WARN_ON_ONCE(ret < 0))
return ret;
I'm not sure this would be worth it, especially with a 4% degrade. The next patch will be able to bring 50- LOC, but this patch does 60+ anyway, based on another new helper just introduced...
I just don't see whether there's a strong enough reason to do so to drop FOLL_MIGRATION. It's different from the previous VM_FAULT_WRITE refactor, because there the unshare approach was good enough reasoning to me.
Perhaps I missed something?
+int break_ksm_pud_entry(pud_t *pud, unsigned long addr, unsigned long next,
struct mm_walk *walk)
+{
- /* We only care about page tables to walk to a single base page. */
- if (pud_leaf(*pud) || !pud_present(*pud))
return 1;
- return 0;
+}
Is this needed? I thought the pgtable walker handles this already.
[...]
Most probably yes. I was trying to avoid PUD splits, but I guess we simply should not care in VMAs that are considered by KSM (MERGEABLE). That most probably never ever happens.
static int break_ksm(struct vm_area_struct *vma, unsigned long addr) {
- struct page *page; vm_fault_t ret = 0;
- if (WARN_ON_ONCE(!IS_ALIGNED(addr, PAGE_SIZE)))
return -EINVAL;
- do { bool ksm_page = false;
cond_resched();
page = follow_page(vma, addr,
FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ksm_page = true;
put_page(page);
ret = walk_page_range_vma(vma, addr, addr + PAGE_SIZE,
&break_ksm_ops, &ksm_page);
if (WARN_ON_ONCE(ret < 0))
return ret;
I'm not sure this would be worth it, especially with a 4% degrade. The next patch will be able to bring 50- LOC, but this patch does 60+ anyway, based on another new helper just introduced...
I just don't see whether there's a strong enough reason to do so to drop FOLL_MIGRATION. It's different from the previous VM_FAULT_WRITE refactor, because there the unshare approach was good enough reasoning to me.
Perhaps I missed something?
My main motivation is to remove most of that GUP hackery here, which is:

1) Getting a reference on a page and waiting for migration to finish, even though both are unnecessary.

2) As we don't have sufficient control, we added FOLL_MIGRATION hacks to MM core to work around limitations in the GUP-based approach.

3) We rely on the legacy follow_page() interface that we should really get rid of in the long term.
All we want to do is walk the page tables and make a decision if something we care about is mapped. Instead of leaking these details via hacks into GUP code and making that code harder to grasp/maintain, this patch moves that logic to the actual user, while reusing generic page walking code.
Yes, we have to extend page walking code, but it's just the natural, non-hacky way of doing it.
Regarding the 4% performance degradation (if I hadn't added the benchmarks, nobody would know or probably care ;) ), I am not quite sure why that is the case. We're just walking page tables after all in both cases. Maybe the callback-based implementation of the pagewalk code is less efficient, but we might be able to improve that implementation if we really care about performance here. Maybe removing break_ksm_pud_entry() already improves the numbers slightly.
Thanks!
On Thu, Oct 06, 2022 at 11:20:42AM +0200, David Hildenbrand wrote:
+int break_ksm_pud_entry(pud_t *pud, unsigned long addr, unsigned long next,
struct mm_walk *walk)
+{
- /* We only care about page tables to walk to a single base page. */
- if (pud_leaf(*pud) || !pud_present(*pud))
return 1;
- return 0;
+}
Is this needed? I thought the pgtable walker handles this already.
[...]
Most probably yes. I was trying to avoid PUD splits, but I guess we simply should not care in VMAs that are considered by KSM (MERGEABLE). That most probably never ever happens.
I was surprised the split is the default approach; didn't really notice that before. Yeah maybe better to keep it.
static int break_ksm(struct vm_area_struct *vma, unsigned long addr) {
- struct page *page; vm_fault_t ret = 0;
- if (WARN_ON_ONCE(!IS_ALIGNED(addr, PAGE_SIZE)))
return -EINVAL;
- do { bool ksm_page = false; cond_resched();
page = follow_page(vma, addr,
FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ksm_page = true;
put_page(page);
ret = walk_page_range_vma(vma, addr, addr + PAGE_SIZE,
&break_ksm_ops, &ksm_page);
if (WARN_ON_ONCE(ret < 0))
return ret;
I'm not sure this would be worth it, especially with a 4% degrade. The next patch will be able to bring 50- LOC, but this patch does 60+ anyway, based on another new helper just introduced...
I just don't see whether there's a strong enough reason to do so to drop FOLL_MIGRATION. It's different from the previous VM_FAULT_WRITE refactor, because there the unshare approach was good enough reasoning to me.
Perhaps I missed something?
My main motivation is to remove most of that GUP hackery here, which is
- Getting a reference on a page and waiting for migration to finish, even though both are unnecessary.
- As we don't have sufficient control, we added FOLL_MIGRATION hacks to MM core to work around limitations in the GUP-based approach.
One thing I saw about Hugh adding FOLL_MIGRATION was to have a hint for follow_page() users:
I'd have preferred to avoid another flag, and do it every time, in case someone else makes the same easy mistake..
Though..
- We rely on legacy follow_page() interface that we should really get rid of in the long term.
..this is part of effort to remove follow_page()? More context will be helpful in that case.
All we want to do is walk the page tables and make a decision if something we care about is mapped. Instead of leaking these details via hacks into GUP code and making that code harder to grasp/maintain, this patch moves that logic to the actual user, while reusing generic page walking code.
Indeed there's only one ksm user, at least proving that the flag was not widely used.
Yes, we have to extend page walking code, but it's just the natural, non-hacky way of doing it.
Regarding the 4% performance degradation (if I wouldn't have added the benchmarks, nobody would know and probably care ;) ), I am not quite sure why that is the case. We're just walking page tables after all in both cases. Maybe the callback-based implementation of pagewalk code is less efficient, but we might be able to improve that implementation if we really care about performance here. Maybe removing break_ksm_pud_entry() already improves the numbers slightly.
Yeah, it could be that the walker is just slower. And for !ksm walking your code should be faster when hitting migration entries, but that should really be rare anyway.
On 06.10.22 21:28, Peter Xu wrote:
On Thu, Oct 06, 2022 at 11:20:42AM +0200, David Hildenbrand wrote:
+int break_ksm_pud_entry(pud_t *pud, unsigned long addr, unsigned long next,
struct mm_walk *walk)
+{
- /* We only care about page tables to walk to a single base page. */
- if (pud_leaf(*pud) || !pud_present(*pud))
return 1;
- return 0;
+}
Is this needed? I thought the pgtable walker handles this already.
[...]
Most probably yes. I was trying to avoid PUD splits, but I guess we simply should not care in VMAs that are considered by KSM (MERGEABLE). That most probably never ever happens.
I was surprised the split is the default approach; didn't really notice that before. Yeah maybe better to keep it.
Interestingly, one callback reduces the benchmark result by 100-200 MiB/s.
With only break_ksm_pmd_entry(), I get ~4900 MiB/s instead of ~5010 MiB/s (~2%).
I came to the conclusion that we really shouldn't have to worry about pud THPs here: it could only be a file PUD and splitting that only zaps the entry, but doesn't PMD- or PTE-map it.
Also, I think we will barely see large pud THP in a mergable mapping ... :)
[...]
My main motivation is to remove most of that GUP hackery here, which is
- Getting a reference on a page and waiting for migration to finish, even though both are unnecessary.
- As we don't have sufficient control, we added FOLL_MIGRATION hacks to MM core to work around limitations in the GUP-based approach.
One thing I saw about Hugh adding FOLL_MIGRATION was to have a hint for follow_page() users:
I'd have preferred to avoid another flag, and do it every time, in case someone else makes the same easy mistake..
Though..
The important thing I think is that FOLL_MIGRATION really only applies to follow_page(). In case of "modern" GUP we will just wait for migration entries, handle swap entries ... when triggering a page fault.
- We rely on legacy follow_page() interface that we should really get rid of in the long term.
..this is part of effort to remove follow_page()? More context will be helpful in that case.
The comment from Hugh is another example of why follow_page() adds complexity. One might wonder how pages in the swapcache, device-coherent pages, ... would have to be handled.
Short-term, I want to cleanup GUP. Long-term we might want to consider removing follow_page() completely.
[...]
Yes, we have to extend page walking code, but it's just the natural, non-hacky way of doing it.
Regarding the 4% performance degradation (if I wouldn't have added the benchmarks, nobody would know and probably care ;) ), I am not quite sure why that is the case. We're just walking page tables after all in both cases. Maybe the callback-based implementation of pagewalk code is less efficient, but we might be able to improve that implementation if we really care about performance here. Maybe removing break_ksm_pud_entry() already improves the numbers slightly.
Yeah, it could be that the walker is just slower. And for !ksm walking your code should be faster when hitting migration entries, but that should really be rare anyway.
I have the following right now:
From 7f767f9e9e673a29793cd35f1c3d66ed593b67cb Mon Sep 17 00:00:00 2001 From: David Hildenbrand david@redhat.com Date: Mon, 25 Jul 2022 10:36:20 +0200 Subject: [PATCH] mm/ksm: convert break_ksm() to use walk_page_range_vma()
FOLL_MIGRATION exists only for the purpose of break_ksm(), and actually, there is not even the need to wait for the migration to finish, we only want to know if we're dealing with a KSM page.
Using follow_page() just to identify a KSM page overcomplicates GUP code. Let's use walk_page_range_vma() instead, because we don't actually care about the page itself, we only need to know a single property -- no need to even grab a reference.
So, get rid of follow_page() usage such that we can get rid of FOLL_MIGRATION now and eventually be able to get rid of follow_page() in the future.
In my setup (AMD Ryzen 9 3900X), running the KSM selftest to test unmerge performance on 2 GiB (taskset 0x8 ./ksm_tests -D -s 2048), this results in a performance degradation of ~2% (old: ~5010 MiB/s, new: ~4900 MiB/s). I don't think we particularly care for now.
Interestingly, the benchmark reduction is due to the single callback. Adding a second callback (e.g., pud_entry()) reduces the benchmark by another 100-200 MiB/s.
Signed-off-by: David Hildenbrand david@redhat.com
---
 mm/ksm.c | 49 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 39 insertions(+), 10 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c index c6f58aa6e731..5cdb852ff132 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -39,6 +39,7 @@ #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/pagewalk.h>
#include <asm/tlbflush.h> #include "internal.h" @@ -419,6 +420,39 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; }
+static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte; + int ret; + + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + return 0; + + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (pte_present(*pte)) { + page = vm_normal_page(walk->vma, addr, *pte); + } else if (!pte_none(*pte)) { + swp_entry_t entry = pte_to_swp_entry(*pte); + + /* + * As KSM pages remain KSM pages until freed, no need to wait + * here for migration to end. + */ + if (is_migration_entry(entry)) + page = pfn_swap_entry_to_page(entry); + } + ret = page && PageKsm(page); + pte_unmap_unlock(pte, ptl); + return ret; +} + +static const struct mm_walk_ops break_ksm_ops = { + .pmd_entry = break_ksm_pmd_entry, +}; + /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. @@ -434,21 +468,16 @@ static inline bool ksm_test_exit(struct mm_struct *mm) */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { - struct page *page; vm_fault_t ret = 0;
do { - bool ksm_page = false; + int ksm_page;
cond_resched(); - page = follow_page(vma, addr, - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page)) - break; - if (PageKsm(page)) - ksm_page = true; - put_page(page); - + ksm_page = walk_page_range_vma(vma, addr, addr + 1, + &break_ksm_ops, NULL); + if (WARN_ON_ONCE(ksm_page < 0)) + return ksm_page; if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr,
On 30.09.22 16:19, David Hildenbrand wrote:
FOLL_MIGRATION exists only for the purpose of break_ksm(), and actually, there is not even the need to wait for the migration to finish, we only want to know if we're dealing with a KSM page.
Using follow_page() just to identify a KSM page overcomplicates GUP code. Let's use walk_page_range_vma() instead, because we don't actually care about the page itself, we only need to know a single property -- no need to even grab a reference on the page.
In my setup (AMD Ryzen 9 3900X), running the KSM selftest to test unmerge performance on 2 GiB (taskset 0x8 ./ksm_tests -D -s 2048), this results in a performance degradation of ~4% (old: ~5010 MiB/s, new: ~4800 MiB/s). I don't think we particularly care for now.
Signed-off-by: David Hildenbrand david@redhat.com
mm/ksm.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 8 deletions(-)
diff --git a/mm/ksm.c b/mm/ksm.c index 4d7bcf7da7c3..814c1a37c323 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -39,6 +39,7 @@ #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" @@ -452,6 +453,60 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +int break_ksm_pud_entry(pud_t *pud, unsigned long addr, unsigned long next,
struct mm_walk *walk)
+{
- /* We only care about page tables to walk to a single base page. */
- if (pud_leaf(*pud) || !pud_present(*pud))
return 1;
- return 0;
+}
+int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
struct mm_walk *walk)
+{
- bool *ksm_page = walk->private;
- struct page *page = NULL;
- pte_t *pte, ptent;
- spinlock_t *ptl;
- /* We only care about page tables to walk to a single base page. */
- if (pmd_leaf(*pmd) || !pmd_present(*pmd))
return 1;
- /*
* We only lookup a single page (a) no need to iterate; and (b)
* always return 1 to exit immediately and not iterate in the caller.
*/
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- ptent = *pte;
- if (pte_none(ptent))
return 1;
As reported by Janosch, we fail to drop the lock here.
t480s: ~/git/linux ksm_unshare $ git diff
diff --git a/mm/ksm.c b/mm/ksm.c
index 26aec41b127c..94f8e114c89f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -435,7 +435,7 @@ int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
 	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 
 	if (pte_none(*pte))
-		return 1;
+		goto out_unlock;
 	if (!pte_present(*pte)) {
 		swp_entry_t entry = pte_to_swp_entry(*pte);
 
@@ -451,6 +451,7 @@ int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
 	}
 	if (page && PageKsm(page))
 		*ksm_page = true;
+out_unlock:
 	pte_unmap_unlock(pte, ptl);
 	return 1;
 }
Fortunately, the last user (KSM) is gone, so let's just remove this rather special code from generic GUP handling -- especially because KSM never required the PMD handling as KSM only deals with individual base pages.
Signed-off-by: David Hildenbrand david@redhat.com
---
 include/linux/mm.h |  1 -
 mm/gup.c           | 55 +++++-----------------------------------------
 2 files changed, 5 insertions(+), 51 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index e56dd8f7eae1..4c176e308ead 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2942,7 +2942,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * and return without waiting upon it */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_ANON 0x8000 /* don't do file mappings */ diff --git a/mm/gup.c b/mm/gup.c index ce00a4c40da8..37195c549f68 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -537,30 +537,13 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); -retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags);
ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto retry; - } + if (!pte_present(pte)) + goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto no_page;
@@ -682,28 +665,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } -retry: - if (!pmd_present(pmdval)) { - /* - * Should never reach here, if thp migration is not supported; - * Otherwise, it must be a thp migration entry. - */ - VM_BUG_ON(!thp_migration_supported() || - !is_pmd_migration_entry(pmdval)); - - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - - pmd_migration_entry_wait(mm, pmd); - pmdval = READ_ONCE(*pmd); - /* - * MADV_DONTNEED may convert the pmd to null because - * mmap_lock is held in read mode - */ - if (pmd_none(pmdval)) - return no_page_table(vma, flags); - goto retry; - } + if (!pmd_present(pmdval)) + return no_page_table(vma, flags); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); @@ -717,18 +680,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) return no_page_table(vma, flags);
-retry_locked: ptl = pmd_lock(mm, pmd); - if (unlikely(pmd_none(*pmd))) { - spin_unlock(ptl); - return no_page_table(vma, flags); - } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - pmd_migration_entry_wait(mm, pmd); - goto retry_locked; + return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl);