June 2021 - Linux-stable-mirror

[PATCH v4 4/4] remoteproc: core: Cleanup device in case of failure

by Siddharth Gupta

When a failure occurs in rproc_add() it returns an error, but does not clean up after itself. This change adds the failure path in such cases. Signed-off-by: Siddharth Gupta <sidgup(a)codeaurora.org> Cc: stable(a)vger.kernel.org --- drivers/remoteproc/remoteproc_core.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index b874280..d823f70 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2343,8 +2343,10 @@ int rproc_add(struct rproc *rproc) return ret; ret = device_add(dev); - if (ret < 0) - return ret; + if (ret < 0) { + put_device(dev); + goto rproc_remove_cdev; + } dev_info(dev, "%s is available\n", rproc->name); @@ -2355,7 +2357,7 @@ int rproc_add(struct rproc *rproc) if (rproc->auto_boot) { ret = rproc_trigger_auto_boot(rproc); if (ret < 0) - return ret; + goto rproc_remove_dev; } /* expose to rproc_get_by_phandle users */ @@ -2364,6 +2366,13 @@ int rproc_add(struct rproc *rproc) mutex_unlock(&rproc_list_mutex); return 0; + +rproc_remove_dev: + rproc_delete_debug_dir(rproc); + device_del(dev); +rproc_remove_cdev: + rproc_char_device_remove(rproc); + return ret; } EXPORT_SYMBOL(rproc_add); -- Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project

4 years, 6 months

2
3
0 0

[PATCH] proc: Track /proc/$pid/attr/ opener mm_struct

by Kees Cook

Commit bfb819ea20ce ("proc: Check /proc/$pid/attr/ writes against file opener") tried to make sure that there could not be a confusion between the opener of a /proc/$pid/attr/ file and the writer. It used struct cred to make sure the privileges didn't change. However, there were existing cases where a more privileged thread was passing the opened fd to a differently privileged thread (during container setup). Instead, use mm_struct to track whether the opener and writer are still the same process. (This is what several other proc files already do, though for different reasons.) Reported-by: Christian Brauner <christian.brauner(a)ubuntu.com> Reported-by: Andrea Righi <andrea.righi(a)canonical.com> Tested-by: Andrea Righi <andrea.righi(a)canonical.com> Fixes: bfb819ea20ce ("proc: Check /proc/$pid/attr/ writes against file opener") Cc: stable(a)vger.kernel.org Signed-off-by: Kees Cook <keescook(a)chromium.org> --- fs/proc/base.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 58bbf334265b..7118ebe38fa6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2674,6 +2674,11 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx, } #ifdef CONFIG_SECURITY +static int proc_pid_attr_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); +} + static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { @@ -2704,7 +2709,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, int rv; /* A task may only write when it was the opener. 
*/ - if (file->f_cred != current_real_cred()) + if (file->private_data != current->mm) return -EPERM; rcu_read_lock(); @@ -2754,9 +2759,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, } static const struct file_operations proc_pid_attr_operations = { + .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, + .release = mem_release, }; #define LSM_DIR_OPS(LSM) \ -- 2.25.1

4 years, 6 months

7
13
0 0

[PATCH V2 0/2] scsi: FDMI Fixes

by Javed Hasan

This series contains two fixes for FDMI: it corrects the attribute lengths for RHBA, and it fixes the wrong condition check in fc_ct_ms_fill_attr(). Kindly apply this series to scsi-queue at your earliest convenience. Javed Hasan (2): scsi: fc: Corrected RHBA attributes length libfc: Corrected the condition check and invalid argument passed drivers/scsi/libfc/fc_encode.h | 8 +++++--- include/scsi/fc/fc_ms.h | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) -- 2.26.2

4 years, 6 months

2
4
0 0

stable-rc/queue/5.4 baseline: 128 runs, 3 regressions (v5.4.125-83-g766b4640cb46)

by kernelci.org bot

stable-rc/queue/5.4 baseline: 128 runs, 3 regressions (v5.4.125-83-g766b4640cb46) Regressions Summary ------------------- platform | arch | lab | compiler | defconfig | regressions ------------------+------+---------------+----------+--------------------+------------ rk3288-veyron-jaq | arm | lab-collabora | gcc-8 | multi_v7_defconfig | 3 Details: https://kernelci.org/test/job/stable-rc/branch/queue%2F5.4/kernel/v5.4.125-… Test: baseline Tree: stable-rc Branch: queue/5.4 Describe: v5.4.125-83-g766b4640cb46 URL: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git SHA: 766b4640cb46a4f83bc17477c8eb62db03ef3855 Test Regressions ---------------- platform | arch | lab | compiler | defconfig | regressions ------------------+------+---------------+----------+--------------------+------------ rk3288-veyron-jaq | arm | lab-collabora | gcc-8 | multi_v7_defconfig | 3 Details: https://kernelci.org/test/plan/id/60c92cc25ae07dc2ab413266 Results: 66 PASS, 3 FAIL, 0 SKIP Full config: multi_v7_defconfig Compiler: gcc-8 (arm-linux-gnueabihf-gcc (Debian 8.3.0-2) 8.3.0) Plain log: https://storage.kernelci.org//stable-rc/queue-5.4/v5.4.125-83-g766b4640cb46… HTML log: https://storage.kernelci.org//stable-rc/queue-5.4/v5.4.125-83-g766b4640cb46… Rootfs: http://storage.kernelci.org/images/rootfs/buildroot/kci-2020.05-5-g2f114cc7… * baseline.bootrr.dwmmc_rockchip-sdmmc-probed: https://kernelci.org/test/case/id/60c92cc25ae07dc2ab413283 failing since 0 day (last pass: v5.4.125-37-g7cda316475cf, first fail: v5.4.125-84-g411d62eda127) 2021-06-15T22:42:03.867065 /lava-4029265/1/../bin/lava-test-case 2021-06-15T22:42:03.872266 <8>[ 14.645342] <LAVA_SIGNAL_TESTCASE TEST_CASE_ID=dwmmc_rockchip-sdmmc-probed RESULT=fail> * baseline.bootrr.dwmmc_rockchip-sdio0-probed: https://kernelci.org/test/case/id/60c92cc25ae07dc2ab413284 failing since 0 day (last pass: v5.4.125-37-g7cda316475cf, first fail: v5.4.125-84-g411d62eda127) 2021-06-15T22:42:04.886229 
/lava-4029265/1/../bin/lava-test-case 2021-06-15T22:42:04.904177 <8>[ 15.664978] <LAVA_SIGNAL_TESTCASE TEST_CASE_ID=dwmmc_rockchip-sdio0-probed RESULT=fail> * baseline.bootrr.rockchip-iodomain-grf-probed: https://kernelci.org/test/case/id/60c92cc25ae07dc2ab41329c failing since 0 day (last pass: v5.4.125-37-g7cda316475cf, first fail: v5.4.125-84-g411d62eda127) 2021-06-15T22:42:06.310878 /lava-4029265/1/../bin/lava-test-case 2021-06-15T22:42:06.327965 <8>[ 17.089076] <LAVA_SIGNAL_TESTCASE TEST_CASE_ID=rockchip-iodomain-grf-probed RESULT=fail>

4 years, 6 months

1
0
0 0

[patch 17/18] mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split

by Andrew Morton

From: Yang Shi <shy828301(a)gmail.com> Subject: mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split When debugging the bug reported by Wang Yugui [1], try_to_unmap() may fail, but the first VM_BUG_ON_PAGE() just checks page_mapcount() however it may miss the failure when head page is unmapped but other subpage is mapped. Then the second DEBUG_VM BUG() that check total mapcount would catch it. This may incur some confusion. And this is not a fatal issue, so consolidate the two DEBUG_VM checks into one VM_WARN_ON_ONCE_PAGE(). [1] https://lore.kernel.org/linux-mm/20210412180659.B9E3.409509F4@e16-tech.com/ Link: https://lkml.kernel.org/r/d0f0db68-98b8-ebfb-16dc-f29df24cf012@google.com Signed-off-by: Yang Shi <shy828301(a)gmail.com> Reviewed-by: Zi Yan <ziy(a)nvidia.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Signed-off-by: Hugh Dickins <hughd(a)google.com> Cc: Alistair Popple <apopple(a)nvidia.com> Cc: Jan Kara <jack(a)suse.cz> Cc: Jue Wang <juew(a)google.com> Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org> Cc: Miaohe Lin <linmiaohe(a)huawei.com> Cc: Minchan Kim <minchan(a)kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com> Cc: Oscar Salvador <osalvador(a)suse.de> Cc: Peter Xu <peterx(a)redhat.com> Cc: Ralph Campbell <rcampbell(a)nvidia.com> Cc: Shakeel Butt <shakeelb(a)google.com> Cc: Wang Yugui <wangyugui(a)e16-tech.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/huge_memory.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) --- a/mm/huge_memory.c~mm-thp-replace-debug_vm-bug-with-vm_warn-when-unmap-fails-for-split +++ a/mm/huge_memory.c @@ -2352,15 +2352,15 @@ static void unmap_page(struct page *page { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_SPLIT_FREEZE; - unmap_success = 
try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(!unmap_success, page); + try_to_unmap(page, ttu_flags); + + VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } static void remap_page(struct page *page, unsigned int nr) @@ -2671,7 +2671,7 @@ int split_huge_page_to_list(struct page struct deferred_split *ds_queue = get_deferred_split_queue(head); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; - int count, mapcount, extra_pins, ret; + int extra_pins, ret; pgoff_t end; VM_BUG_ON_PAGE(is_huge_zero_page(head), head); @@ -2730,7 +2730,6 @@ int split_huge_page_to_list(struct page } unmap_page(head); - VM_BUG_ON_PAGE(compound_mapcount(head), head); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); @@ -2748,9 +2747,7 @@ int split_huge_page_to_list(struct page /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); - count = page_count(head); - mapcount = total_mapcount(head); - if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { + if (page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { ds_queue->split_queue_len--; list_del(page_deferred_list(head)); @@ -2770,16 +2767,9 @@ int split_huge_page_to_list(struct page __split_huge_page(page, list, end); ret = 0; } else { - if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - pr_alert("total_mapcount: %u, page_count(): %u\n", - mapcount, count); - if (PageTail(page)) - dump_page(head, NULL); - dump_page(page, "total_mapcount(head) > 0"); - BUG(); - } spin_unlock(&ds_queue->split_queue_lock); -fail: if (mapping) +fail: + if (mapping) xa_unlock(&mapping->i_pages); local_irq_enable(); remap_page(head, thp_nr_pages(head)); _

4 years, 6 months

1
0
0 0

[patch 16/18] mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()

by Andrew Morton

From: Hugh Dickins <hughd(a)google.com> Subject: mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page() There is a race between THP unmapping and truncation, when truncate sees pmd_none() and skips the entry, after munmap's zap_huge_pmd() cleared it, but before its page_remove_rmap() gets to decrement compound_mapcount: generating false "BUG: Bad page cache" reports that the page is still mapped when deleted. This commit fixes that, but not in the way I hoped. The first attempt used try_to_unmap(page, TTU_SYNC|TTU_IGNORE_MLOCK) instead of unmap_mapping_range() in truncate_cleanup_page(): it has often been an annoyance that we usually call unmap_mapping_range() with no pages locked, but there apply it to a single locked page. try_to_unmap() looks more suitable for a single locked page. However, try_to_unmap_one() contains a VM_BUG_ON_PAGE(!pvmw.pte,page): it is used to insert THP migration entries, but not used to unmap THPs. Copy zap_huge_pmd() and add THP handling now? Perhaps, but their TLB needs are different, I'm too ignorant of the DAX cases, and couldn't decide how far to go for anon+swap. Set that aside. The second attempt took a different tack: make no change in truncate.c, but modify zap_huge_pmd() to insert an invalidated huge pmd instead of clearing it initially, then pmd_clear() between page_remove_rmap() and unlocking at the end. Nice. But powerpc blows that approach out of the water, with its serialize_against_pte_lookup(), and interesting pgtable usage. It would need serious help to get working on powerpc (with a minor optimization issue on s390 too). Set that aside. Just add an "if (page_mapped(page)) synchronize_rcu();" or other such delay, after unmapping in truncate_cleanup_page()? Perhaps, but though that's likely to reduce or eliminate the number of incidents, it would give less assurance of whether we had identified the problem correctly. 
This successful iteration introduces "unmap_mapping_page(page)" instead of try_to_unmap(), and goes the usual unmap_mapping_range_tree() route, with an addition to details. Then zap_pmd_range() watches for this case, and does spin_unlock(pmd_lock) if so - just like page_vma_mapped_walk() now does in the PVMW_SYNC case. Not pretty, but safe. Note that unmap_mapping_page() is doing a VM_BUG_ON(!PageLocked) to assert its interface; but currently that's only used to make sure that page->mapping is stable, and zap_pmd_range() doesn't care if the page is locked or not. Along these lines, in invalidate_inode_pages2_range() move the initial unmap_mapping_range() out from under page lock, before then calling unmap_mapping_page() under page lock if still mapped. Link: https://lkml.kernel.org/r/a2a4a148-cdd8-942c-4ef8-51b77f643dbe@google.com Fixes: fc127da085c2 ("truncate: handle file thp") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Reviewed-by: Yang Shi <shy828301(a)gmail.com> Cc: Alistair Popple <apopple(a)nvidia.com> Cc: Jan Kara <jack(a)suse.cz> Cc: Jue Wang <juew(a)google.com> Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org> Cc: Miaohe Lin <linmiaohe(a)huawei.com> Cc: Minchan Kim <minchan(a)kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com> Cc: Oscar Salvador <osalvador(a)suse.de> Cc: Peter Xu <peterx(a)redhat.com> Cc: Ralph Campbell <rcampbell(a)nvidia.com> Cc: Shakeel Butt <shakeelb(a)google.com> Cc: Wang Yugui <wangyugui(a)e16-tech.com> Cc: Zi Yan <ziy(a)nvidia.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- include/linux/mm.h | 3 +++ mm/memory.c | 41 +++++++++++++++++++++++++++++++++++++++++ mm/truncate.c | 43 +++++++++++++++++++------------------------ 3 files changed, 63 insertions(+), 24 deletions(-) --- a/include/linux/mm.h~mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page +++ a/include/linux/mm.h @@ -1719,6 +1719,7 @@ 
struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ + struct page *single_page; /* Locked page to be unmapped */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, @@ -1766,6 +1767,7 @@ extern vm_fault_t handle_mm_fault(struct extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); +void unmap_mapping_page(struct page *page); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, @@ -1786,6 +1788,7 @@ static inline int fixup_user_fault(struc BUG(); return -EFAULT; } +static inline void unmap_mapping_page(struct page *page) { } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, --- a/mm/memory.c~mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page +++ a/mm/memory.c @@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_rang else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ + } else if (details && details->single_page && + PageTransCompound(details->single_page) && + next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { + spinlock_t *ptl = pmd_lock(tlb->mm, pmd); + /* + * Take and drop THP pmd lock so that we cannot return + * prematurely, while zap_huge_pmd() has cleared *pmd, + * but not yet decremented compound_mapcount(). + */ + spin_unlock(ptl); } + /* * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is @@ -3237,6 +3248,36 @@ static inline void unmap_mapping_range_t } /** + * unmap_mapping_page() - Unmap single page from processes. + * @page: The locked page to be unmapped. 
+ * + * Unmap this page from any userspace process which still has it mmaped. + * Typically, for efficiency, the range of nearby pages has already been + * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once + * truncation or invalidation holds the lock on a page, it may find that + * the page has been remapped again: and then uses unmap_mapping_page() + * to unmap it finally. + */ +void unmap_mapping_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct zap_details details = { }; + + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageTail(page)); + + details.check_mapping = mapping; + details.first_index = page->index; + details.last_index = page->index + thp_nr_pages(page) - 1; + details.single_page = page; + + i_mmap_lock_write(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + i_mmap_unlock_write(mapping); +} + +/** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. --- a/mm/truncate.c~mm-thp-unmap_mapping_page-to-fix-thp-truncate_cleanup_page +++ a/mm/truncate.c @@ -167,13 +167,10 @@ void do_invalidatepage(struct page *page * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 
*/ -static void -truncate_cleanup_page(struct address_space *mapping, struct page *page) +static void truncate_cleanup_page(struct page *page) { - if (page_mapped(page)) { - unsigned int nr = thp_nr_pages(page); - unmap_mapping_pages(mapping, page->index, nr, false); - } + if (page_mapped(page)) + unmap_mapping_page(page); if (page_has_private(page)) do_invalidatepage(page, 0, thp_size(page)); @@ -218,7 +215,7 @@ int truncate_inode_page(struct address_s if (page->mapping != mapping) return -EIO; - truncate_cleanup_page(mapping, page); + truncate_cleanup_page(page); delete_from_page_cache(page); return 0; } @@ -325,7 +322,7 @@ void truncate_inode_pages_range(struct a index = indices[pagevec_count(&pvec) - 1] + 1; truncate_exceptional_pvec_entries(mapping, &pvec, indices); for (i = 0; i < pagevec_count(&pvec); i++) - truncate_cleanup_page(mapping, pvec.pages[i]); + truncate_cleanup_page(pvec.pages[i]); delete_from_page_cache_batch(mapping, &pvec); for (i = 0; i < pagevec_count(&pvec); i++) unlock_page(pvec.pages[i]); @@ -639,6 +636,16 @@ int invalidate_inode_pages2_range(struct continue; } + if (!did_range_unmap && page_mapped(page)) { + /* + * If page is mapped, before taking its lock, + * zap the rest of the file in one hit. + */ + unmap_mapping_pages(mapping, index, + (1 + end - index), false); + did_range_unmap = 1; + } + lock_page(page); WARN_ON(page_to_index(page) != index); if (page->mapping != mapping) { @@ -646,23 +653,11 @@ int invalidate_inode_pages2_range(struct continue; } wait_on_page_writeback(page); - if (page_mapped(page)) { - if (!did_range_unmap) { - /* - * Zap the rest of the file in one hit. 
- */ - unmap_mapping_pages(mapping, index, - (1 + end - index), false); - did_range_unmap = 1; - } else { - /* - * Just zap this page - */ - unmap_mapping_pages(mapping, index, - 1, false); - } - } + + if (page_mapped(page)) + unmap_mapping_page(page); BUG_ON(page_mapped(page)); + ret2 = do_launder_page(mapping, page); if (ret2 == 0) { if (!invalidate_complete_page2(mapping, page)) _

4 years, 6 months

1
0
0 0

[patch 15/18] mm/thp: fix page_address_in_vma() on file THP tails

by Andrew Morton

From: Jue Wang <juew(a)google.com> Subject: mm/thp: fix page_address_in_vma() on file THP tails Anon THP tails were already supported, but memory-failure may need to use page_address_in_vma() on file THP tails, which its page->mapping check did not permit: fix it. hughd adds: no current usage is known to hit the issue, but this does fix a subtle trap in a general helper: best fixed in stable sooner than later. Link: https://lkml.kernel.org/r/a0d9b53-bf5d-8bab-ac5-759dc61819c1@google.com Fixes: 800d8c63b2e9 ("shmem: add huge pages support") Signed-off-by: Jue Wang <juew(a)google.com> Signed-off-by: Hugh Dickins <hughd(a)google.com> Reviewed-by: Matthew Wilcox (Oracle) <willy(a)infradead.org> Reviewed-by: Yang Shi <shy828301(a)gmail.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Alistair Popple <apopple(a)nvidia.com> Cc: Jan Kara <jack(a)suse.cz> Cc: Miaohe Lin <linmiaohe(a)huawei.com> Cc: Minchan Kim <minchan(a)kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com> Cc: Oscar Salvador <osalvador(a)suse.de> Cc: Peter Xu <peterx(a)redhat.com> Cc: Ralph Campbell <rcampbell(a)nvidia.com> Cc: Shakeel Butt <shakeelb(a)google.com> Cc: Wang Yugui <wangyugui(a)e16-tech.com> Cc: Zi Yan <ziy(a)nvidia.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/rmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) --- a/mm/rmap.c~mm-thp-fix-page_address_in_vma-on-file-thp-tails +++ a/mm/rmap.c @@ -716,11 +716,11 @@ unsigned long page_address_in_vma(struct if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping) { - if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) - return -EFAULT; - } else + } else if (!vma->vm_file) { + return -EFAULT; + } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) { return -EFAULT; + } return vma_address(page, vma); } _

4 years, 6 months

1
0
0 0

[patch 14/18] mm/thp: fix vma_address() if virtual address below file offset

by Andrew Morton

From: Hugh Dickins <hughd(a)google.com> Subject: mm/thp: fix vma_address() if virtual address below file offset Running certain tests with a DEBUG_VM kernel would crash within hours, on the total_mapcount BUG() in split_huge_page_to_list(), while trying to free up some memory by punching a hole in a shmem huge page: split's try_to_unmap() was unable to find all the mappings of the page (which, on a !DEBUG_VM kernel, would then keep the huge page pinned in memory). When that BUG() was changed to a WARN(), it would later crash on the VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma) in mm/internal.h:vma_address(), used by rmap_walk_file() for try_to_unmap(). vma_address() is usually correct, but there's a wraparound case when the vm_start address is unusually low, but vm_pgoff not so low: vma_address() chooses max(start, vma->vm_start), but that decides on the wrong address, because start has become almost ULONG_MAX. Rewrite vma_address() to be more careful about vm_pgoff; move the VM_BUG_ON_VMA() out of it, returning -EFAULT for errors, so that it can be safely used from page_mapped_in_vma() and page_address_in_vma() too. Add vma_address_end() to apply similar care to end address calculation, in page_vma_mapped_walk() and page_mkclean_one() and try_to_unmap_one(); though it raises a question of whether callers would do better to supply pvmw->end to page_vma_mapped_walk() - I chose not, for a smaller patch. An irritation is that their apparent generality breaks down on KSM pages, which cannot be located by the page->index that page_to_pgoff() uses: as 4b0ece6fa016 ("mm: migrate: fix remove_migration_pte() for ksm pages") once discovered. I dithered over the best thing to do about that, and have ended up with a VM_BUG_ON_PAGE(PageKsm) in both vma_address() and vma_address_end(); though the only place in danger of using it on them was try_to_unmap_one(). 
Sidenote: vma_address() and vma_address_end() now use compound_nr() on a head page, instead of thp_size(): to make the right calculation on a hugetlbfs page, whether or not THPs are configured. try_to_unmap() is used on hugetlbfs pages, but perhaps the wrong calculation never mattered. Link: https://lkml.kernel.org/r/caf1c1a3-7cfb-7f8f-1beb-ba816e932825@google.com Fixes: a8fa41ad2f6f ("mm, rmap: check all VMAs that PTE-mapped THP can be part of") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com> Cc: Alistair Popple <apopple(a)nvidia.com> Cc: Jan Kara <jack(a)suse.cz> Cc: Jue Wang <juew(a)google.com> Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org> Cc: Miaohe Lin <linmiaohe(a)huawei.com> Cc: Minchan Kim <minchan(a)kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com> Cc: Oscar Salvador <osalvador(a)suse.de> Cc: Peter Xu <peterx(a)redhat.com> Cc: Ralph Campbell <rcampbell(a)nvidia.com> Cc: Shakeel Butt <shakeelb(a)google.com> Cc: Wang Yugui <wangyugui(a)e16-tech.com> Cc: Yang Shi <shy828301(a)gmail.com> Cc: Zi Yan <ziy(a)nvidia.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- mm/internal.h | 51 ++++++++++++++++++++++++++++++----------- mm/page_vma_mapped.c | 16 ++++-------- mm/rmap.c | 16 ++++++------ 3 files changed, 52 insertions(+), 31 deletions(-) --- a/mm/internal.h~mm-thp-fix-vma_address-if-virtual-address-below-file-offset +++ a/mm/internal.h @@ -384,27 +384,52 @@ static inline void mlock_migrate_page(st extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* - * At what user virtual address is page expected in @vma? + * At what user virtual address is page expected in vma? + * Returns -EFAULT if all of the page is outside the range of vma. + * If page is a compound head, the entire compound page is considered. 
*/ static inline unsigned long -__vma_address(struct page *page, struct vm_area_struct *vma) +vma_address(struct page *page, struct vm_area_struct *vma) { - pgoff_t pgoff = page_to_pgoff(page); - return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + pgoff_t pgoff; + unsigned long address; + + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page); + if (pgoff >= vma->vm_pgoff) { + address = vma->vm_start + + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address >= vma->vm_end) + address = -EFAULT; + } else if (PageHead(page) && + pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) { + /* Test above avoids possibility of wrap to 0 on 32-bit */ + address = vma->vm_start; + } else { + address = -EFAULT; + } + return address; } +/* + * Then at what user virtual address will none of the page be found in vma? + * Assumes that vma_address() already returned a good starting address. + * If page is a compound head, the entire compound page is considered. + */ static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +vma_address_end(struct page *page, struct vm_area_struct *vma) { - unsigned long start, end; - - start = __vma_address(page, vma); - end = start + thp_size(page) - PAGE_SIZE; - - /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma); + pgoff_t pgoff; + unsigned long address; - return max(start, vma->vm_start); + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page) + compound_nr(page); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) 
*/ + if (address < vma->vm_start || address > vma->vm_end) + address = vma->vm_end; + return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, --- a/mm/page_vma_mapped.c~mm-thp-fix-vma_address-if-virtual-address-below-file-offset +++ a/mm/page_vma_mapped.c @@ -228,18 +228,18 @@ restart: if (!map_pte(pvmw)) goto next_pte; while (1) { + unsigned long end; + if (check_pte(pvmw)) return true; next_pte: /* Seek to next pte only makes sense for THP */ if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) return not_found(pvmw); + end = vma_address_end(pvmw->page, pvmw->vma); do { pvmw->address += PAGE_SIZE; - if (pvmw->address >= pvmw->vma->vm_end || - pvmw->address >= - __vma_address(pvmw->page, pvmw->vma) + - thp_size(pvmw->page)) + if (pvmw->address >= end) return not_found(pvmw); /* Did we cross page table boundary? */ if (pvmw->address % PMD_SIZE == 0) { @@ -277,14 +277,10 @@ int page_mapped_in_vma(struct page *page .vma = vma, .flags = PVMW_SYNC, }; - unsigned long start, end; - - start = __vma_address(page, vma); - end = start + thp_size(page) - PAGE_SIZE; - if (unlikely(end < vma->vm_start || start >= vma->vm_end)) + pvmw.address = vma_address(page, vma); + if (pvmw.address == -EFAULT) return 0; - pvmw.address = max(start, vma->vm_start); if (!page_vma_mapped_walk(&pvmw)) return 0; page_vma_mapped_walk_done(&pvmw); --- a/mm/rmap.c~mm-thp-fix-vma_address-if-virtual-address-below-file-offset +++ a/mm/rmap.c @@ -707,7 +707,6 @@ static bool should_defer_flush(struct mm */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { - unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -722,10 +721,8 @@ unsigned long page_address_in_vma(struct return -EFAULT; } else return -EFAULT; - address = __vma_address(page, vma); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - return -EFAULT; - return address; + + return vma_address(page, vma); } 
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) @@ -919,7 +916,7 @@ static bool page_mkclean_one(struct page */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - min(vma->vm_end, address + page_size(page))); + vma_address_end(page, vma)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { @@ -1435,9 +1432,10 @@ static bool try_to_unmap_one(struct page * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ + range.end = PageKsm(page) ? + address + PAGE_SIZE : vma_address_end(page, vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - address, - min(vma->vm_end, address + page_size(page))); + address, range.end); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted @@ -1889,6 +1887,7 @@ static void rmap_walk_anon(struct page * struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) @@ -1943,6 +1942,7 @@ static void rmap_walk_file(struct page * pgoff_start, pgoff_end) { unsigned long address = vma_address(page, vma); + VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) _

4 years, 6 months

1
0
0 0

[patch 13/18] mm/thp: try_to_unmap() use TTU_SYNC for safe splitting

by Andrew Morton

From: Hugh Dickins <hughd(a)google.com> Subject: mm/thp: try_to_unmap() use TTU_SYNC for safe splitting Stressing huge tmpfs often crashed on unmap_page()'s VM_BUG_ON_PAGE (!unmap_success): with dump_page() showing mapcount:1, but then its raw struct page output showing _mapcount ffffffff i.e. mapcount 0. And even if that particular VM_BUG_ON_PAGE(!unmap_success) is removed, it is immediately followed by a VM_BUG_ON_PAGE(compound_mapcount(head)), and further down an IS_ENABLED(CONFIG_DEBUG_VM) total_mapcount BUG(): all indicative of some mapcount difficulty in development here perhaps. But the !CONFIG_DEBUG_VM path handles the failures correctly and silently. I believe the problem is that once a racing unmap has cleared pte or pmd, try_to_unmap_one() may skip taking the page table lock, and emerge from try_to_unmap() before the racing task has reached decrementing mapcount. Instead of abandoning the unsafe VM_BUG_ON_PAGE(), and the ones that follow, use PVMW_SYNC in try_to_unmap_one() in this case: adding TTU_SYNC to the options, and passing that from unmap_page(). When CONFIG_DEBUG_VM, or for non-debug too? Consensus is to do the same for both: the slight overhead added should rarely matter, except perhaps if splitting sparsely-populated multiply-mapped shmem. Once confident that bugs are fixed, TTU_SYNC here can be removed, and the race tolerated. Link: https://lkml.kernel.org/r/c1e95853-8bcd-d8fd-55fa-e7f2488e78f@google.com Fixes: fec89c109f3a ("thp: rewrite freeze_page()/unfreeze_page() with generic rmap walkers") Signed-off-by: Hugh Dickins <hughd(a)google.com> Cc: Alistair Popple <apopple(a)nvidia.com> Cc: Jan Kara <jack(a)suse.cz> Cc: Jue Wang <juew(a)google.com> Cc: Kirill A. 
Shutemov <kirill.shutemov(a)linux.intel.com> Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org> Cc: Miaohe Lin <linmiaohe(a)huawei.com> Cc: Minchan Kim <minchan(a)kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com> Cc: Oscar Salvador <osalvador(a)suse.de> Cc: Peter Xu <peterx(a)redhat.com> Cc: Ralph Campbell <rcampbell(a)nvidia.com> Cc: Shakeel Butt <shakeelb(a)google.com> Cc: Wang Yugui <wangyugui(a)e16-tech.com> Cc: Yang Shi <shy828301(a)gmail.com> Cc: Zi Yan <ziy(a)nvidia.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- include/linux/rmap.h | 1 + mm/huge_memory.c | 2 +- mm/page_vma_mapped.c | 11 +++++++++++ mm/rmap.c | 17 ++++++++++++++++- 4 files changed, 29 insertions(+), 2 deletions(-) --- a/include/linux/rmap.h~mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting +++ a/include/linux/rmap.h @@ -91,6 +91,7 @@ enum ttu_flags { TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ + TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */ TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will --- a/mm/huge_memory.c~mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting +++ a/mm/huge_memory.c @@ -2350,7 +2350,7 @@ void vma_adjust_trans_huge(struct vm_are static void unmap_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; bool unmap_success; --- a/mm/page_vma_mapped.c~mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting +++ a/mm/page_vma_mapped.c @@ -212,6 +212,17 @@ restart: pvmw->ptl = NULL; } } else if (!pmd_present(pmde)) { + /* + * If PVMW_SYNC, take and drop THP pmd lock so that we + * cannot return prematurely, while zap_huge_pmd() has + * cleared *pmd but not decremented compound_mapcount(). 
+ */ + if ((pvmw->flags & PVMW_SYNC) && + PageTransCompound(pvmw->page)) { + spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); + + spin_unlock(ptl); + } return false; } if (!map_pte(pvmw)) --- a/mm/rmap.c~mm-thp-try_to_unmap-use-ttu_sync-for-safe-splitting +++ a/mm/rmap.c @@ -1405,6 +1405,15 @@ static bool try_to_unmap_one(struct page struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_unmap() may return false when it is about to become true, + * if page table locking is skipped: use TTU_SYNC to wait for that. + */ + if (flags & TTU_SYNC) + pvmw.flags = PVMW_SYNC; + /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) return true; @@ -1777,7 +1786,13 @@ bool try_to_unmap(struct page *page, enu else rmap_walk(page, &rwc); - return !page_mapcount(page) ? true : false; + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_unmap() may return false when it is about to become true, + * if page table locking is skipped: use TTU_SYNC to wait for that. + */ + return !page_mapcount(page); } /** _

4 years, 6 months

1
0
0 0

[patch 12/18] mm/thp: make is_huge_zero_pmd() safe and quicker

by Andrew Morton

From: Hugh Dickins <hughd(a)google.com> Subject: mm/thp: make is_huge_zero_pmd() safe and quicker Most callers of is_huge_zero_pmd() supply a pmd already verified present; but a few (notably zap_huge_pmd()) do not - it might be a pmd migration entry, in which the pfn is encoded differently from a present pmd: which might pass the is_huge_zero_pmd() test (though not on x86, since L1TF forced us to protect against that); or perhaps even crash in pmd_page() applied to a swap-like entry. Make it safe by adding pmd_present() check into is_huge_zero_pmd() itself; and make it quicker by saving huge_zero_pfn, so that is_huge_zero_pmd() will not need to do that pmd_page() lookup each time. __split_huge_pmd_locked() checked pmd_trans_huge() before: that worked, but is unnecessary now that is_huge_zero_pmd() checks present. Link: https://lkml.kernel.org/r/21ea9ca-a1f5-8b90-5e88-95fb1c49bbfa@google.com Fixes: e71769ae5260 ("mm: enable thp migration for shmem thp") Signed-off-by: Hugh Dickins <hughd(a)google.com> Acked-by: Kirill A. 
Shutemov <kirill.shutemov(a)linux.intel.com> Reviewed-by: Yang Shi <shy828301(a)gmail.com> Cc: Alistair Popple <apopple(a)nvidia.com> Cc: Jan Kara <jack(a)suse.cz> Cc: Jue Wang <juew(a)google.com> Cc: "Matthew Wilcox (Oracle)" <willy(a)infradead.org> Cc: Miaohe Lin <linmiaohe(a)huawei.com> Cc: Minchan Kim <minchan(a)kernel.org> Cc: Naoya Horiguchi <naoya.horiguchi(a)nec.com> Cc: Oscar Salvador <osalvador(a)suse.de> Cc: Peter Xu <peterx(a)redhat.com> Cc: Ralph Campbell <rcampbell(a)nvidia.com> Cc: Shakeel Butt <shakeelb(a)google.com> Cc: Wang Yugui <wangyugui(a)e16-tech.com> Cc: Zi Yan <ziy(a)nvidia.com> Cc: <stable(a)vger.kernel.org> Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> --- include/linux/huge_mm.h | 8 +++++++- mm/huge_memory.c | 5 ++++- 2 files changed, 11 insertions(+), 2 deletions(-) --- a/include/linux/huge_mm.h~mm-thp-make-is_huge_zero_pmd-safe-and-quicker +++ a/include/linux/huge_mm.h @@ -286,6 +286,7 @@ struct page *follow_devmap_pud(struct vm vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; +extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_page(struct page *page) { @@ -294,7 +295,7 @@ static inline bool is_huge_zero_page(str static inline bool is_huge_zero_pmd(pmd_t pmd) { - return is_huge_zero_page(pmd_page(pmd)); + return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); } static inline bool is_huge_zero_pud(pud_t pud) @@ -439,6 +440,11 @@ static inline bool is_huge_zero_page(str { return false; } + +static inline bool is_huge_zero_pmd(pmd_t pmd) +{ + return false; +} static inline bool is_huge_zero_pud(pud_t pud) { --- a/mm/huge_memory.c~mm-thp-make-is_huge_zero_pmd-safe-and-quicker +++ a/mm/huge_memory.c @@ -62,6 +62,7 @@ static struct shrinker deferred_split_sh static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; +unsigned long huge_zero_pfn __read_mostly = ~0UL; bool transparent_hugepage_enabled(struct vm_area_struct 
*vma) { @@ -98,6 +99,7 @@ retry: __free_pages(zero_page, compound_order(zero_page)); goto retry; } + WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page)); /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); @@ -147,6 +149,7 @@ static unsigned long shrink_huge_zero_pa if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct page *zero_page = xchg(&huge_zero_page, NULL); BUG_ON(zero_page == NULL); + WRITE_ONCE(huge_zero_pfn, ~0UL); __free_pages(zero_page, compound_order(zero_page)); return HPAGE_PMD_NR; } @@ -2071,7 +2074,7 @@ static void __split_huge_pmd_locked(stru return; } - if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) { + if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling * mmu_notifier_invalidate_range() see comments below inside _

4 years, 6 months

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror June 2021