From: Henry Burns <henryburns(a)google.com>
Subject: mm/zsmalloc.c: fix race condition in zs_destroy_pool
In zs_destroy_pool() we call flush_work(&pool->free_work). However, we
have no guarantee that migration isn't happening in the background at that
time.
Since migration can't directly free pages, it relies on free_work being
scheduled to free the pages. But there's nothing preventing an
in-progress migrate from queuing the work *after*
zs_unregister_migration() has called flush_work(), which would mean pages
are still pointing at the inode when we free it.
Since we know at destroy time all objects should be free, no new
migrations can come in (since zs_page_isolate() fails for fully-free
zspages). This means it is sufficient to track a "# isolated zspages"
count for the pool, and have the destroy logic wait until that count has
drained to zero before proceeding. An atomic counter plus a wait queue
on the pool keeps the logic straightforward.
In this case a memory leak could lead to an eventual crash if
compaction hits the leaked page. This crash would only occur if people
are changing their zswap backend at runtime (which eventually starts
destruction).
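For illustration, the drain-before-destroy pattern used here can be modelled
in plain userspace C. This is only a sketch: the names isolate_page(),
putback_page() and destroy_pool() are hypothetical, and the kernel code uses
an atomic counter plus smp_mb() rather than a mutex/condvar pair.

  #include <pthread.h>
  #include <stdbool.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
  static long isolated;           /* models pool->isolated_pages */
  static bool destroying;         /* models pool->destroying */

  static void isolate_page(void)          /* models zs_page_isolate() */
  {
          pthread_mutex_lock(&lock);
          isolated++;
          pthread_mutex_unlock(&lock);
  }

  static void putback_page(void)          /* models zs_pool_dec_isolated() */
  {
          pthread_mutex_lock(&lock);
          if (--isolated == 0 && destroying)
                  pthread_cond_broadcast(&drained);
          pthread_mutex_unlock(&lock);
  }

  static void destroy_pool(void)          /* models zs_unregister_migration() */
  {
          pthread_mutex_lock(&lock);
          destroying = true;
          while (isolated > 0)
                  pthread_cond_wait(&drained, &lock);
          pthread_mutex_unlock(&lock);
          /* only now is it safe to flush free_work and drop the inode */
  }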
Link: http://lkml.kernel.org/r/20190809181751.219326-2-henryburns@google.com
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Henry Burns <henryburns(a)google.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky(a)gmail.com>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zsmalloc.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 59 insertions(+), 2 deletions(-)
--- a/mm/zsmalloc.c~mm-zsmallocc-fix-race-condition-in-zs_destroy_pool
+++ a/mm/zsmalloc.c
@@ -54,6 +54,7 @@
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
+#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
@@ -268,6 +269,10 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct inode *inode;
struct work_struct free_work;
+ /* A wait queue for when migration races with async_free_zspage() */
+ struct wait_queue_head migration_wait;
+ atomic_long_t isolated_pages;
+ bool destroying;
#endif
};
@@ -1874,6 +1879,19 @@ static void putback_zspage_deferred(stru
}
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+ VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+ atomic_long_dec(&pool->isolated_pages);
+ /*
+ * There's no possibility of racing, since wait_for_isolated_drain()
+ * checks the isolated count under &class->lock after enqueuing
+ * on migration_wait.
+ */
+ if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ wake_up_all(&pool->migration_wait);
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -1943,6 +1961,7 @@ static bool zs_page_isolate(struct page
*/
if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
get_zspage_mapping(zspage, &class_idx, &fullness);
+ atomic_long_inc(&pool->isolated_pages);
remove_zspage(class, zspage, fullness);
}
@@ -2042,8 +2061,16 @@ static int zs_page_migrate(struct addres
* Page migration is done so let's putback isolated zspage to
* the list if @page is final isolated subpage in the zspage.
*/
- if (!is_zspage_isolated(zspage))
+ if (!is_zspage_isolated(zspage)) {
+ /*
+ * We cannot race with zs_destroy_pool() here because we wait
+ * for isolation to hit zero before we start destroying.
+ * Also, we ensure that everyone can see pool->destroying before
+ * we start waiting.
+ */
putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
+ }
reset_page(page);
put_page(page);
@@ -2094,8 +2121,8 @@ static void zs_page_putback(struct page
* so let's defer.
*/
putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
}
-
spin_unlock(&class->lock);
}
@@ -2118,8 +2145,36 @@ static int zs_register_migration(struct
return 0;
}
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Wait for all isolated zspages to drain before tearing down the pool */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+ /*
+ * We're in the process of destroying the pool, so there are no
+ * active allocations. zs_page_isolate() fails for completely free
+ * zspages, so we need only wait for the zs_pool's isolated
+ * count to hit zero.
+ */
+ wait_event(pool->migration_wait,
+ pool_isolated_are_drained(pool));
+}
+
static void zs_unregister_migration(struct zs_pool *pool)
{
+ pool->destroying = true;
+ /*
+ * We need a memory barrier here to ensure global visibility of
+ * pool->destroying. Thus pool->isolated pages will either be 0 in which
+ * case we don't care, or it will be > 0 and pool->destroying will
+ * ensure that we wake up once isolation hits 0.
+ */
+ smp_mb();
+ wait_for_isolated_drain(pool); /* This can block */
flush_work(&pool->free_work);
iput(pool->inode);
}
@@ -2357,6 +2412,8 @@ struct zs_pool *zs_create_pool(const cha
if (!pool->name)
goto err;
+ init_waitqueue_head(&pool->migration_wait);
+
if (create_cache(pool))
goto err;
_
From: Henry Burns <henryburns(a)google.com>
Subject: mm/zsmalloc.c: migration can leave pages in ZS_EMPTY indefinitely
In zs_page_migrate() we call putback_zspage() after we have finished
migrating all pages in this zspage. However, the return value is ignored.
If a zs_free() races in between zs_page_isolate() and zs_page_migrate(),
freeing the last object in the zspage, putback_zspage() will leave the
zspage in the ZS_EMPTY group for a potentially unbounded amount of time.
To fix this, we need to do the same thing as zs_page_putback() does:
schedule free_work to occur. To avoid duplicated code, move the sequence
to a new putback_zspage_deferred() function which both zs_page_migrate()
and zs_page_putback() call.
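To make the race concrete, one possible interleaving (illustrative only) is:

    CPU 0 (migration)                    CPU 1
    zs_page_isolate(page)
                                         zs_free() frees the last object
    zs_page_migrate(page)
      putback_zspage() returns ZS_EMPTY
      (return value discarded)

The zspage then sits in the ZS_EMPTY group with no free_work scheduled, so
nothing ever frees it.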
Link: http://lkml.kernel.org/r/20190809181751.219326-1-henryburns@google.com
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Henry Burns <henryburns(a)google.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky(a)gmail.com>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zsmalloc.c | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
--- a/mm/zsmalloc.c~mm-zsmallocc-migration-can-leave-pages-in-zs_empty-indefinitely
+++ a/mm/zsmalloc.c
@@ -1862,6 +1862,18 @@ static void dec_zspage_isolation(struct
zspage->isolated--;
}
+static void putback_zspage_deferred(struct zs_pool *pool,
+ struct size_class *class,
+ struct zspage *zspage)
+{
+ enum fullness_group fg;
+
+ fg = putback_zspage(class, zspage);
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -2031,7 +2043,7 @@ static int zs_page_migrate(struct addres
* the list if @page is final isolated subpage in the zspage.
*/
if (!is_zspage_isolated(zspage))
- putback_zspage(class, zspage);
+ putback_zspage_deferred(pool, class, zspage);
reset_page(page);
put_page(page);
@@ -2077,14 +2089,13 @@ static void zs_page_putback(struct page
spin_lock(&class->lock);
dec_zspage_isolation(zspage);
if (!is_zspage_isolated(zspage)) {
- fg = putback_zspage(class, zspage);
/*
* Due to page_lock, we cannot free zspage immediately
* so let's defer.
*/
- if (fg == ZS_EMPTY)
- schedule_work(&pool->free_work);
+ putback_zspage_deferred(pool, class, zspage);
}
+
spin_unlock(&class->lock);
}
_
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, page_owner: handle THP splits correctly
The THP splitting path is missing the split_page_owner() call that
split_page() has. As a result, split THP pages are wrongly reported in
the page_owner file as order-9 pages. Furthermore, when the former head
page is freed, the remaining former tail pages are not listed in the
page_owner file at all. Fix that by adding the split_page_owner() call
into __split_huge_page().
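For context, split_page_owner() turns the single recorded order-9 allocation
into per-subpage order-0 records. Conceptually (a simplified sketch, not the
literal mm/page_owner.c code; set_order() and copy_record() are hypothetical
helpers standing in for the page_ext manipulation):

	void split_page_owner(struct page *page, unsigned int order)
	{
		int i;

		set_order(page, 0);		/* head becomes order 0 */
		for (i = 1; i < (1 << order); i++)
			copy_record(page, page + i); /* tails get own records */
	}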
Link: http://lkml.kernel.org/r/20190820131828.22684-2-vbabka@suse.cz
Fixes: a9627bc5e34e ("mm/page_owner: introduce split_page_owner and replace manual handling")
Reported-by: Kirill A. Shutemov <kirill(a)shutemov.name>
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 4 ++++
1 file changed, 4 insertions(+)
--- a/mm/huge_memory.c~mm-page_owner-handle-thp-splits-correctly
+++ a/mm/huge_memory.c
@@ -32,6 +32,7 @@
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/page_owner.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -2516,6 +2517,9 @@ static void __split_huge_page(struct pag
}
ClearPageCompound(head);
+
+ split_page_owner(head, HPAGE_PMD_ORDER);
+
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to swap cache */
_
From: Jason Xing <kerneljasonxing(a)linux.alibaba.com>
Subject: psi: get poll_work to run when calling poll syscall next time
Only the first poll syscall delivers POLLPRI correctly; after that, the
user always fails to acquire the event signal.
Reproduce case:
1. Get the monitor code in Documentation/accounting/psi.txt
2. Run it, and wait for the event triggered.
3. Kill and restart the process.
The question is how we can end up with poll_scheduled = 1 while the work
is not running (the work is what resets it to 0). The answer is that the
scheduling side sees group->poll_kworker under RCU protection and then
schedules the work, while here we cancel the work and destroy the worker.
The cancel therefore needs to pair with resetting the poll_scheduled flag.
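For reference, the scheduling side looks roughly like this (a simplified
sketch of psi_schedule_poll_work() as of this kernel, details omitted):

	static void psi_schedule_poll_work(struct psi_group *group,
					   unsigned long delay)
	{
		struct kthread_worker *kworker;

		/* do not reschedule if already scheduled */
		if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
			return;

		rcu_read_lock();
		kworker = rcu_dereference(group->poll_kworker);
		if (likely(kworker))
			kthread_queue_delayed_work(kworker, &group->poll_work,
						   delay);
		else
			atomic_set(&group->poll_scheduled, 0);
		rcu_read_unlock();
	}

If the queued work is later cancelled without poll_scheduled being reset,
the cmpxchg above never succeeds again and poll events are lost, which
matches the observed symptom.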
Link: http://lkml.kernel.org/r/1566357985-97781-1-git-send-email-joseph.qi@linux.…
Signed-off-by: Jason Xing <kerneljasonxing(a)linux.alibaba.com>
Signed-off-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Caspar Zhang <caspar(a)linux.alibaba.com>
Reviewed-by: Suren Baghdasaryan <surenb(a)google.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/sched/psi.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/kernel/sched/psi.c~psi-get-poll_work-to-run-when-calling-poll-syscall-next-time
+++ a/kernel/sched/psi.c
@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct k
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
+ /*
+ * After the RCU grace period has expired, the worker
+ * can no longer be found through group->poll_kworker.
+ * But it might have been already scheduled before
+ * that - deschedule it cleanly before destroying it.
+ */
kthread_cancel_delayed_work_sync(&group->poll_work);
+ atomic_set(&group->poll_scheduled, 0);
+
kthread_destroy_worker(kworker_to_destroy);
}
kfree(t);
_
From: Roman Gushchin <guro(a)fb.com>
Subject: mm: memcontrol: flush percpu vmevents before releasing memcg
Similar to vmstats, percpu caching of local vmevents leads to an
accumulation of errors on non-leaf levels. This happens because some
leftovers may remain in percpu caches; they are then never propagated up
the cgroup tree and simply disappear when the memory cgroup is released.
To fix this issue, let's accumulate and propagate percpu vmevents values
before releasing the memory cgroup, similar to what we're doing with
vmstats.
Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
only over online cpus.
Link: http://lkml.kernel.org/r/20190819202338.363363-4-guro@fb.com
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Roman Gushchin <guro(a)fb.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 22 +++++++++++++++++++++-
1 file changed, 21 insertions(+), 1 deletion(-)
--- a/mm/memcontrol.c~mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg
+++ a/mm/memcontrol.c
@@ -3295,6 +3295,25 @@ static void memcg_flush_percpu_vmstats(s
}
}
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += raw_cpu_read(
+ memcg->vmstats_percpu->events[i]);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -4718,10 +4737,11 @@ static void __mem_cgroup_free(struct mem
int node;
/*
- * Flush percpu vmstats to guarantee the value correctness
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
* on parent's and all ancestor levels.
*/
memcg_flush_percpu_vmstats(memcg);
+ memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
_
From: Roman Gushchin <guro(a)fb.com>
Subject: mm: memcontrol: flush percpu vmstats before releasing memcg
Percpu caching of local vmstats with the conditional propagation by the
cgroup tree leads to an accumulation of errors on non-leaf levels.
Let's imagine two nested memory cgroups, A and A/B. Say a process
belonging to A/B allocates 100 pagecache pages on CPU 0. The percpu
cache will spill 3 times, so that 32*3=96 pages will be accounted to the
A/B and A atomic vmstat counters, and 4 pages will remain in the percpu
cache. Imagine A/B is close to its memory.max limit, so that every
following allocation triggers a direct reclaim on the local CPU. Say
each such attempt frees 16 pages on a new CPU. That means every percpu
cache will hold -16 pages, except the first one, which will hold
4 - 16 = -12. The A/B and A atomic counters will not be touched at all.
Now a user removes A/B. All percpu caches are freed and corresponding
vmstat numbers are forgotten. A has 96 pages more than expected.
As memory cgroups are created and destroyed, errors do accumulate. Even
1-2 page differences can accumulate into large numbers.
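The batching behind these (approximate) numbers looks roughly like this, a
simplified sketch of __mod_memcg_state() after commit 42a300353577, where
MEMCG_CHARGE_BATCH is 32:

	void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
	{
		long x;

		x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
		if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
			struct mem_cgroup *mi;

			/* spill the whole batch up the hierarchy */
			for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
				atomic_long_add(x, &mi->vmstats[idx]);
			x = 0;
		}
		__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
	}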
To fix this issue let's accumulate and propagate percpu vmstat values
before releasing the memory cgroup. At this point these numbers are
stable and cannot be changed.
Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
only over online cpus.
Link: http://lkml.kernel.org/r/20190819202338.363363-2-guro@fb.com
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Roman Gushchin <guro(a)fb.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
--- a/mm/memcontrol.c~mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg
+++ a/mm/memcontrol.c
@@ -3260,6 +3260,41 @@ static u64 mem_cgroup_read_u64(struct cg
}
}
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+{
+ unsigned long stat[MEMCG_NR_STAT];
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] += raw_cpu_read(
+ pn->lruvec_stat_cpu->count[i]);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -4682,6 +4717,11 @@ static void __mem_cgroup_free(struct mem
{
int node;
+ /*
+ * Flush percpu vmstats to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
_
From: David Rientjes <rientjes(a)google.com>
Subject: mm, page_alloc: move_freepages should not examine struct page of reserved memory
After commit 907ec5fca3dc ("mm: zero remaining unavailable struct pages"),
struct page of reserved memory is zeroed. This causes page->flags to be 0
and fixes issues related to reading /proc/kpageflags, for example, of
reserved memory.
The VM_BUG_ON() in move_freepages_block(), however, assumes that
page_zone() is meaningful even for reserved memory. That assumption is no
longer true after the aforementioned commit.
There's no reason why move_freepages_block() should be testing the
legitimacy of page_zone() for reserved memory; its scope is limited only
to pages on the zone's freelist.
Note that pfn_valid() can be true for reserved memory: there is a backing
struct page. The check for page_to_nid(page) is also buggy but reserved
memory normally only appears on node 0 so the zeroing doesn't affect this.
Move the debug checks to after verifying PageBuddy is true. This limits
the scope of the checks to buddy pages on the zone's freelist, which is
what move_freepages_block() is operating on. In this case, an incorrect
node or zone is a bug worthy of being warned about (and the examination
of struct page is acceptable because this memory is not reserved).
Why does move_freepages_block() get called on reserved memory? It's
simply math after finding a valid free page from the per-zone free area to
use as fallback. We find the beginning and end of the pageblock of the
valid page and that can bring us into memory that was reserved per the
e820. pfn_valid() is still true (it's backed by a struct page), but since
it's zero'd we shouldn't make any inferences here about comparing its node
or zone. The current node check just happens to succeed most of the time
by luck because reserved memory typically appears on node 0.
The fix here is to validate that we actually have buddy pages before
testing if there's any type of zone or node strangeness going on.
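Concretely, the caller's pageblock math is roughly (a simplified sketch of
move_freepages_block()):

	start_pfn = page_to_pfn(page) & ~(pageblock_nr_pages - 1);
	end_pfn = start_pfn + pageblock_nr_pages - 1;
	/*
	 * start_pfn/end_pfn may point into an e820-reserved hole: those
	 * struct pages exist (pfn_valid() is true) but are zeroed, so
	 * page_zone()/page_to_nid() on them is meaningless.
	 */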
We noticed it almost immediately after bringing 907ec5fca3dc in on
CONFIG_DEBUG_VM builds. It depends on finding specific free pages in
the per-zone free area where the math in move_freepages() will bring
the start or end pfn into reserved memory and wanting to claim that
entire pageblock as a new migratetype. So the path will be rare,
require CONFIG_DEBUG_VM, and require fallback to a different
migratetype.
Some struct pages were already zeroed from reserve pages before
907ec5fca3dc, so it theoretically could trigger before this commit. I
think it's rare enough, and behind a config option that most people
don't run, that others may not have noticed. I wouldn't argue against a
stable tag, and the backport should be easy enough, but I probably
wouldn't single out a commit that this is fixing.
Mel said:
: The overhead of the debugging check is higher with this patch although
: it'll only affect debug builds and the path is not particularly hot.
: If this was a concern, I think it would be reasonable to simply remove
: the debugging check as the zone boundaries are checked in
: move_freepages_block and we never expect a zone/node to be smaller than
: a pageblock and stuck in the middle of another zone.
Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1908122036560.10779@chino.kir.corp…
Signed-off-by: David Rientjes <rientjes(a)google.com>
Acked-by: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Masayoshi Mizuma <m.mizuma(a)jp.fujitsu.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Pavel Tatashin <pavel.tatashin(a)microsoft.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 19 ++++---------------
1 file changed, 4 insertions(+), 15 deletions(-)
--- a/mm/page_alloc.c~mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory
+++ a/mm/page_alloc.c
@@ -2238,27 +2238,12 @@ static int move_freepages(struct zone *z
unsigned int order;
int pages_moved = 0;
-#ifndef CONFIG_HOLES_IN_ZONE
- /*
- * page_zone is not safe to call in this context when
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
- * anyway as we check zone boundaries in move_freepages_block().
- * Remove at a later date when no bug reports exist related to
- * grouping pages by mobility
- */
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
- pfn_valid(page_to_pfn(end_page)) &&
- page_zone(start_page) != page_zone(end_page));
-#endif
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2273,6 +2258,10 @@ static int move_freepages(struct zone *z
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
order = page_order(page);
move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
_
From: Henry Burns <henryburns(a)google.com>
Subject: mm/z3fold.c: fix race between migration and destruction
In z3fold_destroy_pool() we call destroy_workqueue(pool->compact_wq).
However, we have no guarantee that migration isn't happening in the
background at that time.
Migration directly queues work on pool->compact_wq; if destruction wins
that race, we end up queueing work on a destroyed workqueue.
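An illustrative interleaving of that race:

    CPU 0 (z3fold_destroy_pool)          CPU 1 (migration)
                                         z3fold_page_isolate(page)
    destroy_workqueue(pool->compact_wq)
                                         z3fold_page_migrate(page)
                                           queue_work_on(new_zhdr->cpu,
                                                         pool->compact_wq, ...)
                                           -> queues onto a freed workqueue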
Link: http://lkml.kernel.org/r/20190809213828.202833-1-henryburns@google.com
Signed-off-by: Henry Burns <henryburns(a)google.com>
Cc: Vitaly Wool <vitalywool(a)gmail.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/z3fold.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 89 insertions(+)
--- a/mm/z3fold.c~mm-z3foldc-fix-race-between-migration-and-destruction
+++ a/mm/z3fold.c
@@ -41,6 +41,7 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/wait.h>
#include <linux/zpool.h>
#include <linux/magic.h>
@@ -145,6 +146,8 @@ struct z3fold_header {
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
* @inode: inode for z3fold pseudo filesystem
+ * @destroying: bool to stop migration once we start destruction
+ * @isolated: int to count the number of pages currently in isolation
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
@@ -163,8 +166,11 @@ struct z3fold_pool {
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
+ struct wait_queue_head isolate_wait;
struct work_struct work;
struct inode *inode;
+ bool destroying;
+ int isolated;
};
/*
@@ -769,6 +775,7 @@ static struct z3fold_pool *z3fold_create
goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
+ init_waitqueue_head(&pool->isolate_wait);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
if (!pool->unbuddied)
goto out_pool;
@@ -808,6 +815,15 @@ out:
return NULL;
}
+static bool pool_isolated_are_drained(struct z3fold_pool *pool)
+{
+ bool ret;
+
+ spin_lock(&pool->lock);
+ ret = pool->isolated == 0;
+ spin_unlock(&pool->lock);
+ return ret;
+}
/**
* z3fold_destroy_pool() - destroys an existing z3fold pool
* @pool: the z3fold pool to be destroyed
@@ -817,6 +833,22 @@ out:
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
kmem_cache_destroy(pool->c_handle);
+ /*
+	 * We set pool->destroying under lock to ensure that
+ * z3fold_page_isolate() sees any changes to destroying. This way we
+ * avoid the need for any memory barriers.
+ */
+
+ spin_lock(&pool->lock);
+ pool->destroying = true;
+ spin_unlock(&pool->lock);
+
+ /*
+ * We need to ensure that no pages are being migrated while we destroy
+ * these workqueues, as migration can queue work on either of the
+ * workqueues.
+ */
+	wait_event(pool->isolate_wait, pool_isolated_are_drained(pool));
/*
* We need to destroy pool->compact_wq before pool->release_wq,
@@ -1307,6 +1339,28 @@ static u64 z3fold_get_pool_size(struct z
return atomic64_read(&pool->pages_nr);
}
+/*
+ * z3fold_dec_isolated() expects to be called while pool->lock is held.
+ */
+static void z3fold_dec_isolated(struct z3fold_pool *pool)
+{
+ assert_spin_locked(&pool->lock);
+ VM_BUG_ON(pool->isolated <= 0);
+ pool->isolated--;
+
+ /*
+ * If we have no more isolated pages, we have to see if
+ * z3fold_destroy_pool() is waiting for a signal.
+ */
+ if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait))
+ wake_up_all(&pool->isolate_wait);
+}
+
+static void z3fold_inc_isolated(struct z3fold_pool *pool)
+{
+ pool->isolated++;
+}
+
static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
{
struct z3fold_header *zhdr;
@@ -1333,6 +1387,33 @@ static bool z3fold_page_isolate(struct p
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
list_del(&page->lru);
+ /*
+ * We need to check for destruction while holding pool->lock, as
+ * otherwise destruction could see 0 isolated pages, and
+ * proceed.
+ */
+ if (unlikely(pool->destroying)) {
+ spin_unlock(&pool->lock);
+ /*
+ * If this page isn't stale, somebody else holds a
+			 * reference to it. Let's drop our refcount so that they
+ * can call the release logic.
+ */
+ if (unlikely(kref_put(&zhdr->refcount,
+ release_z3fold_page_locked))) {
+ /*
+ * If we get here we have kref problems, so we
+ * should freak out.
+ */
+ WARN(1, "Z3fold is experiencing kref problems\n");
+ return false;
+ }
+ z3fold_page_unlock(zhdr);
+ return false;
+ }
+
+
+ z3fold_inc_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
return true;
@@ -1401,6 +1482,10 @@ static int z3fold_page_migrate(struct ad
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
+ spin_lock(&pool->lock);
+ z3fold_dec_isolated(pool);
+ spin_unlock(&pool->lock);
+
page_mapcount_reset(page);
put_page(page);
return 0;
@@ -1420,10 +1505,14 @@ static void z3fold_page_putback(struct p
INIT_LIST_HEAD(&page->lru);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
+ spin_lock(&pool->lock);
+ z3fold_dec_isolated(pool);
+ spin_unlock(&pool->lock);
return;
}
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
+ z3fold_dec_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
}
_
This is the start of the stable review cycle for the 4.4.190 release.
There are 78 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat 24 Aug 2019 05:18:13 PM UTC.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.4.190-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.4.190-rc1
YueHaibing <yuehaibing(a)huawei.com>
bonding: Add vlan tx offload to hw_enc_features
Xin Long <lucien.xin(a)gmail.com>
sctp: fix the transport error_count check
Huy Nguyen <huyn(a)mellanox.com>
net/mlx5e: Only support tx/rx pause setting for port owner
Ross Lagerwall <ross.lagerwall(a)citrix.com>
xen/netback: Reset nr_frags before freeing skb
Eric Dumazet <edumazet(a)google.com>
net/packet: fix race in tpacket_snd()
Matthias Kaehlcke <mka(a)chromium.org>
x86/boot: Disable the address-of-packed-member compiler warning
Joerg Roedel <jroedel(a)suse.de>
iommu/amd: Move iommu_init_pci() to .init section
Andy Lutomirski <luto(a)kernel.org>
x86/vdso: Remove direct HPET access through the vDSO
Doug Ledford <dledford(a)redhat.com>
IB/mlx5: Make coding style more consistent
Jason Gunthorpe <jgg(a)mellanox.com>
RDMA: Directly cast the sockaddr union to sockaddr
Hannes Reinecke <hare(a)suse.de>
scsi: fcoe: Embed fc_rport_priv in fcoe_rport structure
Arnd Bergmann <arnd(a)arndb.de>
asm-generic: default BUG_ON(x) to if(x)BUG()
YueHaibing <yuehaibing(a)huawei.com>
Input: psmouse - fix build error of multiple definition
Will Deacon <will(a)kernel.org>
arm64: compat: Allow single-byte watchpoints on all addresses
Miguel Ojeda <miguel.ojeda.sandonis(a)gmail.com>
include/linux/module.h: copy __init/__exit attrs to init/cleanup_module
Miguel Ojeda <miguel.ojeda.sandonis(a)gmail.com>
Backport minimal compiler_attributes.h to support GCC 9
Tony Lindgren <tony(a)atomide.com>
USB: serial: option: Add Motorola modem UARTs
Bob Ham <bob.ham(a)puri.sm>
USB: serial: option: add the BroadMobi BM818 card
Yoshiaki Okamoto <yokamoto(a)allied-telesis.co.jp>
USB: serial: option: Add support for ZTE MF871A
Rogan Dawes <rogan(a)dawes.za.net>
USB: serial: option: add D-Link DWM-222 device ID
Oliver Neukum <oneukum(a)suse.com>
usb: cdc-acm: make sure a refcount is taken early enough
Alan Stern <stern(a)rowland.harvard.edu>
USB: core: Fix races in character device registration and deregistraion
Ian Abbott <abbotti(a)mev.co.uk>
staging: comedi: dt3000: Fix rounding up of timer divisor
Ian Abbott <abbotti(a)mev.co.uk>
staging: comedi: dt3000: Fix signed integer overflow 'divider * base'
Qian Cai <cai(a)lca.pw>
asm-generic: fix -Wtype-limits compiler warnings
YueHaibing <yuehaibing(a)huawei.com>
ocfs2: remove set but not used variable 'last_hash'
Tony Luck <tony.luck(a)intel.com>
IB/core: Add mitigation for Spectre V1
Masahiro Yamada <yamada.masahiro(a)socionext.com>
kbuild: modpost: handle KBUILD_EXTRA_SYMBOLS only for external modules
Miquel Raynal <miquel.raynal(a)bootlin.com>
ata: libahci: do not complain in case of deferred probe
Don Brace <don.brace(a)microsemi.com>
scsi: hpsa: correct scsi command status issue after reset
Kees Cook <keescook(a)chromium.org>
libata: zpodd: Fix small read overflow in zpodd_get_mech_type()
Numfor Mbiziwo-Tiapo <nums(a)google.com>
perf header: Fix use of unitialized value warning
Vince Weaver <vincent.weaver(a)maine.edu>
perf header: Fix divide by zero error if f_header.attr_size==0
Lucas Stach <l.stach(a)pengutronix.de>
irqchip/irq-imx-gpcv2: Forward irq type to parent
YueHaibing <yuehaibing(a)huawei.com>
xen/pciback: remove set but not used variable 'old_state'
Denis Kirjanov <kda(a)linux-powerpc.org>
net: usb: pegasus: fix improper read if get_registers() fail
Oliver Neukum <oneukum(a)suse.com>
Input: iforce - add sanity checks
Oliver Neukum <oneukum(a)suse.com>
Input: kbtab - sanity check for endpoint type
Hillf Danton <hdanton(a)sina.com>
HID: hiddev: do cleanup in failure of opening a device
Hillf Danton <hdanton(a)sina.com>
HID: hiddev: avoid opening a disconnected device
Oliver Neukum <oneukum(a)suse.com>
HID: holtek: test for sanity of intfdata
Wenwen Wang <wenwen(a)cs.uga.edu>
ALSA: hda - Fix a memory leak bug
Miles Chen <miles.chen(a)mediatek.com>
mm/memcontrol.c: fix use after free in mem_cgroup_iter()
Yavuz, Tuba <tuba(a)ece.ufl.edu>
USB: gadget: f_midi: fixing a possible double-free in f_midi
Felipe F. Tonello <eu(a)felipetonello.com>
usb: gadget: f_midi: fail if set_alt fails to allocate requests
Gustavo A. R. Silva <gustavo(a)embeddedor.com>
sh: kernel: hw_breakpoint: Fix missing break in switch statement
Suganath Prabu <suganath-prabu.subramani(a)broadcom.com>
scsi: mpt3sas: Use 63-bit DMA addressing on SAS35 HBA
Brian Norris <briannorris(a)chromium.org>
mwifiex: fix 802.11n/WPA detection
Steve French <stfrench(a)microsoft.com>
smb3: send CAP_DFS capability during session setup
Pavel Shilovsky <pshilov(a)microsoft.com>
SMB3: Fix deadlock in validate negotiate hits reconnect
Brian Norris <briannorris(a)chromium.org>
mac80211: don't WARN on short WMM parameters from AP
Wenwen Wang <wenwen(a)cs.uga.edu>
ALSA: firewire: fix a memory leak bug
Guenter Roeck <linux(a)roeck-us.net>
hwmon: (nct7802) Fix wrong detection of in4 presence
Tomas Bortoli <tomasbortoli(a)gmail.com>
can: peak_usb: pcan_usb_fd: Fix info-leaks to USB devices
Tomas Bortoli <tomasbortoli(a)gmail.com>
can: peak_usb: pcan_usb_pro: Fix info-leaks to USB devices
Leonard Crestez <leonard.crestez(a)nxp.com>
perf/core: Fix creating kernel counters for PMUs that override event->cpu
Peter Zijlstra <peterz(a)infradead.org>
tty/ldsem, locking/rwsem: Add missing ACQUIRE to read_failed sleep loop
Tyrel Datwyler <tyreld(a)linux.vnet.ibm.com>
scsi: ibmvfc: fix WARN_ON during event pool release
Junxiao Bi <junxiao.bi(a)oracle.com>
scsi: megaraid_sas: fix panic on loading firmware crashdump
Arnd Bergmann <arnd(a)arndb.de>
ARM: davinci: fix sleep.S build error on ARMv4
Arnaldo Carvalho de Melo <acme(a)redhat.com>
perf probe: Avoid calling freeing routine multiple times for same pointer
Charles Keepax <ckeepax(a)opensource.cirrus.com>
ALSA: compress: Be more restrictive about when a drain is allowed
Charles Keepax <ckeepax(a)opensource.cirrus.com>
ALSA: compress: Prevent bypasses of set_params
Charles Keepax <ckeepax(a)opensource.cirrus.com>
ALSA: compress: Fix regression on compressed capture streams
Julian Wiedmann <jwi(a)linux.ibm.com>
s390/qdio: add sanity checks to the fast-requeue path
Wen Yang <wen.yang99(a)zte.com.cn>
cpufreq/pasemi: fix use-after-free in pas_cpufreq_cpu_init()
Björn Gerhart <gerhart(a)posteo.de>
hwmon: (nct6775) Fix register address and added missed tolerance for nct6106
Brian Norris <briannorris(a)chromium.org>
mac80211: don't warn about CW params when not using them
Thomas Tai <thomas.tai(a)oracle.com>
iscsi_ibft: make ISCSI_IBFT dependson ACPI instead of ISCSI_IBFT_FIND
Florian Westphal <fw(a)strlen.de>
netfilter: nfnetlink: avoid deadlock due to synchronous request_module
Stephane Grosjean <s.grosjean(a)peak-system.com>
can: peak_usb: fix potential double kfree_skb()
Suzuki K Poulose <suzuki.poulose(a)arm.com>
usb: yurex: Fix use-after-free in yurex_delete
Adrian Hunter <adrian.hunter(a)intel.com>
perf db-export: Fix thread__exec_comm()
Joerg Roedel <jroedel(a)suse.de>
mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy()
Joerg Roedel <jroedel(a)suse.de>
x86/mm: Sync also unmappings in vmalloc_sync_all()
Joerg Roedel <jroedel(a)suse.de>
x86/mm: Check for pfn instead of page in vmalloc_sync_one()
Wenwen Wang <wenwen(a)cs.uga.edu>
sound: fix a memory leak bug
Oliver Neukum <oneukum(a)suse.com>
usb: iowarrior: fix deadlock on disconnect
-------------
Diffstat:
Makefile | 4 +-
arch/arm/mach-davinci/sleep.S | 1 +
arch/arm64/kernel/hw_breakpoint.c | 7 +--
arch/sh/kernel/hw_breakpoint.c | 1 +
arch/x86/boot/compressed/Makefile | 1 +
arch/x86/entry/vdso/vclock_gettime.c | 15 -------
arch/x86/entry/vdso/vdso-layout.lds.S | 5 +--
arch/x86/include/asm/clocksource.h | 3 +-
arch/x86/kernel/hpet.c | 1 -
arch/x86/kvm/trace.h | 3 +-
arch/x86/mm/fault.c | 15 +++----
drivers/ata/libahci_platform.c | 3 ++
drivers/ata/libata-zpodd.c | 2 +-
drivers/cpufreq/pasemi-cpufreq.c | 23 ++++------
drivers/firmware/Kconfig | 5 ++-
drivers/firmware/iscsi_ibft.c | 4 ++
drivers/hid/hid-holtek-kbd.c | 9 +++-
drivers/hid/usbhid/hiddev.c | 12 +++++
drivers/hwmon/nct6775.c | 3 +-
drivers/hwmon/nct7802.c | 6 +--
drivers/infiniband/core/addr.c | 15 +++----
drivers/infiniband/core/user_mad.c | 6 ++-
drivers/infiniband/hw/mlx5/mr.c | 6 +--
drivers/input/joystick/iforce/iforce-usb.c | 5 +++
drivers/input/mouse/trackpoint.h | 3 +-
drivers/input/tablet/kbtab.c | 6 ++-
drivers/iommu/amd_iommu_init.c | 2 +-
drivers/irqchip/irq-imx-gpcv2.c | 1 +
drivers/net/bonding/bond_main.c | 4 +-
drivers/net/can/usb/peak_usb/pcan_usb_core.c | 8 ++--
drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 2 +-
drivers/net/can/usb/peak_usb/pcan_usb_pro.c | 2 +-
.../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 ++
drivers/net/usb/pegasus.c | 2 +-
drivers/net/wireless/mwifiex/main.h | 1 +
drivers/net/wireless/mwifiex/scan.c | 3 +-
drivers/net/xen-netback/netback.c | 2 +
drivers/s390/cio/qdio_main.c | 12 ++---
drivers/scsi/fcoe/fcoe_ctlr.c | 33 ++++++--------
drivers/scsi/hpsa.c | 12 ++++-
drivers/scsi/ibmvscsi/ibmvfc.c | 2 +-
drivers/scsi/libfc/fc_rport.c | 5 ++-
drivers/scsi/megaraid/megaraid_sas_base.c | 3 ++
drivers/scsi/mpt3sas/mpt3sas_base.c | 12 ++---
drivers/staging/comedi/drivers/dt3000.c | 8 ++--
drivers/tty/tty_ldsem.c | 5 +--
drivers/usb/class/cdc-acm.c | 18 ++++----
drivers/usb/core/file.c | 10 ++---
drivers/usb/gadget/function/f_midi.c | 6 ++-
drivers/usb/gadget/u_f.h | 2 +
drivers/usb/misc/iowarrior.c | 7 +--
drivers/usb/misc/yurex.c | 2 +-
drivers/usb/serial/option.c | 10 +++++
drivers/xen/xen-pciback/conf_space_capability.c | 3 +-
fs/cifs/smb2pdu.c | 7 ++-
fs/ocfs2/xattr.c | 3 --
include/asm-generic/bug.h | 2 +-
include/asm-generic/getorder.h | 50 +++++++++------------
include/linux/compiler.h | 16 +++++++
include/linux/module.h | 4 +-
include/scsi/libfcoe.h | 1 +
include/sound/compress_driver.h | 5 +--
kernel/events/core.c | 2 +-
mm/memcontrol.c | 41 ++++++++++++-----
mm/vmalloc.c | 9 ++++
net/mac80211/driver-ops.c | 13 ++++--
net/mac80211/mlme.c | 10 +++++
net/netfilter/nfnetlink.c | 2 +-
net/packet/af_packet.c | 7 +++
net/sctp/sm_sideeffect.c | 2 +-
scripts/Makefile.modpost | 2 +-
sound/core/compress_offload.c | 52 +++++++++++++++++-----
sound/firewire/packets-buffer.c | 2 +-
sound/pci/hda/hda_generic.c | 2 +-
sound/sound_core.c | 3 +-
tools/perf/builtin-probe.c | 10 +++++
tools/perf/util/header.c | 9 +++-
tools/perf/util/thread.c | 12 ++++-
78 files changed, 387 insertions(+), 223 deletions(-)