From: Henry Burns <henryburns(a)google.com>
Subject: mm/zsmalloc.c: fix race condition in zs_destroy_pool
In zs_destroy_pool() we call flush_work(&pool->free_work). However, we
have no guarantee that migration isn't happening in the background at that
time.
Since migration can't directly free pages, it relies on free_work being
scheduled to free the pages. But there's nothing preventing an
in-progress migrate from queuing the work *after*
zs_unregister_migration() has called flush_work(), which would mean pages
are still pointing at the inode when we free it.
Since we know at destroy time all objects should be free, no new
migrations can come in (since zs_page_isolate() fails for fully-free
zspages). This means it is sufficient to track a "# isolated zspages"
count for the pool, and have the destroy logic wait until that count has
drained to zero before proceeding. An atomic counter plus a wait queue
on the pool keeps the logic straightforward.
In this case a memory leak could lead to an eventual crash if
compaction hits the leaked page. This crash would only occur if people
are changing their zswap backend at runtime (which eventually starts
destruction).
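For illustration, the drain-before-destroy pattern used here can be modelled
in plain userspace C. This is only a sketch: the names isolate_page(),
putback_page() and destroy_pool() are hypothetical, and the kernel code uses
an atomic counter plus smp_mb() rather than a mutex/condvar pair.

  #include <pthread.h>
  #include <stdbool.h>

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
  static long isolated;           /* models pool->isolated_pages */
  static bool destroying;         /* models pool->destroying */

  static void isolate_page(void)          /* models zs_page_isolate() */
  {
          pthread_mutex_lock(&lock);
          isolated++;
          pthread_mutex_unlock(&lock);
  }

  static void putback_page(void)          /* models zs_pool_dec_isolated() */
  {
          pthread_mutex_lock(&lock);
          if (--isolated == 0 && destroying)
                  pthread_cond_broadcast(&drained);
          pthread_mutex_unlock(&lock);
  }

  static void destroy_pool(void)          /* models zs_unregister_migration() */
  {
          pthread_mutex_lock(&lock);
          destroying = true;
          while (isolated > 0)
                  pthread_cond_wait(&drained, &lock);
          pthread_mutex_unlock(&lock);
          /* only now is it safe to flush free_work and drop the inode */
  }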
Link: http://lkml.kernel.org/r/20190809181751.219326-2-henryburns@google.com
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Henry Burns <henryburns(a)google.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky(a)gmail.com>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zsmalloc.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 59 insertions(+), 2 deletions(-)
--- a/mm/zsmalloc.c~mm-zsmallocc-fix-race-condition-in-zs_destroy_pool
+++ a/mm/zsmalloc.c
@@ -54,6 +54,7 @@
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
+#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
@@ -268,6 +269,10 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct inode *inode;
struct work_struct free_work;
+ /* A wait queue for when migration races with async_free_zspage() */
+ struct wait_queue_head migration_wait;
+ atomic_long_t isolated_pages;
+ bool destroying;
#endif
};
@@ -1874,6 +1879,19 @@ static void putback_zspage_deferred(stru
}
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+ VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+ atomic_long_dec(&pool->isolated_pages);
+ /*
+ * There's no possibility of racing, since wait_for_isolated_drain()
+ * checks the isolated count under &class->lock after enqueuing
+ * on migration_wait.
+ */
+ if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ wake_up_all(&pool->migration_wait);
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -1943,6 +1961,7 @@ static bool zs_page_isolate(struct page
*/
if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
get_zspage_mapping(zspage, &class_idx, &fullness);
+ atomic_long_inc(&pool->isolated_pages);
remove_zspage(class, zspage, fullness);
}
@@ -2042,8 +2061,16 @@ static int zs_page_migrate(struct addres
* Page migration is done so let's putback isolated zspage to
* the list if @page is final isolated subpage in the zspage.
*/
- if (!is_zspage_isolated(zspage))
+ if (!is_zspage_isolated(zspage)) {
+ /*
+ * We cannot race with zs_destroy_pool() here because we wait
+ * for isolation to hit zero before we start destroying.
+ * Also, we ensure that everyone can see pool->destroying before
+ * we start waiting.
+ */
putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
+ }
reset_page(page);
put_page(page);
@@ -2094,8 +2121,8 @@ static void zs_page_putback(struct page
* so let's defer.
*/
putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
}
-
spin_unlock(&class->lock);
}
@@ -2118,8 +2145,36 @@ static int zs_register_migration(struct
return 0;
}
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Wait for all isolated zspages to drain before tearing down the pool */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+ /*
+ * We're in the process of destroying the pool, so there are no
+ * active allocations. zs_page_isolate() fails for completely free
+ * zspages, so we need only wait for the zs_pool's isolated
+ * count to hit zero.
+ */
+ wait_event(pool->migration_wait,
+ pool_isolated_are_drained(pool));
+}
+
static void zs_unregister_migration(struct zs_pool *pool)
{
+ pool->destroying = true;
+ /*
+ * We need a memory barrier here to ensure global visibility of
+ * pool->destroying. Thus pool->isolated pages will either be 0 in which
+ * case we don't care, or it will be > 0 and pool->destroying will
+ * ensure that we wake up once isolation hits 0.
+ */
+ smp_mb();
+ wait_for_isolated_drain(pool); /* This can block */
flush_work(&pool->free_work);
iput(pool->inode);
}
@@ -2357,6 +2412,8 @@ struct zs_pool *zs_create_pool(const cha
if (!pool->name)
goto err;
+ init_waitqueue_head(&pool->migration_wait);
+
if (create_cache(pool))
goto err;
_
From: Henry Burns <henryburns(a)google.com>
Subject: mm/zsmalloc.c: migration can leave pages in ZS_EMPTY indefinitely
In zs_page_migrate() we call putback_zspage() after we have finished
migrating all pages in this zspage. However, the return value is ignored.
If a zs_free() races in between zs_page_isolate() and zs_page_migrate(),
freeing the last object in the zspage, putback_zspage() will leave the
zspage in the ZS_EMPTY group for a potentially unbounded amount of time.
To fix this, we need to do the same thing as zs_page_putback() does:
schedule free_work to occur. To avoid duplicated code, move the sequence
to a new putback_zspage_deferred() function which both zs_page_migrate()
and zs_page_putback() call.
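To make the race concrete, one possible interleaving (illustrative only) is:

    CPU 0 (migration)                    CPU 1
    zs_page_isolate(page)
                                         zs_free() frees the last object
    zs_page_migrate(page)
      putback_zspage() returns ZS_EMPTY
      (return value discarded)

The zspage then sits in the ZS_EMPTY group with no free_work scheduled, so
nothing ever frees it.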
Link: http://lkml.kernel.org/r/20190809181751.219326-1-henryburns@google.com
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Henry Burns <henryburns(a)google.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky(a)gmail.com>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zsmalloc.c | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
--- a/mm/zsmalloc.c~mm-zsmallocc-migration-can-leave-pages-in-zs_empty-indefinitely
+++ a/mm/zsmalloc.c
@@ -1862,6 +1862,18 @@ static void dec_zspage_isolation(struct
zspage->isolated--;
}
+static void putback_zspage_deferred(struct zs_pool *pool,
+ struct size_class *class,
+ struct zspage *zspage)
+{
+ enum fullness_group fg;
+
+ fg = putback_zspage(class, zspage);
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -2031,7 +2043,7 @@ static int zs_page_migrate(struct addres
* the list if @page is final isolated subpage in the zspage.
*/
if (!is_zspage_isolated(zspage))
- putback_zspage(class, zspage);
+ putback_zspage_deferred(pool, class, zspage);
reset_page(page);
put_page(page);
@@ -2077,14 +2089,13 @@ static void zs_page_putback(struct page
spin_lock(&class->lock);
dec_zspage_isolation(zspage);
if (!is_zspage_isolated(zspage)) {
- fg = putback_zspage(class, zspage);
/*
* Due to page_lock, we cannot free zspage immediately
* so let's defer.
*/
- if (fg == ZS_EMPTY)
- schedule_work(&pool->free_work);
+ putback_zspage_deferred(pool, class, zspage);
}
+
spin_unlock(&class->lock);
}
_
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, page_owner: handle THP splits correctly
The THP splitting path is missing the split_page_owner() call that
split_page() has. As a result, split THP pages are wrongly reported in
the page_owner file as order-9 pages. Furthermore, when the former head
page is freed, the remaining former tail pages are not listed in the
page_owner file at all. Fix that by adding the split_page_owner() call
into __split_huge_page().
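For context, split_page_owner() turns the single recorded order-9 allocation
into per-subpage order-0 records. Conceptually (a simplified sketch, not the
literal mm/page_owner.c code; set_order() and copy_record() are hypothetical
helpers standing in for the page_ext manipulation):

	void split_page_owner(struct page *page, unsigned int order)
	{
		int i;

		set_order(page, 0);		/* head becomes order 0 */
		for (i = 1; i < (1 << order); i++)
			copy_record(page, page + i); /* tails get own records */
	}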
Link: http://lkml.kernel.org/r/20190820131828.22684-2-vbabka@suse.cz
Fixes: a9627bc5e34e ("mm/page_owner: introduce split_page_owner and replace manual handling")
Reported-by: Kirill A. Shutemov <kirill(a)shutemov.name>
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 4 ++++
1 file changed, 4 insertions(+)
--- a/mm/huge_memory.c~mm-page_owner-handle-thp-splits-correctly
+++ a/mm/huge_memory.c
@@ -32,6 +32,7 @@
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/page_owner.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -2516,6 +2517,9 @@ static void __split_huge_page(struct pag
}
ClearPageCompound(head);
+
+ split_page_owner(head, HPAGE_PMD_ORDER);
+
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to swap cache */
_
From: Jason Xing <kerneljasonxing(a)linux.alibaba.com>
Subject: psi: get poll_work to run when calling poll syscall next time
Only the first poll syscall delivers POLLPRI correctly; after that, the
user always fails to acquire the event signal.
Reproduce case:
1. Get the monitor code in Documentation/accounting/psi.txt
2. Run it, and wait for the event triggered.
3. Kill and restart the process.
The question is how we can end up with poll_scheduled = 1 while the work
is not running (the work is what resets it to 0). The answer is that the
scheduling side sees group->poll_kworker under RCU protection and then
schedules the work, while here we cancel the work and destroy the worker.
The cancel therefore needs to pair with resetting the poll_scheduled flag.
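For reference, the scheduling side looks roughly like this (a simplified
sketch of psi_schedule_poll_work() as of this kernel, details omitted):

	static void psi_schedule_poll_work(struct psi_group *group,
					   unsigned long delay)
	{
		struct kthread_worker *kworker;

		/* do not reschedule if already scheduled */
		if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
			return;

		rcu_read_lock();
		kworker = rcu_dereference(group->poll_kworker);
		if (likely(kworker))
			kthread_queue_delayed_work(kworker, &group->poll_work,
						   delay);
		else
			atomic_set(&group->poll_scheduled, 0);
		rcu_read_unlock();
	}

If the queued work is later cancelled without poll_scheduled being reset,
the cmpxchg above never succeeds again and poll events are lost, which
matches the observed symptom.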
Link: http://lkml.kernel.org/r/1566357985-97781-1-git-send-email-joseph.qi@linux.…
Signed-off-by: Jason Xing <kerneljasonxing(a)linux.alibaba.com>
Signed-off-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Caspar Zhang <caspar(a)linux.alibaba.com>
Reviewed-by: Suren Baghdasaryan <surenb(a)google.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/sched/psi.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/kernel/sched/psi.c~psi-get-poll_work-to-run-when-calling-poll-syscall-next-time
+++ a/kernel/sched/psi.c
@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct k
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
+ /*
+ * After the RCU grace period has expired, the worker
+ * can no longer be found through group->poll_kworker.
+ * But it might have been already scheduled before
+ * that - deschedule it cleanly before destroying it.
+ */
kthread_cancel_delayed_work_sync(&group->poll_work);
+ atomic_set(&group->poll_scheduled, 0);
+
kthread_destroy_worker(kworker_to_destroy);
}
kfree(t);
_
From: Roman Gushchin <guro(a)fb.com>
Subject: mm: memcontrol: flush percpu vmevents before releasing memcg
Similar to vmstats, percpu caching of local vmevents leads to an
accumulation of errors on non-leaf levels. This happens because some
leftovers may remain in percpu caches; they are then never propagated up
the cgroup tree and simply disappear when the memory cgroup is released.
To fix this issue, let's accumulate and propagate percpu vmevents values
before releasing the memory cgroup, similar to what we're doing with
vmstats.
Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
only over online cpus.
Link: http://lkml.kernel.org/r/20190819202338.363363-4-guro@fb.com
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Roman Gushchin <guro(a)fb.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 22 +++++++++++++++++++++-
1 file changed, 21 insertions(+), 1 deletion(-)
--- a/mm/memcontrol.c~mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg
+++ a/mm/memcontrol.c
@@ -3295,6 +3295,25 @@ static void memcg_flush_percpu_vmstats(s
}
}
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += raw_cpu_read(
+ memcg->vmstats_percpu->events[i]);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -4718,10 +4737,11 @@ static void __mem_cgroup_free(struct mem
int node;
/*
- * Flush percpu vmstats to guarantee the value correctness
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
* on parent's and all ancestor levels.
*/
memcg_flush_percpu_vmstats(memcg);
+ memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
_
From: Roman Gushchin <guro(a)fb.com>
Subject: mm: memcontrol: flush percpu vmstats before releasing memcg
Percpu caching of local vmstats with the conditional propagation by the
cgroup tree leads to an accumulation of errors on non-leaf levels.
Let's imagine two nested memory cgroups, A and A/B. Say a process
belonging to A/B allocates 100 pagecache pages on CPU 0. The percpu
cache will spill 3 times, so that 32*3=96 pages will be accounted to the
A/B and A atomic vmstat counters, and 4 pages will remain in the percpu
cache. Imagine A/B is close to its memory.max limit, so that every
following allocation triggers a direct reclaim on the local CPU. Say
each such attempt frees 16 pages on a new CPU. That means every percpu
cache will hold -16 pages, except the first one, which will hold
4 - 16 = -12. The A/B and A atomic counters will not be touched at all.
Now a user removes A/B. All percpu caches are freed and corresponding
vmstat numbers are forgotten. A has 96 pages more than expected.
As memory cgroups are created and destroyed, errors do accumulate. Even
1-2 page differences can accumulate into large numbers.
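The batching behind these (approximate) numbers looks roughly like this, a
simplified sketch of __mod_memcg_state() after commit 42a300353577, where
MEMCG_CHARGE_BATCH is 32:

	void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
	{
		long x;

		x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
		if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
			struct mem_cgroup *mi;

			/* spill the whole batch up the hierarchy */
			for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
				atomic_long_add(x, &mi->vmstats[idx]);
			x = 0;
		}
		__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
	}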
To fix this issue let's accumulate and propagate percpu vmstat values
before releasing the memory cgroup. At this point these numbers are
stable and cannot be changed.
Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
only over online cpus.
Link: http://lkml.kernel.org/r/20190819202338.363363-2-guro@fb.com
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Roman Gushchin <guro(a)fb.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
--- a/mm/memcontrol.c~mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg
+++ a/mm/memcontrol.c
@@ -3260,6 +3260,41 @@ static u64 mem_cgroup_read_u64(struct cg
}
}
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+{
+ unsigned long stat[MEMCG_NR_STAT];
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] += raw_cpu_read(
+ pn->lruvec_stat_cpu->count[i]);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -4682,6 +4717,11 @@ static void __mem_cgroup_free(struct mem
{
int node;
+ /*
+ * Flush percpu vmstats to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
_
From: David Rientjes <rientjes(a)google.com>
Subject: mm, page_alloc: move_freepages should not examine struct page of reserved memory
After commit 907ec5fca3dc ("mm: zero remaining unavailable struct pages"),
struct page of reserved memory is zeroed. This causes page->flags to be 0
and fixes issues related to reading /proc/kpageflags, for example, of
reserved memory.
The VM_BUG_ON() in move_freepages_block(), however, assumes that
page_zone() is meaningful even for reserved memory. That assumption is no
longer true after the aforementioned commit.
There's no reason why move_freepages_block() should be testing the
legitimacy of page_zone() for reserved memory; its scope is limited only
to pages on the zone's freelist.
Note that pfn_valid() can be true for reserved memory: there is a backing
struct page. The check for page_to_nid(page) is also buggy but reserved
memory normally only appears on node 0 so the zeroing doesn't affect this.
Move the debug checks to after verifying PageBuddy is true. This limits
the scope of the checks to buddy pages on the zone's freelist, which is
what move_freepages_block() is operating on. In this case, an incorrect
node or zone is a bug worthy of being warned about (and the examination
of struct page is acceptable because this memory is not reserved).
Why does move_freepages_block() get called on reserved memory? It's
simply math after finding a valid free page from the per-zone free area to
use as fallback. We find the beginning and end of the pageblock of the
valid page and that can bring us into memory that was reserved per the
e820. pfn_valid() is still true (it's backed by a struct page), but since
it's zero'd we shouldn't make any inferences here about comparing its node
or zone. The current node check just happens to succeed most of the time
by luck because reserved memory typically appears on node 0.
The fix here is to validate that we actually have buddy pages before
testing if there's any type of zone or node strangeness going on.
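Concretely, the caller's pageblock math is roughly (a simplified sketch of
move_freepages_block()):

	start_pfn = page_to_pfn(page) & ~(pageblock_nr_pages - 1);
	end_pfn = start_pfn + pageblock_nr_pages - 1;
	/*
	 * start_pfn/end_pfn may point into an e820-reserved hole: those
	 * struct pages exist (pfn_valid() is true) but are zeroed, so
	 * page_zone()/page_to_nid() on them is meaningless.
	 */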
We noticed it almost immediately after bringing 907ec5fca3dc in on
CONFIG_DEBUG_VM builds. It depends on finding specific free pages in
the per-zone free area where the math in move_freepages() will bring
the start or end pfn into reserved memory and wanting to claim that
entire pageblock as a new migratetype. So the path will be rare,
require CONFIG_DEBUG_VM, and require fallback to a different
migratetype.
Some struct pages were already zeroed from reserve pages before
907ec5fca3dc, so it theoretically could trigger before this commit. I
think it's rare enough, and behind a config option that most people
don't run, that others may not have noticed. I wouldn't argue against a
stable tag, and the backport should be easy enough, but I probably
wouldn't single out a commit that this is fixing.
Mel said:
: The overhead of the debugging check is higher with this patch although
: it'll only affect debug builds and the path is not particularly hot.
: If this was a concern, I think it would be reasonable to simply remove
: the debugging check as the zone boundaries are checked in
: move_freepages_block and we never expect a zone/node to be smaller than
: a pageblock and stuck in the middle of another zone.
Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1908122036560.10779@chino.kir.corp…
Signed-off-by: David Rientjes <rientjes(a)google.com>
Acked-by: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Masayoshi Mizuma <m.mizuma(a)jp.fujitsu.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Pavel Tatashin <pavel.tatashin(a)microsoft.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 19 ++++---------------
1 file changed, 4 insertions(+), 15 deletions(-)
--- a/mm/page_alloc.c~mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory
+++ a/mm/page_alloc.c
@@ -2238,27 +2238,12 @@ static int move_freepages(struct zone *z
unsigned int order;
int pages_moved = 0;
-#ifndef CONFIG_HOLES_IN_ZONE
- /*
- * page_zone is not safe to call in this context when
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
- * anyway as we check zone boundaries in move_freepages_block().
- * Remove at a later date when no bug reports exist related to
- * grouping pages by mobility
- */
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
- pfn_valid(page_to_pfn(end_page)) &&
- page_zone(start_page) != page_zone(end_page));
-#endif
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2273,6 +2258,10 @@ static int move_freepages(struct zone *z
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
order = page_order(page);
move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
_
From: Henry Burns <henryburns(a)google.com>
Subject: mm/z3fold.c: fix race between migration and destruction
In z3fold_destroy_pool() we call destroy_workqueue(pool->compact_wq).
However, we have no guarantee that migration isn't happening in the
background at that time.
Migration directly queues work on pool->compact_wq; if destruction wins
that race, we end up queueing work on a destroyed workqueue.
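An illustrative interleaving of that race:

    CPU 0 (z3fold_destroy_pool)          CPU 1 (migration)
                                         z3fold_page_isolate(page)
    destroy_workqueue(pool->compact_wq)
                                         z3fold_page_migrate(page)
                                           queue_work_on(new_zhdr->cpu,
                                                         pool->compact_wq, ...)
                                           -> queues onto a freed workqueue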
Link: http://lkml.kernel.org/r/20190809213828.202833-1-henryburns@google.com
Signed-off-by: Henry Burns <henryburns(a)google.com>
Cc: Vitaly Wool <vitalywool(a)gmail.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/z3fold.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 89 insertions(+)
--- a/mm/z3fold.c~mm-z3foldc-fix-race-between-migration-and-destruction
+++ a/mm/z3fold.c
@@ -41,6 +41,7 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/wait.h>
#include <linux/zpool.h>
#include <linux/magic.h>
@@ -145,6 +146,8 @@ struct z3fold_header {
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
* @inode: inode for z3fold pseudo filesystem
+ * @destroying: bool to stop migration once we start destruction
+ * @isolated: int to count the number of pages currently in isolation
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
@@ -163,8 +166,11 @@ struct z3fold_pool {
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
+ struct wait_queue_head isolate_wait;
struct work_struct work;
struct inode *inode;
+ bool destroying;
+ int isolated;
};
/*
@@ -769,6 +775,7 @@ static struct z3fold_pool *z3fold_create
goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
+ init_waitqueue_head(&pool->isolate_wait);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
if (!pool->unbuddied)
goto out_pool;
@@ -808,6 +815,15 @@ out:
return NULL;
}
+static bool pool_isolated_are_drained(struct z3fold_pool *pool)
+{
+ bool ret;
+
+ spin_lock(&pool->lock);
+ ret = pool->isolated == 0;
+ spin_unlock(&pool->lock);
+ return ret;
+}
/**
* z3fold_destroy_pool() - destroys an existing z3fold pool
* @pool: the z3fold pool to be destroyed
@@ -817,6 +833,22 @@ out:
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
kmem_cache_destroy(pool->c_handle);
+ /*
+	 * We set pool->destroying under lock to ensure that
+ * z3fold_page_isolate() sees any changes to destroying. This way we
+ * avoid the need for any memory barriers.
+ */
+
+ spin_lock(&pool->lock);
+ pool->destroying = true;
+ spin_unlock(&pool->lock);
+
+ /*
+ * We need to ensure that no pages are being migrated while we destroy
+ * these workqueues, as migration can queue work on either of the
+ * workqueues.
+ */
+	wait_event(pool->isolate_wait, pool_isolated_are_drained(pool));
/*
* We need to destroy pool->compact_wq before pool->release_wq,
@@ -1307,6 +1339,28 @@ static u64 z3fold_get_pool_size(struct z
return atomic64_read(&pool->pages_nr);
}
+/*
+ * z3fold_dec_isolated() expects to be called while pool->lock is held.
+ */
+static void z3fold_dec_isolated(struct z3fold_pool *pool)
+{
+ assert_spin_locked(&pool->lock);
+ VM_BUG_ON(pool->isolated <= 0);
+ pool->isolated--;
+
+ /*
+ * If we have no more isolated pages, we have to see if
+ * z3fold_destroy_pool() is waiting for a signal.
+ */
+ if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait))
+ wake_up_all(&pool->isolate_wait);
+}
+
+static void z3fold_inc_isolated(struct z3fold_pool *pool)
+{
+ pool->isolated++;
+}
+
static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
{
struct z3fold_header *zhdr;
@@ -1333,6 +1387,33 @@ static bool z3fold_page_isolate(struct p
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
list_del(&page->lru);
+ /*
+ * We need to check for destruction while holding pool->lock, as
+ * otherwise destruction could see 0 isolated pages, and
+ * proceed.
+ */
+ if (unlikely(pool->destroying)) {
+ spin_unlock(&pool->lock);
+ /*
+ * If this page isn't stale, somebody else holds a
+			 * reference to it. Let's drop our refcount so that they
+ * can call the release logic.
+ */
+ if (unlikely(kref_put(&zhdr->refcount,
+ release_z3fold_page_locked))) {
+ /*
+ * If we get here we have kref problems, so we
+ * should freak out.
+ */
+ WARN(1, "Z3fold is experiencing kref problems\n");
+ return false;
+ }
+ z3fold_page_unlock(zhdr);
+ return false;
+ }
+
+
+ z3fold_inc_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
return true;
@@ -1401,6 +1482,10 @@ static int z3fold_page_migrate(struct ad
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
+ spin_lock(&pool->lock);
+ z3fold_dec_isolated(pool);
+ spin_unlock(&pool->lock);
+
page_mapcount_reset(page);
put_page(page);
return 0;
@@ -1420,10 +1505,14 @@ static void z3fold_page_putback(struct p
INIT_LIST_HEAD(&page->lru);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
+ spin_lock(&pool->lock);
+ z3fold_dec_isolated(pool);
+ spin_unlock(&pool->lock);
return;
}
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
+ z3fold_dec_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
}
_
This is the start of the stable review cycle for the 4.4.190 release.
There are 78 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat 24 Aug 2019 05:18:13 PM UTC.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.4.190-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.4.190-rc1
YueHaibing <yuehaibing(a)huawei.com>
bonding: Add vlan tx offload to hw_enc_features
Xin Long <lucien.xin(a)gmail.com>
sctp: fix the transport error_count check
Huy Nguyen <huyn(a)mellanox.com>
net/mlx5e: Only support tx/rx pause setting for port owner
Ross Lagerwall <ross.lagerwall(a)citrix.com>
xen/netback: Reset nr_frags before freeing skb
Eric Dumazet <edumazet(a)google.com>
net/packet: fix race in tpacket_snd()
Matthias Kaehlcke <mka(a)chromium.org>
x86/boot: Disable the address-of-packed-member compiler warning
Joerg Roedel <jroedel(a)suse.de>
iommu/amd: Move iommu_init_pci() to .init section
Andy Lutomirski <luto(a)kernel.org>
x86/vdso: Remove direct HPET access through the vDSO
Doug Ledford <dledford(a)redhat.com>
IB/mlx5: Make coding style more consistent
Jason Gunthorpe <jgg(a)mellanox.com>
RDMA: Directly cast the sockaddr union to sockaddr
Hannes Reinecke <hare(a)suse.de>
scsi: fcoe: Embed fc_rport_priv in fcoe_rport structure
Arnd Bergmann <arnd(a)arndb.de>
asm-generic: default BUG_ON(x) to if(x)BUG()
YueHaibing <yuehaibing(a)huawei.com>
Input: psmouse - fix build error of multiple definition
Will Deacon <will(a)kernel.org>
arm64: compat: Allow single-byte watchpoints on all addresses
Miguel Ojeda <miguel.ojeda.sandonis(a)gmail.com>
include/linux/module.h: copy __init/__exit attrs to init/cleanup_module
Miguel Ojeda <miguel.ojeda.sandonis(a)gmail.com>
Backport minimal compiler_attributes.h to support GCC 9
Tony Lindgren <tony(a)atomide.com>
USB: serial: option: Add Motorola modem UARTs
Bob Ham <bob.ham(a)puri.sm>
USB: serial: option: add the BroadMobi BM818 card
Yoshiaki Okamoto <yokamoto(a)allied-telesis.co.jp>
USB: serial: option: Add support for ZTE MF871A
Rogan Dawes <rogan(a)dawes.za.net>
USB: serial: option: add D-Link DWM-222 device ID
Oliver Neukum <oneukum(a)suse.com>
usb: cdc-acm: make sure a refcount is taken early enough
Alan Stern <stern(a)rowland.harvard.edu>
USB: core: Fix races in character device registration and deregistraion
Ian Abbott <abbotti(a)mev.co.uk>
staging: comedi: dt3000: Fix rounding up of timer divisor
Ian Abbott <abbotti(a)mev.co.uk>
staging: comedi: dt3000: Fix signed integer overflow 'divider * base'
Qian Cai <cai(a)lca.pw>
asm-generic: fix -Wtype-limits compiler warnings
YueHaibing <yuehaibing(a)huawei.com>
ocfs2: remove set but not used variable 'last_hash'
Tony Luck <tony.luck(a)intel.com>
IB/core: Add mitigation for Spectre V1
Masahiro Yamada <yamada.masahiro(a)socionext.com>
kbuild: modpost: handle KBUILD_EXTRA_SYMBOLS only for external modules
Miquel Raynal <miquel.raynal(a)bootlin.com>
ata: libahci: do not complain in case of deferred probe
Don Brace <don.brace(a)microsemi.com>
scsi: hpsa: correct scsi command status issue after reset
Kees Cook <keescook(a)chromium.org>
libata: zpodd: Fix small read overflow in zpodd_get_mech_type()
Numfor Mbiziwo-Tiapo <nums(a)google.com>
perf header: Fix use of unitialized value warning
Vince Weaver <vincent.weaver(a)maine.edu>
perf header: Fix divide by zero error if f_header.attr_size==0
Lucas Stach <l.stach(a)pengutronix.de>
irqchip/irq-imx-gpcv2: Forward irq type to parent
YueHaibing <yuehaibing(a)huawei.com>
xen/pciback: remove set but not used variable 'old_state'
Denis Kirjanov <kda(a)linux-powerpc.org>
net: usb: pegasus: fix improper read if get_registers() fail
Oliver Neukum <oneukum(a)suse.com>
Input: iforce - add sanity checks
Oliver Neukum <oneukum(a)suse.com>
Input: kbtab - sanity check for endpoint type
Hillf Danton <hdanton(a)sina.com>
HID: hiddev: do cleanup in failure of opening a device
Hillf Danton <hdanton(a)sina.com>
HID: hiddev: avoid opening a disconnected device
Oliver Neukum <oneukum(a)suse.com>
HID: holtek: test for sanity of intfdata
Wenwen Wang <wenwen(a)cs.uga.edu>
ALSA: hda - Fix a memory leak bug
Miles Chen <miles.chen(a)mediatek.com>
mm/memcontrol.c: fix use after free in mem_cgroup_iter()
Yavuz, Tuba <tuba(a)ece.ufl.edu>
USB: gadget: f_midi: fixing a possible double-free in f_midi
Felipe F. Tonello <eu(a)felipetonello.com>
usb: gadget: f_midi: fail if set_alt fails to allocate requests
Gustavo A. R. Silva <gustavo(a)embeddedor.com>
sh: kernel: hw_breakpoint: Fix missing break in switch statement
Suganath Prabu <suganath-prabu.subramani(a)broadcom.com>
scsi: mpt3sas: Use 63-bit DMA addressing on SAS35 HBA
Brian Norris <briannorris(a)chromium.org>
mwifiex: fix 802.11n/WPA detection
Steve French <stfrench(a)microsoft.com>
smb3: send CAP_DFS capability during session setup
Pavel Shilovsky <pshilov(a)microsoft.com>
SMB3: Fix deadlock in validate negotiate hits reconnect
Brian Norris <briannorris(a)chromium.org>
mac80211: don't WARN on short WMM parameters from AP
Wenwen Wang <wenwen(a)cs.uga.edu>
ALSA: firewire: fix a memory leak bug
Guenter Roeck <linux(a)roeck-us.net>
hwmon: (nct7802) Fix wrong detection of in4 presence
Tomas Bortoli <tomasbortoli(a)gmail.com>
can: peak_usb: pcan_usb_fd: Fix info-leaks to USB devices
Tomas Bortoli <tomasbortoli(a)gmail.com>
can: peak_usb: pcan_usb_pro: Fix info-leaks to USB devices
Leonard Crestez <leonard.crestez(a)nxp.com>
perf/core: Fix creating kernel counters for PMUs that override event->cpu
Peter Zijlstra <peterz(a)infradead.org>
tty/ldsem, locking/rwsem: Add missing ACQUIRE to read_failed sleep loop
Tyrel Datwyler <tyreld(a)linux.vnet.ibm.com>
scsi: ibmvfc: fix WARN_ON during event pool release
Junxiao Bi <junxiao.bi(a)oracle.com>
scsi: megaraid_sas: fix panic on loading firmware crashdump
Arnd Bergmann <arnd(a)arndb.de>
ARM: davinci: fix sleep.S build error on ARMv4
Arnaldo Carvalho de Melo <acme(a)redhat.com>
perf probe: Avoid calling freeing routine multiple times for same pointer
Charles Keepax <ckeepax(a)opensource.cirrus.com>
ALSA: compress: Be more restrictive about when a drain is allowed
Charles Keepax <ckeepax(a)opensource.cirrus.com>
ALSA: compress: Prevent bypasses of set_params
Charles Keepax <ckeepax(a)opensource.cirrus.com>
ALSA: compress: Fix regression on compressed capture streams
Julian Wiedmann <jwi(a)linux.ibm.com>
s390/qdio: add sanity checks to the fast-requeue path
Wen Yang <wen.yang99(a)zte.com.cn>
cpufreq/pasemi: fix use-after-free in pas_cpufreq_cpu_init()
Björn Gerhart <gerhart(a)posteo.de>
hwmon: (nct6775) Fix register address and added missed tolerance for nct6106
Brian Norris <briannorris(a)chromium.org>
mac80211: don't warn about CW params when not using them
Thomas Tai <thomas.tai(a)oracle.com>
iscsi_ibft: make ISCSI_IBFT dependson ACPI instead of ISCSI_IBFT_FIND
Florian Westphal <fw(a)strlen.de>
netfilter: nfnetlink: avoid deadlock due to synchronous request_module
Stephane Grosjean <s.grosjean(a)peak-system.com>
can: peak_usb: fix potential double kfree_skb()
Suzuki K Poulose <suzuki.poulose(a)arm.com>
usb: yurex: Fix use-after-free in yurex_delete
Adrian Hunter <adrian.hunter(a)intel.com>
perf db-export: Fix thread__exec_comm()
Joerg Roedel <jroedel(a)suse.de>
mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy()
Joerg Roedel <jroedel(a)suse.de>
x86/mm: Sync also unmappings in vmalloc_sync_all()
Joerg Roedel <jroedel(a)suse.de>
x86/mm: Check for pfn instead of page in vmalloc_sync_one()
Wenwen Wang <wenwen(a)cs.uga.edu>
sound: fix a memory leak bug
Oliver Neukum <oneukum(a)suse.com>
usb: iowarrior: fix deadlock on disconnect
-------------
Diffstat:
Makefile | 4 +-
arch/arm/mach-davinci/sleep.S | 1 +
arch/arm64/kernel/hw_breakpoint.c | 7 +--
arch/sh/kernel/hw_breakpoint.c | 1 +
arch/x86/boot/compressed/Makefile | 1 +
arch/x86/entry/vdso/vclock_gettime.c | 15 -------
arch/x86/entry/vdso/vdso-layout.lds.S | 5 +--
arch/x86/include/asm/clocksource.h | 3 +-
arch/x86/kernel/hpet.c | 1 -
arch/x86/kvm/trace.h | 3 +-
arch/x86/mm/fault.c | 15 +++----
drivers/ata/libahci_platform.c | 3 ++
drivers/ata/libata-zpodd.c | 2 +-
drivers/cpufreq/pasemi-cpufreq.c | 23 ++++------
drivers/firmware/Kconfig | 5 ++-
drivers/firmware/iscsi_ibft.c | 4 ++
drivers/hid/hid-holtek-kbd.c | 9 +++-
drivers/hid/usbhid/hiddev.c | 12 +++++
drivers/hwmon/nct6775.c | 3 +-
drivers/hwmon/nct7802.c | 6 +--
drivers/infiniband/core/addr.c | 15 +++----
drivers/infiniband/core/user_mad.c | 6 ++-
drivers/infiniband/hw/mlx5/mr.c | 6 +--
drivers/input/joystick/iforce/iforce-usb.c | 5 +++
drivers/input/mouse/trackpoint.h | 3 +-
drivers/input/tablet/kbtab.c | 6 ++-
drivers/iommu/amd_iommu_init.c | 2 +-
drivers/irqchip/irq-imx-gpcv2.c | 1 +
drivers/net/bonding/bond_main.c | 4 +-
drivers/net/can/usb/peak_usb/pcan_usb_core.c | 8 ++--
drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 2 +-
drivers/net/can/usb/peak_usb/pcan_usb_pro.c | 2 +-
.../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 ++
drivers/net/usb/pegasus.c | 2 +-
drivers/net/wireless/mwifiex/main.h | 1 +
drivers/net/wireless/mwifiex/scan.c | 3 +-
drivers/net/xen-netback/netback.c | 2 +
drivers/s390/cio/qdio_main.c | 12 ++---
drivers/scsi/fcoe/fcoe_ctlr.c | 33 ++++++--------
drivers/scsi/hpsa.c | 12 ++++-
drivers/scsi/ibmvscsi/ibmvfc.c | 2 +-
drivers/scsi/libfc/fc_rport.c | 5 ++-
drivers/scsi/megaraid/megaraid_sas_base.c | 3 ++
drivers/scsi/mpt3sas/mpt3sas_base.c | 12 ++---
drivers/staging/comedi/drivers/dt3000.c | 8 ++--
drivers/tty/tty_ldsem.c | 5 +--
drivers/usb/class/cdc-acm.c | 18 ++++----
drivers/usb/core/file.c | 10 ++---
drivers/usb/gadget/function/f_midi.c | 6 ++-
drivers/usb/gadget/u_f.h | 2 +
drivers/usb/misc/iowarrior.c | 7 +--
drivers/usb/misc/yurex.c | 2 +-
drivers/usb/serial/option.c | 10 +++++
drivers/xen/xen-pciback/conf_space_capability.c | 3 +-
fs/cifs/smb2pdu.c | 7 ++-
fs/ocfs2/xattr.c | 3 --
include/asm-generic/bug.h | 2 +-
include/asm-generic/getorder.h | 50 +++++++++------------
include/linux/compiler.h | 16 +++++++
include/linux/module.h | 4 +-
include/scsi/libfcoe.h | 1 +
include/sound/compress_driver.h | 5 +--
kernel/events/core.c | 2 +-
mm/memcontrol.c | 41 ++++++++++++-----
mm/vmalloc.c | 9 ++++
net/mac80211/driver-ops.c | 13 ++++--
net/mac80211/mlme.c | 10 +++++
net/netfilter/nfnetlink.c | 2 +-
net/packet/af_packet.c | 7 +++
net/sctp/sm_sideeffect.c | 2 +-
scripts/Makefile.modpost | 2 +-
sound/core/compress_offload.c | 52 +++++++++++++++++-----
sound/firewire/packets-buffer.c | 2 +-
sound/pci/hda/hda_generic.c | 2 +-
sound/sound_core.c | 3 +-
tools/perf/builtin-probe.c | 10 +++++
tools/perf/util/header.c | 9 +++-
tools/perf/util/thread.c | 12 ++++-
78 files changed, 387 insertions(+), 223 deletions(-)