The patch titled
Subject: mm/zsmalloc.c: fix race condition in zs_destroy_pool
has been removed from the -mm tree. Its filename was
mm-zsmallocc-fix-race-condition-in-zs_destroy_pool.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Henry Burns <henryburns(a)google.com>
Subject: mm/zsmalloc.c: fix race condition in zs_destroy_pool
In zs_destroy_pool() we call flush_work(&pool->free_work). However, we
have no guarantee that migration isn't happening in the background at that
time.
Since migration can't directly free pages, it relies on free_work being
scheduled to free them. But there's nothing preventing an in-progress
migration from queuing the work *after* zs_unregister_migration() has
called flush_work(), which would leave pages still pointing at the inode
when we free it.
Since we know that at destroy time all objects should be free, no new
migrations can come in (zs_page_isolate() fails for fully-free zspages).
This means it is sufficient to track a "# isolated zspages" count in the
pool, and have zs_unregister_migration() wait for that count to drain to
zero before flushing free_work and dropping the inode. An atomic counter
plus a dedicated wait queue keeps the logic straightforward.
In this case a memory leak could lead to an eventual crash if
compaction hits the leaked page. This crash would only occur if people
are changing their zswap backend at runtime (which eventually starts
destruction).
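To make the ordering concrete, here is a minimal userspace analogue of the
drain-before-destroy pattern the fix relies on. All names are invented for
illustration and pthread primitives stand in for the kernel's
wait_event()/wake_up_all(); this is only a sketch of the idea, not the
zsmalloc code itself.

/*
 * Hypothetical userspace sketch (invented names): in-flight "migrations"
 * bump a counter, the destroy path refuses new ones and waits for the
 * counter to drain to zero before tearing anything down.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static long isolated;		/* number of in-flight "migrations" */
static int destroying;		/* set once teardown has started */

static int migration_start(void)
{
	int ok;

	pthread_mutex_lock(&lock);
	ok = !destroying;	/* mirrors zs_page_isolate() refusing new work */
	if (ok)
		isolated++;
	pthread_mutex_unlock(&lock);
	return ok;
}

static void migration_finish(void)
{
	pthread_mutex_lock(&lock);
	isolated--;
	/* The last finisher wakes a destroyer that may already be waiting. */
	if (isolated == 0 && destroying)
		pthread_cond_broadcast(&drained);
	pthread_mutex_unlock(&lock);
}

static void *migrator(void *arg)
{
	(void)arg;
	if (!migration_start())
		return NULL;
	usleep(1000);		/* pretend to move a page */
	migration_finish();
	return NULL;
}

static void destroy_pool(void)
{
	pthread_mutex_lock(&lock);
	destroying = 1;
	/* Analogue of wait_for_isolated_drain(): sleep until the count is 0. */
	while (isolated != 0)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
	/* Only now is it safe to flush pending work and free shared state. */
	printf("all in-flight migrations drained, tearing down\n");
}

int main(void)
{
	pthread_t t[4];
	int i;

	for (i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, migrator, NULL);
	destroy_pool();
	for (i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}

The kernel version replaces the mutex/condvar pair with an atomic counter,
a waitqueue and an smp_mb() that publishes pool->destroying, but the
ordering requirement is the same: the isolated count must be observed at
zero, after destroying is visible, before teardown may proceed.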
Link: http://lkml.kernel.org/r/20190809181751.219326-2-henryburns@google.com
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Henry Burns <henryburns(a)google.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky(a)gmail.com>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zsmalloc.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 59 insertions(+), 2 deletions(-)
--- a/mm/zsmalloc.c~mm-zsmallocc-fix-race-condition-in-zs_destroy_pool
+++ a/mm/zsmalloc.c
@@ -54,6 +54,7 @@
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
+#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
@@ -268,6 +269,10 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct inode *inode;
struct work_struct free_work;
+ /* A wait queue for when migration races with async_free_zspage() */
+ struct wait_queue_head migration_wait;
+ atomic_long_t isolated_pages;
+ bool destroying;
#endif
};
@@ -1874,6 +1879,19 @@ static void putback_zspage_deferred(stru
}
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+ VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+ atomic_long_dec(&pool->isolated_pages);
+ /*
+ * There's no possibility of racing, since wait_for_isolated_drain()
+ * checks the isolated count under &class->lock after enqueuing
+ * on migration_wait.
+ */
+ if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ wake_up_all(&pool->migration_wait);
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -1943,6 +1961,7 @@ static bool zs_page_isolate(struct page
*/
if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
get_zspage_mapping(zspage, &class_idx, &fullness);
+ atomic_long_inc(&pool->isolated_pages);
remove_zspage(class, zspage, fullness);
}
@@ -2042,8 +2061,16 @@ static int zs_page_migrate(struct addres
* Page migration is done so let's putback isolated zspage to
* the list if @page is final isolated subpage in the zspage.
*/
- if (!is_zspage_isolated(zspage))
+ if (!is_zspage_isolated(zspage)) {
+ /*
+ * We cannot race with zs_destroy_pool() here because we wait
+ * for isolation to hit zero before we start destroying.
+ * Also, we ensure that everyone can see pool->destroying before
+ * we start waiting.
+ */
putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
+ }
reset_page(page);
put_page(page);
@@ -2094,8 +2121,8 @@ static void zs_page_putback(struct page
* so let's defer.
*/
putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
}
-
spin_unlock(&class->lock);
}
@@ -2118,8 +2145,36 @@ static int zs_register_migration(struct
return 0;
}
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Function for resolving migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+ /*
+ * We're in the process of destroying the pool, so there are no
+ * active allocations. zs_page_isolate() fails for completely free
+ * zspages, so we need only wait for the zs_pool's isolated
+ * count to hit zero.
+ */
+ wait_event(pool->migration_wait,
+ pool_isolated_are_drained(pool));
+}
+
static void zs_unregister_migration(struct zs_pool *pool)
{
+ pool->destroying = true;
+ /*
+ * We need a memory barrier here to ensure global visibility of
+ * pool->destroying. Thus pool->isolated pages will either be 0 in which
+ * case we don't care, or it will be > 0 and pool->destroying will
+ * ensure that we wake up once isolation hits 0.
+ */
+ smp_mb();
+ wait_for_isolated_drain(pool); /* This can block */
flush_work(&pool->free_work);
iput(pool->inode);
}
@@ -2357,6 +2412,8 @@ struct zs_pool *zs_create_pool(const cha
if (!pool->name)
goto err;
+ init_waitqueue_head(&pool->migration_wait);
+
if (create_cache(pool))
goto err;
_
Patches currently in -mm which might be from henryburns(a)google.com are
The patch titled
Subject: mm/zsmalloc.c: migration can leave pages in ZS_EMPTY indefinitely
has been removed from the -mm tree. Its filename was
mm-zsmallocc-migration-can-leave-pages-in-zs_empty-indefinitely.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Henry Burns <henryburns(a)google.com>
Subject: mm/zsmalloc.c: migration can leave pages in ZS_EMPTY indefinitely
In zs_page_migrate() we call putback_zspage() after we have finished
migrating all pages in this zspage. However, the return value is ignored.
If a zs_free() races in between zs_page_isolate() and zs_page_migrate(),
freeing the last object in the zspage, putback_zspage() will leave the
page in ZS_EMPTY for potentially an unbounded amount of time.
To fix this, we need to do the same thing as zs_page_putback() does:
schedule free_work to occur. To avoid duplicated code, move the sequence
to a new putback_zspage_deferred() function which both zs_page_migrate()
and zs_page_putback() call.
Link: http://lkml.kernel.org/r/20190809181751.219326-1-henryburns@google.com
Fixes: 48b4800a1c6a ("zsmalloc: page migration support")
Signed-off-by: Henry Burns <henryburns(a)google.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky(a)gmail.com>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/zsmalloc.c | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
--- a/mm/zsmalloc.c~mm-zsmallocc-migration-can-leave-pages-in-zs_empty-indefinitely
+++ a/mm/zsmalloc.c
@@ -1862,6 +1862,18 @@ static void dec_zspage_isolation(struct
zspage->isolated--;
}
+static void putback_zspage_deferred(struct zs_pool *pool,
+ struct size_class *class,
+ struct zspage *zspage)
+{
+ enum fullness_group fg;
+
+ fg = putback_zspage(class, zspage);
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -2031,7 +2043,7 @@ static int zs_page_migrate(struct addres
* the list if @page is final isolated subpage in the zspage.
*/
if (!is_zspage_isolated(zspage))
- putback_zspage(class, zspage);
+ putback_zspage_deferred(pool, class, zspage);
reset_page(page);
put_page(page);
@@ -2077,14 +2089,13 @@ static void zs_page_putback(struct page
spin_lock(&class->lock);
dec_zspage_isolation(zspage);
if (!is_zspage_isolated(zspage)) {
- fg = putback_zspage(class, zspage);
/*
* Due to page_lock, we cannot free zspage immediately
* so let's defer.
*/
- if (fg == ZS_EMPTY)
- schedule_work(&pool->free_work);
+ putback_zspage_deferred(pool, class, zspage);
}
+
spin_unlock(&class->lock);
}
_
Patches currently in -mm which might be from henryburns(a)google.com are
The patch titled
Subject: mm, page_owner: handle THP splits correctly
has been removed from the -mm tree. Its filename was
mm-page_owner-handle-thp-splits-correctly.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Vlastimil Babka <vbabka(a)suse.cz>
Subject: mm, page_owner: handle THP splits correctly
THP splitting path is missing the split_page_owner() call that
split_page() has. As a result, split THP pages are wrongly reported in
the page_owner file as order-9 pages. Furthermore, when the former head
page is freed, the remaining former tail pages are not listed in the
page_owner file at all. This patch fixes that by adding the
split_page_owner() call into __split_huge_page().
Link: http://lkml.kernel.org/r/20190820131828.22684-2-vbabka@suse.cz
Fixes: a9627bc5e34e ("mm/page_owner: introduce split_page_owner and replace manual handling")
Reported-by: Kirill A. Shutemov <kirill(a)shutemov.name>
Signed-off-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 4 ++++
1 file changed, 4 insertions(+)
--- a/mm/huge_memory.c~mm-page_owner-handle-thp-splits-correctly
+++ a/mm/huge_memory.c
@@ -32,6 +32,7 @@
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/page_owner.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -2516,6 +2517,9 @@ static void __split_huge_page(struct pag
}
ClearPageCompound(head);
+
+ split_page_owner(head, HPAGE_PMD_ORDER);
+
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to swap cache */
_
Patches currently in -mm which might be from vbabka(a)suse.cz are
mm-page_owner-record-page-owner-for-each-subpage.patch
mm-page_owner-keep-owner-info-when-freeing-the-page.patch
mm-page_owner-debug_pagealloc-save-and-dump-freeing-stack-trace.patch
mm-compaction-clear-total_migratefree_scanned-before-scanning-a-new-zone-fix-2.patch
mm-reclaim-cleanup-should_continue_reclaim.patch
mm-compaction-raise-compaction-priority-after-it-withdrawns.patch
The patch titled
Subject: psi: get poll_work to run when calling poll syscall next time
has been removed from the -mm tree. Its filename was
psi-get-poll_work-to-run-when-calling-poll-syscall-next-time.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Jason Xing <kerneljasonxing(a)linux.alibaba.com>
Subject: psi: get poll_work to run when calling poll syscall next time
Only the first time the poll() syscall is called does the user receive
POLLPRI correctly. After that, the user always fails to acquire the event
signal.
Reproduce case:
1. Get the monitor code in Documentation/accounting/psi.txt
2. Run it, and wait for the event to be triggered.
3. Kill and restart the process.
The question is why we can end up with poll_scheduled = 1 but the work
not running (which would reset it to 0). The answer is that the
scheduling side sees group->poll_kworker under RCU protection and then
schedules it, but here we cancel the work and destroy the worker. The
cancel therefore needs to pair with resetting the poll_scheduled flag.
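To make that pairing concrete, here is a small hypothetical userspace
sketch (invented names, plain atomics instead of the real kthread worker)
of why the cancel path must also reset the scheduled flag:

/*
 * Hypothetical sketch (invented names): the flag is set when work is
 * queued and cleared when the work runs, so a teardown path that cancels
 * the queued work must clear the flag itself, or later triggers will
 * believe work is still pending and never re-queue it.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int poll_scheduled;	/* 1 while work is believed queued */
static bool work_queued;		/* stand-in for the kworker queue */

static void try_schedule_poll_work(void)
{
	int expected = 0;

	/* Only one scheduler wins; everyone else assumes the work will run. */
	if (atomic_compare_exchange_strong(&poll_scheduled, &expected, 1))
		work_queued = true;
}

static void poll_work_fn(void)
{
	work_queued = false;
	atomic_store(&poll_scheduled, 0);	/* the normal reset path */
}

static void trigger_destroy(void)
{
	/* Analogue of kthread_cancel_delayed_work_sync(): the queued work is
	 * dropped and will never run, so it cannot reset the flag itself. */
	work_queued = false;
	/* Without this reset, every later try_schedule_poll_work() is a
	 * no-op and poll() never sees another event. */
	atomic_store(&poll_scheduled, 0);
}

int main(void)
{
	try_schedule_poll_work();	/* work queued, flag set to 1 */
	trigger_destroy();		/* cancelled before it ran */
	try_schedule_poll_work();	/* only works because destroy reset the flag */
	printf("poll_scheduled=%d work_queued=%d\n",
	       atomic_load(&poll_scheduled), (int)work_queued);
	if (work_queued)
		poll_work_fn();
	return 0;
}

In the actual fix the reset is the atomic_set(&group->poll_scheduled, 0)
added right after kthread_cancel_delayed_work_sync(), restoring the
invariant that poll_scheduled == 1 implies the work will eventually run
and clear it.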
Link: http://lkml.kernel.org/r/1566357985-97781-1-git-send-email-joseph.qi@linux.…
Signed-off-by: Jason Xing <kerneljasonxing(a)linux.alibaba.com>
Signed-off-by: Joseph Qi <joseph.qi(a)linux.alibaba.com>
Reviewed-by: Caspar Zhang <caspar(a)linux.alibaba.com>
Reviewed-by: Suren Baghdasaryan <surenb(a)google.com>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/sched/psi.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/kernel/sched/psi.c~psi-get-poll_work-to-run-when-calling-poll-syscall-next-time
+++ a/kernel/sched/psi.c
@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct k
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
+ /*
+ * After the RCU grace period has expired, the worker
+ * can no longer be found through group->poll_kworker.
+ * But it might have been already scheduled before
+ * that - deschedule it cleanly before destroying it.
+ */
kthread_cancel_delayed_work_sync(&group->poll_work);
+ atomic_set(&group->poll_scheduled, 0);
+
kthread_destroy_worker(kworker_to_destroy);
}
kfree(t);
_
Patches currently in -mm which might be from kerneljasonxing(a)linux.alibaba.com are
The patch titled
Subject: mm: memcontrol: flush percpu vmevents before releasing memcg
has been removed from the -mm tree. Its filename was
mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Roman Gushchin <guro(a)fb.com>
Subject: mm: memcontrol: flush percpu vmevents before releasing memcg
Similar to vmstats, percpu caching of local vmevents leads to an
accumulation of errors on non-leaf levels. This happens because some
leftovers may remain in percpu caches, so that they are never propagated
up the cgroup tree and simply disappear when the memory cgroup is
released.
To fix this issue, let's accumulate and propagate percpu vmevent values
before releasing the memory cgroup, similar to what we're doing with
vmstats.
Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
only over online cpus.
Link: http://lkml.kernel.org/r/20190819202338.363363-4-guro@fb.com
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Roman Gushchin <guro(a)fb.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 22 +++++++++++++++++++++-
1 file changed, 21 insertions(+), 1 deletion(-)
--- a/mm/memcontrol.c~mm-memcontrol-flush-percpu-vmevents-before-releasing-memcg
+++ a/mm/memcontrol.c
@@ -3295,6 +3295,25 @@ static void memcg_flush_percpu_vmstats(s
}
}
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += raw_cpu_read(
+ memcg->vmstats_percpu->events[i]);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -4718,10 +4737,11 @@ static void __mem_cgroup_free(struct mem
int node;
/*
- * Flush percpu vmstats to guarantee the value correctness
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
* on parent's and all ancestor levels.
*/
memcg_flush_percpu_vmstats(memcg);
+ memcg_flush_percpu_vmevents(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
_
Patches currently in -mm which might be from guro(a)fb.com are
mm-memcontrol-flush-percpu-slab-vmstats-on-kmem-offlining.patch
partially-revert-mm-memcontrolc-keep-local-vm-counters-in-sync-with-the-hierarchical-ones.patch
mm-memcontrol-switch-to-rcu-protection-in-drain_all_stock.patch
The patch titled
Subject: mm: memcontrol: flush percpu vmstats before releasing memcg
has been removed from the -mm tree. Its filename was
mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Roman Gushchin <guro(a)fb.com>
Subject: mm: memcontrol: flush percpu vmstats before releasing memcg
Percpu caching of local vmstats with the conditional propagation by the
cgroup tree leads to an accumulation of errors on non-leaf levels.
Let's imagine two nested memory cgroups A and A/B. Say, a process
belonging to A/B allocates 100 pagecache pages on CPU 0. The percpu
cache will spill 3 times, so that 32*3=96 pages will be accounted to the
A/B and A atomic vmstat counters, and 4 pages will remain in the percpu
cache.
Imagine A/B is near its memory.max limit, so that every following
allocation triggers a direct reclaim on the local CPU. Say, each such
attempt will free 16 pages on a new CPU. That means every percpu cache
will have -16 pages, except the first one, which will have 4 - 16 = -12.
The A/B and A atomic counters will not be touched at all.
Now a user removes A/B. All percpu caches are freed and corresponding
vmstat numbers are forgotten. A has 96 pages more than expected.
As memory cgroups are created and destroyed, errors do accumulate. Even
1-2 page differences can accumulate into large numbers.
To fix this issue, let's accumulate and propagate percpu vmstat values
before releasing the memory cgroup. At this point these numbers are
stable and cannot be changed.
Since on cpu hotplug we do flush percpu vmstats anyway, we can iterate
only over online cpus.
Link: http://lkml.kernel.org/r/20190819202338.363363-2-guro@fb.com
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Roman Gushchin <guro(a)fb.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
--- a/mm/memcontrol.c~mm-memcontrol-flush-percpu-vmstats-before-releasing-memcg
+++ a/mm/memcontrol.c
@@ -3260,6 +3260,41 @@ static u64 mem_cgroup_read_u64(struct cg
}
}
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+{
+ unsigned long stat[MEMCG_NR_STAT];
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += raw_cpu_read(memcg->vmstats_percpu->stat[i]);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] += raw_cpu_read(
+ pn->lruvec_stat_cpu->count[i]);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -4682,6 +4717,11 @@ static void __mem_cgroup_free(struct mem
{
int node;
+ /*
+ * Flush percpu vmstats to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg);
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
_
Patches currently in -mm which might be from guro(a)fb.com are
mm-memcontrol-flush-percpu-slab-vmstats-on-kmem-offlining.patch
partially-revert-mm-memcontrolc-keep-local-vm-counters-in-sync-with-the-hierarchical-ones.patch
mm-memcontrol-switch-to-rcu-protection-in-drain_all_stock.patch
The patch titled
Subject: mm, page_alloc: move_freepages should not examine struct page of reserved memory
has been removed from the -mm tree. Its filename was
mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: David Rientjes <rientjes(a)google.com>
Subject: mm, page_alloc: move_freepages should not examine struct page of reserved memory
After commit 907ec5fca3dc ("mm: zero remaining unavailable struct pages"),
the struct page of reserved memory is zeroed. This causes page->flags to
be 0 and fixes issues related to reading /proc/kpageflags of reserved
memory, for example.
The VM_BUG_ON() in move_freepages_block(), however, assumes that
page_zone() is meaningful even for reserved memory. That assumption is no
longer true after the aforementioned commit.
There's no reason why move_freepages_block() should be testing the
legitimacy of page_zone() for reserved memory; its scope is limited only
to pages on the zone's freelist.
Note that pfn_valid() can be true for reserved memory: there is a backing
struct page. The check for page_to_nid(page) is also buggy but reserved
memory normally only appears on node 0 so the zeroing doesn't affect this.
Move the debug checks to after verifying PageBuddy is true. This limits
the scope of the checks to buddy pages on the zone's freelist, which is
what move_freepages_block() operates on. In that case, an incorrect node
or zone is a bug worth warning about (and examining the struct page is
acceptable because this memory is not reserved).
Why does move_freepages_block() get called on reserved memory? It's
simply math after finding a valid free page from the per-zone free area
to use as fallback. We find the beginning and end of the pageblock of
the valid page, and that can bring us into memory that was reserved per
the e820. pfn_valid() is still true (it's backed by a struct page), but
since it's zeroed we shouldn't make any inferences here by comparing its
node or zone. The current node check just happens to succeed most of the
time by luck because reserved memory typically appears on node 0.
The fix here is to validate that we actually have buddy pages before
testing if there's any type of zone or node strangeness going on.
We noticed it almost immediately after bringing 907ec5fca3dc in on
CONFIG_DEBUG_VM builds. It depends on finding specific free pages in
the per-zone free area where the math in move_freepages() will bring
the start or end pfn into reserved memory and wanting to claim that
entire pageblock as a new migratetype. So the path will be rare,
require CONFIG_DEBUG_VM, and require fallback to a different
migratetype.
Some struct pages of reserved memory were already zeroed before
907ec5fca3dc, so this could theoretically trigger before that commit. I
think it's rare enough, behind a config option most people don't run,
that others may not have noticed. I wouldn't argue against a stable tag,
and the backport should be easy enough, but I probably wouldn't single
out a commit that this is fixing.
Mel said:
: The overhead of the debugging check is higher with this patch although
: it'll only affect debug builds and the path is not particularly hot.
: If this was a concern, I think it would be reasonable to simply remove
: the debugging check as the zone boundaries are checked in
: move_freepages_block and we never expect a zone/node to be smaller than
: a pageblock and stuck in the middle of another zone.
Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1908122036560.10779@chino.kir.corp…
Signed-off-by: David Rientjes <rientjes(a)google.com>
Acked-by: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Naoya Horiguchi <n-horiguchi(a)ah.jp.nec.com>
Cc: Masayoshi Mizuma <m.mizuma(a)jp.fujitsu.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Pavel Tatashin <pavel.tatashin(a)microsoft.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 19 ++++---------------
1 file changed, 4 insertions(+), 15 deletions(-)
--- a/mm/page_alloc.c~mm-page_alloc-move_freepages-should-not-examine-struct-page-of-reserved-memory
+++ a/mm/page_alloc.c
@@ -2238,27 +2238,12 @@ static int move_freepages(struct zone *z
unsigned int order;
int pages_moved = 0;
-#ifndef CONFIG_HOLES_IN_ZONE
- /*
- * page_zone is not safe to call in this context when
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
- * anyway as we check zone boundaries in move_freepages_block().
- * Remove at a later date when no bug reports exist related to
- * grouping pages by mobility
- */
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
- pfn_valid(page_to_pfn(end_page)) &&
- page_zone(start_page) != page_zone(end_page));
-#endif
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2273,6 +2258,10 @@ static int move_freepages(struct zone *z
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
order = page_order(page);
move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
_
Patches currently in -mm which might be from rientjes(a)google.com are
The patch titled
Subject: mm/z3fold.c: fix race between migration and destruction
has been removed from the -mm tree. Its filename was
mm-z3foldc-fix-race-between-migration-and-destruction.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Henry Burns <henryburns(a)google.com>
Subject: mm/z3fold.c: fix race between migration and destruction
In z3fold_destroy_pool() we call destroy_workqueue(&pool->compact_wq).
However, we have no guarantee that migration isn't happening in the
background at that time.
Migration directly calls queue_work_on(pool->compact_wq); if destruction
wins that race, we are using a destroyed workqueue.
Link: http://lkml.kernel.org/r/20190809213828.202833-1-henryburns@google.com
Signed-off-by: Henry Burns <henryburns(a)google.com>
Cc: Vitaly Wool <vitalywool(a)gmail.com>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: Henry Burns <henrywolfeburns(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/z3fold.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 89 insertions(+)
--- a/mm/z3fold.c~mm-z3foldc-fix-race-between-migration-and-destruction
+++ a/mm/z3fold.c
@@ -41,6 +41,7 @@
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/wait.h>
#include <linux/zpool.h>
#include <linux/magic.h>
@@ -145,6 +146,8 @@ struct z3fold_header {
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
* @inode: inode for z3fold pseudo filesystem
+ * @destroying: bool to stop migration once we start destruction
+ * @isolated: int to count the number of pages currently in isolation
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
@@ -163,8 +166,11 @@ struct z3fold_pool {
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
+ struct wait_queue_head isolate_wait;
struct work_struct work;
struct inode *inode;
+ bool destroying;
+ int isolated;
};
/*
@@ -769,6 +775,7 @@ static struct z3fold_pool *z3fold_create
goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
+ init_waitqueue_head(&pool->isolate_wait);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
if (!pool->unbuddied)
goto out_pool;
@@ -808,6 +815,15 @@ out:
return NULL;
}
+static bool pool_isolated_are_drained(struct z3fold_pool *pool)
+{
+ bool ret;
+
+ spin_lock(&pool->lock);
+ ret = pool->isolated == 0;
+ spin_unlock(&pool->lock);
+ return ret;
+}
/**
* z3fold_destroy_pool() - destroys an existing z3fold pool
* @pool: the z3fold pool to be destroyed
@@ -817,6 +833,22 @@ out:
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
kmem_cache_destroy(pool->c_handle);
+ /*
+ * We set pool->destroying under lock to ensure that
+ * z3fold_page_isolate() sees any changes to destroying. This way we
+ * avoid the need for any memory barriers.
+ */
+
+ spin_lock(&pool->lock);
+ pool->destroying = true;
+ spin_unlock(&pool->lock);
+
+ /*
+ * We need to ensure that no pages are being migrated while we destroy
+ * these workqueues, as migration can queue work on either of the
+ * workqueues.
+ */
+ wait_event(pool->isolate_wait, pool_isolated_are_drained(pool));
/*
* We need to destroy pool->compact_wq before pool->release_wq,
@@ -1307,6 +1339,28 @@ static u64 z3fold_get_pool_size(struct z
return atomic64_read(&pool->pages_nr);
}
+/*
+ * z3fold_dec_isolated() expects to be called while pool->lock is held.
+ */
+static void z3fold_dec_isolated(struct z3fold_pool *pool)
+{
+ assert_spin_locked(&pool->lock);
+ VM_BUG_ON(pool->isolated <= 0);
+ pool->isolated--;
+
+ /*
+ * If we have no more isolated pages, we have to see if
+ * z3fold_destroy_pool() is waiting for a signal.
+ */
+ if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait))
+ wake_up_all(&pool->isolate_wait);
+}
+
+static void z3fold_inc_isolated(struct z3fold_pool *pool)
+{
+ pool->isolated++;
+}
+
static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
{
struct z3fold_header *zhdr;
@@ -1333,6 +1387,33 @@ static bool z3fold_page_isolate(struct p
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
list_del(&page->lru);
+ /*
+ * We need to check for destruction while holding pool->lock, as
+ * otherwise destruction could see 0 isolated pages, and
+ * proceed.
+ */
+ if (unlikely(pool->destroying)) {
+ spin_unlock(&pool->lock);
+ /*
+ * If this page isn't stale, somebody else holds a
+ * reference to it. Let't drop our refcount so that they
+ * can call the release logic.
+ */
+ if (unlikely(kref_put(&zhdr->refcount,
+ release_z3fold_page_locked))) {
+ /*
+ * If we get here we have kref problems, so we
+ * should freak out.
+ */
+ WARN(1, "Z3fold is experiencing kref problems\n");
+ return false;
+ }
+ z3fold_page_unlock(zhdr);
+ return false;
+ }
+
+
+ z3fold_inc_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
return true;
@@ -1401,6 +1482,10 @@ static int z3fold_page_migrate(struct ad
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
+ spin_lock(&pool->lock);
+ z3fold_dec_isolated(pool);
+ spin_unlock(&pool->lock);
+
page_mapcount_reset(page);
put_page(page);
return 0;
@@ -1420,10 +1505,14 @@ static void z3fold_page_putback(struct p
INIT_LIST_HEAD(&page->lru);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
+ spin_lock(&pool->lock);
+ z3fold_dec_isolated(pool);
+ spin_unlock(&pool->lock);
return;
}
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
+ z3fold_dec_isolated(pool);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
}
_
Patches currently in -mm which might be from henryburns(a)google.com are
This is the start of the stable review cycle for the 4.14.137 release.
There are 53 patches in this series; all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Wed 07 Aug 2019 12:47:58 PM UTC.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.14.137-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.14.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.14.137-rc1
Andy Lutomirski <luto(a)kernel.org>
x86/vdso: Prevent segfaults due to hoisted vclock reads
Linus Torvalds <torvalds(a)linux-foundation.org>
gcc-9: properly declare the {pv,hv}clock_page storage
Josh Poimboeuf <jpoimboe(a)redhat.com>
objtool: Support GCC 9 cold subfunction naming scheme
Jean Delvare <jdelvare(a)suse.de>
eeprom: at24: make spd world-readable again
John Fleck <john.fleck(a)intel.com>
IB/hfi1: Check for error on call to alloc_rsm_map_table
Yishai Hadas <yishaih(a)mellanox.com>
IB/mlx5: Fix RSS Toeplitz setup to be aligned with the HW specification
Yishai Hadas <yishaih(a)mellanox.com>
IB/mlx5: Move MRs to a kernel PD when freeing them to the MR cache
Yishai Hadas <yishaih(a)mellanox.com>
IB/mlx5: Use direct mkey destroy command upon UMR unreg failure
Yishai Hadas <yishaih(a)mellanox.com>
IB/mlx5: Fix unreg_umr to ignore the mkey state
Juergen Gross <jgross(a)suse.com>
xen/swiotlb: fix condition for calling xen_destroy_contiguous_region()
Munehisa Kamata <kamatam(a)amazon.com>
nbd: replace kill_bdev() with __invalidate_device() again
Will Deacon <will(a)kernel.org>
drivers/perf: arm_pmu: Fix failure path in PM notifier
Helge Deller <deller(a)gmx.de>
parisc: Fix build of compressed kernel even with debug enabled
Stefan Haberland <sth(a)linux.ibm.com>
s390/dasd: fix endless loop after read unit address configuration
Ondrej Mosnacek <omosnace(a)redhat.com>
selinux: fix memory leak in policydb_init()
Gustavo A. R. Silva <gustavo(a)embeddedor.com>
IB/hfi1: Fix Spectre v1 vulnerability
Michael Wu <michael.wu(a)vatics.com>
gpiolib: fix incorrect IRQ requesting of an active-low lineevent
Douglas Anderson <dianders(a)chromium.org>
mmc: dw_mmc: Fix occasional hang after tuning on eMMC
Filipe Manana <fdmanana(a)suse.com>
Btrfs: fix race leading to fs corruption after transaction abort
Filipe Manana <fdmanana(a)suse.com>
Btrfs: fix incremental send failure after deduplication
Masahiro Yamada <yamada.masahiro(a)socionext.com>
kbuild: initialize CLANG_FLAGS correctly in the top Makefile
Yongxin Liu <yongxin.liu(a)windriver.com>
drm/nouveau: fix memory leak in nouveau_conn_reset()
Zhenzhong Duan <zhenzhong.duan(a)oracle.com>
x86, boot: Remove multiple copy of static function sanitize_boot_params()
Josh Poimboeuf <jpoimboe(a)redhat.com>
x86/paravirt: Fix callee-saved function ELF sizes
Josh Poimboeuf <jpoimboe(a)redhat.com>
x86/kvm: Don't call kvm_spurious_fault() from .fixup
Zhenzhong Duan <zhenzhong.duan(a)oracle.com>
xen/pv: Fix a boot up hang revealed by int3 self test
Kees Cook <keescook(a)chromium.org>
ipc/mqueue.c: only perform resource calculation if user valid
Dan Carpenter <dan.carpenter(a)oracle.com>
drivers/rapidio/devices/rio_mport_cdev.c: NUL terminate some strings
Mikko Rapeli <mikko.rapeli(a)iki.fi>
uapi linux/coda_psdev.h: move upc_req definition from uapi to kernel side headers
Sam Protsenko <semen.protsenko(a)linaro.org>
coda: fix build using bare-metal toolchain
Zhouyang Jia <jiazhouyang09(a)gmail.com>
coda: add error handling for fget
Doug Berger <opendmb(a)gmail.com>
mm/cma.c: fail if fixed declaration can't be honored
Arnd Bergmann <arnd(a)arndb.de>
x86: math-emu: Hide clang warnings for 16-bit overflow
Qian Cai <cai(a)lca.pw>
x86/apic: Silence -Wtype-limits compiler warnings
Benjamin Poirier <bpoirier(a)suse.com>
be2net: Signal that the device cannot transmit during reconfiguration
Arnd Bergmann <arnd(a)arndb.de>
ACPI: fix false-positive -Wuninitialized warning
Arnd Bergmann <arnd(a)arndb.de>
x86: kvm: avoid constant-conversion warning
Benjamin Block <bblock(a)linux.ibm.com>
scsi: zfcp: fix GCC compiler warning emitted with -Wmaybe-uninitialized
Arnd Bergmann <arnd(a)arndb.de>
ACPI: blacklist: fix clang warning for unused DMI table
Jeff Layton <jlayton(a)kernel.org>
ceph: return -ERANGE if virtual xattr value didn't fit in buffer
Andrea Parri <andrea.parri(a)amarulasolutions.com>
ceph: fix improper use of smp_mb__before_atomic()
Ronnie Sahlberg <lsahlber(a)redhat.com>
cifs: Fix a race condition with cifs_echo_request
David Sterba <dsterba(a)suse.com>
btrfs: fix minimum number of chunk errors for DUP
Russell King <rmk+kernel(a)armlinux.org.uk>
fs/adfs: super: fix use-after-free bug
JC Kuo <jckuo(a)nvidia.com>
clk: tegra210: fix PLLU and PLLU_OUT1
Geert Uytterhoeven <geert+renesas(a)glider.be>
dmaengine: rcar-dmac: Reject zero-length slave DMA requests
Petr Cvek <petrcvekcz(a)gmail.com>
MIPS: lantiq: Fix bitfield masking
Prarit Bhargava <prarit(a)redhat.com>
kernel/module.c: Only return -EEXIST for modules that have finished loading
Cheng Jian <cj.chengjian(a)huawei.com>
ftrace: Enable trampoline when rec count returns back to one
Douglas Anderson <dianders(a)chromium.org>
ARM: dts: rockchip: Mark that the rk3288 timer might stop in suspend
Douglas Anderson <dianders(a)chromium.org>
ARM: dts: rockchip: Make rk3288-veyron-mickey's emmc work again
Douglas Anderson <dianders(a)chromium.org>
ARM: dts: rockchip: Make rk3288-veyron-minnie run at hs200
Russell King <rmk+kernel(a)armlinux.org.uk>
ARM: riscpc: fix DMA
-------------
Diffstat:
Makefile | 7 +--
arch/arm/boot/dts/rk3288-veyron-mickey.dts | 4 --
arch/arm/boot/dts/rk3288-veyron-minnie.dts | 4 --
arch/arm/boot/dts/rk3288.dtsi | 1 +
arch/arm/mach-rpc/dma.c | 5 +-
arch/mips/lantiq/irq.c | 5 +-
arch/parisc/boot/compressed/vmlinux.lds.S | 4 +-
arch/x86/boot/compressed/misc.c | 1 +
arch/x86/boot/compressed/misc.h | 1 -
arch/x86/entry/entry_64.S | 1 -
arch/x86/entry/vdso/vclock_gettime.c | 19 +++++--
arch/x86/include/asm/apic.h | 2 +-
arch/x86/include/asm/kvm_host.h | 34 +++++++------
arch/x86/include/asm/paravirt.h | 1 +
arch/x86/include/asm/traps.h | 2 +-
arch/x86/kernel/apic/apic.c | 2 +-
arch/x86/kernel/kvm.c | 1 +
arch/x86/kvm/mmu.c | 6 +--
arch/x86/math-emu/fpu_emu.h | 2 +-
arch/x86/math-emu/reg_constant.c | 2 +-
arch/x86/xen/enlighten_pv.c | 2 +-
arch/x86/xen/xen-asm_64.S | 1 -
drivers/acpi/blacklist.c | 4 ++
drivers/block/nbd.c | 2 +-
drivers/clk/tegra/clk-tegra210.c | 8 +--
drivers/dma/sh/rcar-dmac.c | 2 +-
drivers/gpio/gpiolib.c | 6 ++-
drivers/gpu/drm/nouveau/nouveau_connector.c | 2 +-
drivers/infiniband/hw/hfi1/chip.c | 11 ++++-
drivers/infiniband/hw/hfi1/verbs.c | 2 +
drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 +
drivers/infiniband/hw/mlx5/mr.c | 17 ++++---
drivers/infiniband/hw/mlx5/qp.c | 13 +++--
drivers/misc/eeprom/at24.c | 2 +-
drivers/mmc/host/dw_mmc.c | 3 +-
drivers/net/ethernet/emulex/benet/be_main.c | 6 ++-
drivers/perf/arm_pmu.c | 2 +-
drivers/rapidio/devices/rio_mport_cdev.c | 2 +
drivers/s390/block/dasd_alias.c | 22 ++++++---
drivers/s390/scsi/zfcp_erp.c | 7 +++
drivers/xen/swiotlb-xen.c | 4 +-
fs/adfs/super.c | 5 +-
fs/btrfs/send.c | 77 ++++++-----------------------
fs/btrfs/transaction.c | 10 ++++
fs/btrfs/volumes.c | 3 +-
fs/ceph/super.h | 7 ++-
fs/ceph/xattr.c | 14 +++---
fs/cifs/connect.c | 8 +--
fs/coda/psdev.c | 5 +-
include/linux/acpi.h | 5 +-
include/linux/coda.h | 3 +-
include/linux/coda_psdev.h | 11 +++++
include/uapi/linux/coda_psdev.h | 13 -----
ipc/mqueue.c | 19 +++----
kernel/module.c | 6 +--
kernel/trace/ftrace.c | 28 ++++++-----
mm/cma.c | 13 +++++
security/selinux/ss/policydb.c | 6 ++-
tools/objtool/elf.c | 2 +-
59 files changed, 254 insertions(+), 204 deletions(-)