From: Jaewon Kim <jaewon31.kim(a)samsung.com>
Subject: mm/page_ext.c: check if page_ext is not prepared
online_page_ext() and page_ext_init() allocate page_ext for each section,
but they skip the allocation when the section's first PFN is
!pfn_present(pfn) or !pfn_valid(pfn), so section->page_ext remains NULL.
lookup_page_ext() checks for this NULL only when CONFIG_DEBUG_VM is
enabled. For a valid PFN in such a section, __set_page_owner() still tries
to get page_ext through lookup_page_ext(); without CONFIG_DEBUG_VM the NULL
base pointer is used as if it were address 0, which incurs an invalid
address access.
Below is an example panic where PFN 0x100000 is not valid but PFN 0x13FC00
is in use for page_ext: section->page_ext is NULL, and get_entry() returned
the invalid page_ext address 0x1DFA000 for PFN 0x13FC00.
To avoid this panic, remove the CONFIG_DEBUG_VM guard around the NULL check
so that page_ext is checked at all times.
<1>[ 11.618085] Unable to handle kernel paging request at virtual address 01dfa014
<1>[ 11.618140] pgd = ffffffc0c6dc9000
<1>[ 11.618174] [01dfa014] *pgd=0000000000000000, *pud=0000000000000000
<4>[ 11.618240] ------------[ cut here ]------------
<2>[ 11.618278] Kernel BUG at ffffff80082371e0 [verbose debug info unavailable]
<0>[ 11.618338] Internal error: Oops: 96000045 [#1] PREEMPT SMP
<4>[ 11.618381] Modules linked in:
<4>[ 11.618524] task: ffffffc0c6ec9180 task.stack: ffffffc0c6f40000
<4>[ 11.618569] PC is at __set_page_owner+0x48/0x78
<4>[ 11.618607] LR is at __set_page_owner+0x44/0x78
<4>[ 11.626025] [<ffffff80082371e0>] __set_page_owner+0x48/0x78
<4>[ 11.626071] [<ffffff80081df9f0>] get_page_from_freelist+0x880/0x8e8
<4>[ 11.626118] [<ffffff80081e00a4>] __alloc_pages_nodemask+0x14c/0xc48
<4>[ 11.626165] [<ffffff80081e610c>] __do_page_cache_readahead+0xdc/0x264
<4>[ 11.626214] [<ffffff80081d8824>] filemap_fault+0x2ac/0x550
<4>[ 11.626259] [<ffffff80082e5cf8>] ext4_filemap_fault+0x3c/0x58
<4>[ 11.626305] [<ffffff800820a2f8>] __do_fault+0x80/0x120
<4>[ 11.626347] [<ffffff800820eb4c>] handle_mm_fault+0x704/0xbb0
<4>[ 11.626393] [<ffffff800809ba70>] do_page_fault+0x2e8/0x394
<4>[ 11.626437] [<ffffff8008080be4>] do_mem_abort+0x88/0x124
Pre-4.7 kernels also need f86e427197 ("mm: check the return value of
lookup_page_ext for all call sites").
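For illustration, here is a minimal sketch (simplified, not the exact
mm/page_owner.c code) of how a lookup_page_ext() caller is expected to
handle the NULL return once the check is unconditional:

	/* illustrative sketch only -- simplified from mm/page_owner.c */
	void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
	{
		struct page_ext *page_ext = lookup_page_ext(page);

		if (unlikely(!page_ext))
			return;		/* page_ext not prepared for this section */

		/* ... record order, gfp_mask and the allocation stack ... */
	}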
Link: http://lkml.kernel.org/r/20171107094131.14621-1-jaewon31.kim@samsung.com
Fixes: eefa864b701d ("mm/page_ext: resurrect struct page extending code for debugging")
Signed-off-by: Jaewon Kim <jaewon31.kim(a)samsung.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Joonsoo Kim <js1304(a)gmail.com>
Cc: <stable(a)vger.kernel.org> [depends on f86e427197, see above]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_ext.c | 4 ----
1 file changed, 4 deletions(-)
diff -puN mm/page_ext.c~mm-page_ext-check-if-page_ext-is-not-prepared mm/page_ext.c
--- a/mm/page_ext.c~mm-page_ext-check-if-page_ext-is-not-prepared
+++ a/mm/page_ext.c
@@ -125,7 +125,6 @@ struct page_ext *lookup_page_ext(struct
struct page_ext *base;
base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#if defined(CONFIG_DEBUG_VM)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
@@ -134,7 +133,6 @@ struct page_ext *lookup_page_ext(struct
*/
if (unlikely(!base))
return NULL;
-#endif
index = pfn - round_down(node_start_pfn(page_to_nid(page)),
MAX_ORDER_NR_PAGES);
return get_entry(base, index);
@@ -199,7 +197,6 @@ struct page_ext *lookup_page_ext(struct
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-#if defined(CONFIG_DEBUG_VM)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
@@ -208,7 +205,6 @@ struct page_ext *lookup_page_ext(struct
*/
if (!section->page_ext)
return NULL;
-#endif
return get_entry(section->page_ext, pfn);
}
_
From: Pavel Tatashin <pasha.tatashin(a)oracle.com>
Subject: mm/page_alloc.c: broken deferred calculation
In reset_deferred_meminit() we determine the number of pages that must not
be deferred. We initialize pages for at least 2G of memory, but also the
pages for the reserved memory in this node.
The reserved memory is determined by memblock_reserved_memory_within(),
which operates on physical addresses and returns a size in bytes. However,
reset_deferred_meminit() assumes that this function operates on PFNs and
returns a page count.
The result is that in the best case the machine boots more slowly than
expected, because more pages than needed are initialized in a single
thread, and in the worst case it panics because fewer pages than needed are
initialized early.
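As an illustrative worked example (assuming 4K pages and made-up numbers):
node_start_pfn = 0x80000 corresponds to physical address 0x80000000, so
passing the raw PFN makes memblock_reserved_memory_within() scan a window
4096 times too low and reservations in the node can be missed entirely,
leaving too few pages initialized early. Conversely, when a reservation is
found, adding its size in bytes to a page count inflates the estimate by a
factor of PAGE_SIZE: a 256MB crashkernel reservation adds 268435456 instead
of 65536.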
Link: http://lkml.kernel.org/r/20171021011707.15191-1-pasha.tatashin@oracle.com
Fixes: 864b9a393dcb ("mm: consider memblock reservations for deferred memory initialization sizing")
Signed-off-by: Pavel Tatashin <pasha.tatashin(a)oracle.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mmzone.h | 3 ++-
mm/page_alloc.c | 27 ++++++++++++++++++---------
2 files changed, 20 insertions(+), 10 deletions(-)
diff -puN include/linux/mmzone.h~mm-broken-deferred-calculation include/linux/mmzone.h
--- a/include/linux/mmzone.h~mm-broken-deferred-calculation
+++ a/include/linux/mmzone.h
@@ -700,7 +700,8 @@ typedef struct pglist_data {
* is the first PFN that needs to be initialised.
*/
unsigned long first_deferred_pfn;
- unsigned long static_init_size;
+ /* Number of non-deferred pages */
+ unsigned long static_init_pgcnt;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff -puN mm/page_alloc.c~mm-broken-deferred-calculation mm/page_alloc.c
--- a/mm/page_alloc.c~mm-broken-deferred-calculation
+++ a/mm/page_alloc.c
@@ -291,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+/*
+ * Determine how many pages need to be initialized during early boot
+ * (non-deferred initialization).
+ * The value of first_deferred_pfn will be set later, once non-deferred pages
+ * are initialized, but for now set it to ULONG_MAX.
+ */
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
- unsigned long max_initialise;
- unsigned long reserved_lowmem;
+ phys_addr_t start_addr, end_addr;
+ unsigned long max_pgcnt;
+ unsigned long reserved;
/*
* Initialise at least 2G of a node but also take into account that
* two large system hashes that can take up 1GB for 0.25TB/node.
*/
- max_initialise = max(2UL << (30 - PAGE_SHIFT),
- (pgdat->node_spanned_pages >> 8));
+ max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
/*
* Compensate the all the memblock reservations (e.g. crash kernel)
* from the initial estimation to make sure we will initialize enough
* memory to boot.
*/
- reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
- pgdat->node_start_pfn + max_initialise);
- max_initialise += reserved_lowmem;
+ start_addr = PFN_PHYS(pgdat->node_start_pfn);
+ end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
+ reserved = memblock_reserved_memory_within(start_addr, end_addr);
+ max_pgcnt += PHYS_PFN(reserved);
- pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
+ pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
@@ -339,7 +348,7 @@ static inline bool update_defer_init(pg_
if (zone_end < pgdat_end_pfn(pgdat))
return true;
(*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_size) &&
+ if ((*nr_initialised > pgdat->static_init_pgcnt) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
_
From: Huang Ying <huang.ying.caritas(a)gmail.com>
Subject: mm, swap: fix false error message in __swp_swapcount()
When a page fault occurs on a swap entry, the physical swap readahead (not
the VMA-based swap readahead) may read ahead several swap entries after the
faulting one. The readahead algorithm calculates some of the entries to
read ahead by increasing the offset of the faulting swap entry without
checking whether they are beyond the end of the swap device, relying on
__swp_swapcount() and swapcache_prepare() to catch that. Although
__swp_swapcount() does check the swap entry passed in, it complains with
the following error message for these expected-invalid entries, which may
confuse end users:
swap_info_get: Bad swap offset entry 0200f8a7
To get rid of the false error message, add the swap entry check in
swapin_readahead() so that out-of-bounds swap entries and the swap entry
reserved for the swap header are never passed to __swp_swapcount() and
swapcache_prepare().
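As an illustrative example (made-up numbers): with a readahead window of 8
pages (mask = 7), a fault at offset 1026 on a swap device with si->max =
1029 gives start_offset = 1024 and end_offset = 1026 | 7 = 1031, so offsets
1029-1031 lie beyond the device and can trigger the complaint above;
clamping end_offset to si->max - 1 = 1028 keeps the whole readahead window
inside the device.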
Link: http://lkml.kernel.org/r/20171102054225.22897-1-ying.huang@intel.com
Fixes: e8c26ab60598 ("mm/swap: skip readahead for unreferenced swap slots")
Signed-off-by: "Huang, Ying" <ying.huang(a)intel.com>
Reported-by: Christian Kujau <lists(a)nerdbynature.de>
Acked-by: Minchan Kim <minchan(a)kernel.org>
Suggested-by: Minchan Kim <minchan(a)kernel.org>
Cc: Tim Chen <tim.c.chen(a)linux.intel.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: <stable(a)vger.kernel.org> [4.11+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/swap_state.c | 3 +++
1 file changed, 3 insertions(+)
diff -puN mm/swap_state.c~mm-swap-fix-false-error-message-in-__swp_swapcount mm/swap_state.c
--- a/mm/swap_state.c~mm-swap-fix-false-error-message-in-__swp_swapcount
+++ a/mm/swap_state.c
@@ -559,6 +559,7 @@ struct page *swapin_readahead(swp_entry_
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
unsigned long mask;
+ struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
bool do_poll = true, page_allocated;
@@ -572,6 +573,8 @@ struct page *swapin_readahead(swp_entry_
end_offset = offset | mask;
if (!start_offset) /* First page is swap header. */
start_offset++;
+ if (end_offset >= si->max)
+ end_offset = si->max - 1;
blk_start_plug(&plug);
for (offset = start_offset; offset <= end_offset ; offset++) {
_
From: alex chen <alex.chen(a)huawei.com>
Subject: ocfs2: should wait dio before inode lock in ocfs2_setattr()
We should wait for dio requests to finish before taking the inode lock in
ocfs2_setattr(), otherwise the following deadlock will happen:
        process 1                  process 2                     process 3
truncate file 'A'          end_io of writing file 'A'    receiving the bast messages
ocfs2_setattr
 ocfs2_inode_lock_tracker
  ocfs2_inode_lock_full
 inode_dio_wait
  __inode_dio_wait
  -->waiting for all dio
     requests finish
                                                         dlm_proxy_ast_handler
                                                          dlm_do_local_bast
                                                           ocfs2_blocking_ast
                                                            ocfs2_generic_handle_bast
                                                             set OCFS2_LOCK_BLOCKED flag
                           dio_end_io
                            dio_bio_end_aio
                             dio_complete
                              ocfs2_dio_end_io
                               ocfs2_dio_end_io_write
                                ocfs2_inode_lock
                                 __ocfs2_cluster_lock
                                  ocfs2_wait_for_mask
                                  -->waiting for OCFS2_LOCK_BLOCKED
                                     flag to be cleared, that is waiting
                                     for 'process 1' unlocking the inode lock
                           inode_dio_end
                           -->here dec the i_dio_count, but will never
                              be called, so a deadlock happened.
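In other words, for a size change the ordering becomes (simplified sketch;
the exact code is in the diff below):

	/* ocfs2_setattr(), size change path -- simplified sketch */
	inode_dio_wait(inode);			/* drain in-flight dio first */
	status = ocfs2_rw_lock(inode, 1);	/* only then take the cluster locks */
	...
	ocfs2_inode_lock_tracker(inode, ...);

so that ocfs2_dio_end_io_write() can still take the inode lock and drop
i_dio_count while we wait.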
Link: http://lkml.kernel.org/r/59F81636.70508@huawei.com
Signed-off-by: Alex Chen <alex.chen(a)huawei.com>
Reviewed-by: Jun Piao <piaojun(a)huawei.com>
Reviewed-by: Joseph Qi <jiangqi903(a)gmail.com>
Acked-by: Changwei Ge <ge.changwei(a)h3c.com>
Cc: Mark Fasheh <mfasheh(a)versity.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/file.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff -puN fs/ocfs2/file.c~ocfs2-should-wait-dio-before-inode-lock-in-ocfs2_setattr fs/ocfs2/file.c
--- a/fs/ocfs2/file.c~ocfs2-should-wait-dio-before-inode-lock-in-ocfs2_setattr
+++ a/fs/ocfs2/file.c
@@ -1161,6 +1161,13 @@ int ocfs2_setattr(struct dentry *dentry,
}
size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
if (size_change) {
+ /*
+ * Here we should wait dio to finish before inode lock
+ * to avoid a deadlock between ocfs2_setattr() and
+ * ocfs2_dio_end_io_write()
+ */
+ inode_dio_wait(inode);
+
status = ocfs2_rw_lock(inode, 1);
if (status < 0) {
mlog_errno(status);
@@ -1200,8 +1207,6 @@ int ocfs2_setattr(struct dentry *dentry,
if (status)
goto bail_unlock;
- inode_dio_wait(inode);
-
if (i_size_read(inode) >= attr->ia_size) {
if (ocfs2_should_order_data(inode)) {
status = ocfs2_begin_ordered_truncate(inode,
_
From: Changwei Ge <ge.changwei(a)h3c.com>
Subject: ocfs2: fix cluster hang after a node dies
When a node dies, other live nodes have to choose a new master for an
existing lock resource mastered by the dead node.
In the ocfs2/dlm implementation, this is done by
dlm_move_lockres_to_recovery_list(), which marks those lock resources as
DLM_LOCK_RES_RECOVERING and manages them via a list from which the DLM
later changes the lock resource's master.
So without invoking dlm_move_lockres_to_recovery_list(), no master will be
chosen after dlm recovery completes, since no lock resource can be found
through the ::resource list.
What's worse is that if DLM_LOCK_RES_RECOVERING is not set on lock
resources mastered by a dead node, it will break synchronization among
nodes.
So invoke dlm_move_lockres_to_recovery_list again.
Fixes: ee8f7fcbe638 ("ocfs2/dlm: continue to purge recovery lockres when recovery master goes down")
Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373CED6E0F9@H3CMLB14-…
Signed-off-by: Changwei Ge <ge.changwei(a)h3c.com>
Reported-by: Vitaly Mayatskih <v.mayatskih(a)gmail.com>
Tested-by: Vitaly Mayatskikh <v.mayatskih(a)gmail.com>
Cc: Mark Fasheh <mfasheh(a)versity.com>
Cc: Joel Becker <jlbec(a)evilplan.org>
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Joseph Qi <jiangqi903(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/ocfs2/dlm/dlmrecovery.c | 1 +
1 file changed, 1 insertion(+)
diff -puN fs/ocfs2/dlm/dlmrecovery.c~ocfs2-fix-cluster-hang-after-a-node-dies fs/ocfs2/dlm/dlmrecovery.c
--- a/fs/ocfs2/dlm/dlmrecovery.c~ocfs2-fix-cluster-hang-after-a-node-dies
+++ a/fs/ocfs2/dlm/dlmrecovery.c
@@ -2419,6 +2419,7 @@ static void dlm_do_local_recovery_cleanu
dlm_lockres_put(res);
continue;
}
+ dlm_move_lockres_to_recovery_list(dlm, res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
_
Please can you queue up the following for stable:
df80cd9b28b9 sctp: do not peel off an assoc from one netns to another one
Ben.
--
Ben Hutchings
Software Developer, Codethink Ltd.
The patch titled
Subject: mm, swap: fix false error message in __swp_swapcount()
has been removed from the -mm tree. Its filename was
mm-swap-skip-swapcache-for-swapin-of-synchronous-device-fix.patch
This patch was dropped because it was folded into mm-swap-skip-swapcache-for-swapin-of-synchronous-device.patch
------------------------------------------------------
Patches currently in -mm which might be from huang.ying.caritas(a)gmail.com are
mm-swap-skip-swapcache-for-swapin-of-synchronous-device.patch
The default max_cache_size_bytes for dm-bufio is meant to be the lesser
of 25% of the size of the vmalloc area and 2% of the size of lowmem.
However, on 32-bit systems the intermediate result in the expression
(VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100
overflows, causing the wrong result to be computed. For example, on a
32-bit system where the vmalloc area is 520093696 bytes, the result is
1174405 rather than the expected 130023424, which makes the maximum
cache size much too small (far less than 2% of lowmem). This causes
severe performance problems for dm-verity users on affected systems.
Fix this by using mult_frac() to correctly multiply by a percentage. Do
this for all places in dm-bufio that multiply by a percentage. Also
replace (VMALLOC_END - VMALLOC_START) with VMALLOC_TOTAL, which contrary
to the comment is now defined in include/linux/vmalloc.h.
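For illustration with the numbers above (32-bit unsigned long):
520093696 * 25 = 13002342400, which wraps modulo 2^32 to 117440512, and
117440512 / 100 = 1174405 -- the bogus limit. mult_frac() divides before
multiplying: (520093696 / 100) * 25 + ((520093696 % 100) * 25) / 100 =
5200936 * 25 + (96 * 25) / 100 = 130023400 + 24 = 130023424, which is the
intended 25% of the vmalloc area.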
Fixes: 95d402f057f2 ("dm: add bufio")
Cc: <stable(a)vger.kernel.org> # v3.2+
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
drivers/md/dm-bufio.c | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 33bb074d6941..b8ac591aaaa7 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -974,7 +974,8 @@ static void __get_memory_limit(struct dm_bufio_client *c,
buffers = c->minimum_buffers;
*limit_buffers = buffers;
- *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
+ *threshold_buffers = mult_frac(buffers,
+ DM_BUFIO_WRITEBACK_PERCENT, 100);
}
/*
@@ -1910,19 +1911,15 @@ static int __init dm_bufio_init(void)
memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
- mem = (__u64)((totalram_pages - totalhigh_pages) *
- DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;
+ mem = (__u64)mult_frac(totalram_pages - totalhigh_pages,
+ DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
if (mem > ULONG_MAX)
mem = ULONG_MAX;
#ifdef CONFIG_MMU
- /*
- * Get the size of vmalloc space the same way as VMALLOC_TOTAL
- * in fs/proc/internal.h
- */
- if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
- mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
+ if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
+ mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
#endif
dm_bufio_default_cache_size = mem;
--
2.15.0.448.gf294e3d99a-goog