shmem_unused_huge_shrink() gets called from the reclaim path. Waiting for the page lock there may lead to deadlock.
Replace lock_page() with trylock_page() and skip the page if we fail to lock it. We will get to the page on the next scan.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure")
Cc: stable@vger.kernel.org # v4.8+
---
 mm/shmem.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index 1907688b75ee..2afe809d4bd4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -498,31 +498,42 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 			continue;
 		}
 
-		page = find_lock_page(inode->i_mapping,
+		page = find_get_page(inode->i_mapping,
 				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
 		if (!page)
 			goto drop;
 
+		/* No huge page at the end of the file: nothing to split */
 		if (!PageTransHuge(page)) {
-			unlock_page(page);
 			put_page(page);
 			goto drop;
 		}
 
+		/*
+		 * Leave the inode on the list if we failed to lock
+		 * the page at this time.
+		 *
+		 * Waiting for the lock may lead to deadlock in the
+		 * reclaim path.
+		 */
+		if (!trylock_page(page)) {
+			put_page(page);
+			goto leave;
+		}
+
 		ret = split_huge_page(page);
 		unlock_page(page);
 		put_page(page);
 
-		if (ret) {
-			/* split failed: leave it on the list */
-			iput(inode);
-			continue;
-		}
+		/* If split failed leave the inode on the list */
+		if (ret)
+			goto leave;
 
 		split++;
 drop:
 		list_del_init(&info->shrinklist);
 		removed++;
+leave:
 		iput(inode);
 	}
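For reference, the interesting part of the per-inode loop body reads roughly as follows once the patch is applied (reconstructed from the hunks above; the unchanged nr_to_split check before it and the loop setup are omitted):

		page = find_get_page(inode->i_mapping,
				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
		if (!page)
			goto drop;

		/* No huge page at the end of the file: nothing to split */
		if (!PageTransHuge(page)) {
			put_page(page);
			goto drop;
		}

		/*
		 * Leave the inode on the list if we failed to lock
		 * the page at this time.
		 *
		 * Waiting for the lock may lead to deadlock in the
		 * reclaim path.
		 */
		if (!trylock_page(page)) {
			put_page(page);
			goto leave;
		}

		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);

		/* If split failed leave the inode on the list */
		if (ret)
			goto leave;

		split++;
drop:
		list_del_init(&info->shrinklist);
		removed++;
leave:
		iput(inode);

Every failure now takes one of two exits: "drop" removes the inode from the shrink list, "leave" keeps it there for the next scan; both paths end in iput().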
On Fri 16-03-18 13:59:08, Kirill A. Shutemov wrote: [..]
@@ -498,31 +498,42 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 			continue;
 		}
 
-		page = find_lock_page(inode->i_mapping,
+		page = find_get_page(inode->i_mapping,
 				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
 		if (!page)
 			goto drop;
 
+		/* No huge page at the end of the file: nothing to split */
 		if (!PageTransHuge(page)) {
-			unlock_page(page);
 			put_page(page);
 			goto drop;
 		}
 
+		/*
+		 * Leave the inode on the list if we failed to lock
+		 * the page at this time.
+		 *
+		 * Waiting for the lock may lead to deadlock in the
+		 * reclaim path.
+		 */
+		if (!trylock_page(page)) {
+			put_page(page);
+			goto leave;
+		}
Can somebody split the huge page after the PageTransHuge check and before we lock it?
 		ret = split_huge_page(page);
 		unlock_page(page);
 		put_page(page);
 
-		if (ret) {
-			/* split failed: leave it on the list */
-			iput(inode);
-			continue;
-		}
+		/* If split failed leave the inode on the list */
+		if (ret)
+			goto leave;
 
 		split++;
 drop:
 		list_del_init(&info->shrinklist);
 		removed++;
+leave:
 		iput(inode);
 	}
-- 
2.16.1
On Fri, Mar 16, 2018 at 01:13:03PM +0100, Michal Hocko wrote:
On Fri 16-03-18 13:59:08, Kirill A. Shutemov wrote: [..]
[...]
Can somebody split the huge page after the PageTransHuge check and before we lock it?
Nope. Pin on the page is enough to prevent split.
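The guarantee comes from the split path itself: it refuses to operate on a compound page whose reference count is higher than what its mappings and the expected page cache / swap cache pins account for, so an extra get_page() reference makes split_huge_page() fail with -EBUSY. A simplified sketch of that check (illustrative only, not the exact mainline code; can_split() and extra_pins are made-up names here):

	static bool can_split(struct page *page, int extra_pins)
	{
		/*
		 * Expected references: one per mapping, extra_pins for the
		 * page cache / swap cache entries, plus the caller's own
		 * reference.  Anything beyond that means someone else holds
		 * a pin, so refuse to split.
		 */
		return total_mapcount(page) == page_count(page) - extra_pins - 1;
	}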
On Fri 16-03-18 15:25:08, Kirill A. Shutemov wrote:
On Fri, Mar 16, 2018 at 01:13:03PM +0100, Michal Hocko wrote:
On Fri 16-03-18 13:59:08, Kirill A. Shutemov wrote: [..]
[...]
Can somebody split the huge page after the PageTransHuge check and before we lock it?
Nope. Pin on the page is enough to prevent split.
Good, I thought so but wasn't really 100% sure. Thanks for the clarification and feel free to add Acked-by: Michal Hocko mhocko@suse.com
Maybe you should stick Reported-by: Eric Wheeler linux-mm@lists.ewheeler.net and point to http://lkml.kernel.org/r/alpine.LRH.2.11.1801242349220.30642@mail.ewheeler.n... because that smells like a bug that this patch would be fixing.
On Fri, Mar 16, 2018 at 01:58:27PM +0100, Michal Hocko wrote:
On Fri 16-03-18 15:25:08, Kirill A. Shutemov wrote:
On Fri, Mar 16, 2018 at 01:13:03PM +0100, Michal Hocko wrote:
On Fri 16-03-18 13:59:08, Kirill A. Shutemov wrote: [..]
[...]
Can somebody split the huge page after the PageTransHuge check and before we lock it?
Nope. Pin on the page is enough to prevent split.
Good, I thought so but wasn't really 100% sure. Thanks for the clarification and feel free to add Acked-by: Michal Hocko mhocko@suse.com
Thanks.
Maybe you should stick Reported-by: Eric Wheeler linux-mm@lists.ewheeler.net and point to http://lkml.kernel.org/r/alpine.LRH.2.11.1801242349220.30642@mail.ewheeler.n... because that smells like a bug that this patch would be fixing.
Good point.
Andrew, do you want me repost with tags integrated?
f2fs is doing
page = f2fs_pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
which calls
pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
. Then, can't we define
static inline struct page *find_trylock_page(struct address_space *mapping,
					     pgoff_t offset)
{
	return pagecache_get_page(mapping, offset, FGP_LOCK|FGP_NOWAIT, 0);
}
and replace find_lock_page() with find_trylock_page() ?
Also, won't
----------
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 34ce3ebf..0cfc329 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -479,6 +479,8 @@ static inline int trylock_page(struct page *page)
 static inline void lock_page(struct page *page)
 {
 	might_sleep();
+	WARN_ONCE(current->flags & PF_MEMALLOC,
+		  "lock_page() from reclaim context might deadlock");
 	if (!trylock_page(page))
 		__lock_page(page);
 }
@@ -491,6 +493,8 @@ static inline void lock_page(struct page *page)
 static inline int lock_page_killable(struct page *page)
 {
 	might_sleep();
+	WARN_ONCE(current->flags & PF_MEMALLOC,
+		  "lock_page_killable() from reclaim context might deadlock");
 	if (!trylock_page(page))
 		return __lock_page_killable(page);
 	return 0;
----------
help find lock_page() users in deep reclaim paths?
----------
[ 100.314083] ------------[ cut here ]------------
[ 100.315695] lock_page() from reclaim context might deadlock
[ 100.315708] WARNING: CPU: 1 PID: 56 at ./include/linux/pagemap.h:483 pagecache_get_page+0x245/0x250
[ 100.319686] Modules linked in: sg pcspkr i2c_piix4 vmw_vmci shpchp sd_mod ata_generic pata_acpi serio_raw vmwgfx drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm ahci mptspi libahci scsi_transport_spi mptscsih ata_piix mptbase i2c_core e1000 libata ipv6
[ 100.325951] CPU: 1 PID: 56 Comm: kswapd0 Kdump: loaded Not tainted 4.16.0-rc5-next-20180315+ #696
[ 100.328439] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/19/2017
[ 100.331625] RIP: 0010:pagecache_get_page+0x245/0x250
[ 100.333211] RSP: 0018:ffffc9000085bc00 EFLAGS: 00010286
[ 100.334832] RAX: 0000000000000000 RBX: ffffea0004ad3100 RCX: 0000000000000007
[ 100.336900] RDX: 0000000000000b63 RSI: ffff88013aa0b700 RDI: ffff88013aa0ae80
[ 100.339068] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000
[ 100.341108] R10: 0000000000000040 R11: 0000000000000000 R12: ffff880139b6e0c8
[ 100.343153] R13: 0000000000000000 R14: ffffffff82068220 R15: 0000000000000002
[ 100.345242] FS:  0000000000000000(0000) GS:ffff88013bc40000(0000) knlGS:0000000000000000
[ 100.347510] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 100.349277] CR2: 00007f326c67e000 CR3: 000000000200f006 CR4: 00000000001606e0
[ 100.351343] Call Trace:
[ 100.352374]  ? iput+0x52/0x2f0
[ 100.353567]  shmem_unused_huge_shrink+0x2e9/0x380
[ 100.355112]  super_cache_scan+0x17a/0x180
[ 100.356553]  shrink_slab+0x218/0x590
[ 100.357854]  shrink_node+0x346/0x350
[ 100.359161]  kswapd+0x322/0x930
[ 100.360370]  kthread+0xf0/0x130
[ 100.361566]  ? mem_cgroup_shrink_node+0x320/0x320
[ 100.363112]  ? kthread_create_on_node+0x60/0x60
[ 100.364634]  ret_from_fork+0x3a/0x50
[ 100.365943] Code: db e8 70 4c 01 00 e9 5e fe ff ff 80 3d 44 51 f8 00 00 0f 85 46 ff ff ff 48 c7 c7 60 11 df 81 c6 05 30 51 f8 00 01 e8 5b 86 ee ff <0f> 0b e9 2c ff ff ff 0f 1f 40 00 83 e2 02 53 8b 8f 48 01 00 00
[ 100.371197] ---[ end trace b50eee6f891efec3 ]---
----------
On Fri 16-03-18 22:14:24, Tetsuo Handa wrote:
f2fs is doing
page = f2fs_pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
which calls
pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
. Then, can't we define
static inline struct page *find_trylock_page(struct address_space *mapping,
					     pgoff_t offset)
{
	return pagecache_get_page(mapping, offset, FGP_LOCK|FGP_NOWAIT, 0);
}
and replace find_lock_page() with find_trylock_page() ?
I haven't checked whether we have enough users of this pattern to create a helper.
Also, won't
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 34ce3ebf..0cfc329 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -479,6 +479,8 @@ static inline int trylock_page(struct page *page)
 static inline void lock_page(struct page *page)
 {
 	might_sleep();
+	WARN_ONCE(current->flags & PF_MEMALLOC,
+		  "lock_page() from reclaim context might deadlock");
 	if (!trylock_page(page))
 		__lock_page(page);
 }
help find lock_page() users in deep reclaim paths?
lock_page is called from many (semi)hot paths so I wouldn't add additional code there. Maybe we can hide it in VM_WARN. I would have to think much more to be sure this won't lead to some strange false positives. I suspect it won't but wouldn't bet my head on that.
In any case, you can try to send a patch and we can stick it into mmotm and have it there for a few cycles to see what falls out...
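A debug-only variant of the above might look roughly like this (just a sketch of the idea; it relies on VM_WARN_ONCE() from <linux/mmdebug.h>, which compiles away unless CONFIG_DEBUG_VM is set):

	static inline void lock_page(struct page *page)
	{
		might_sleep();
		/* Only complain on CONFIG_DEBUG_VM kernels. */
		VM_WARN_ONCE(current->flags & PF_MEMALLOC,
			     "lock_page() from reclaim context might deadlock");
		if (!trylock_page(page))
			__lock_page(page);
	}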
On Fri, Mar 16, 2018 at 10:14:24PM +0900, Tetsuo Handa wrote:
f2fs is doing
page = f2fs_pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
which calls
pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
. Then, can't we define
static inline struct page *find_trylock_page(struct address_space *mapping,
					     pgoff_t offset)
{
	return pagecache_get_page(mapping, offset, FGP_LOCK|FGP_NOWAIT, 0);
}
and replace find_lock_page() with find_trylock_page() ?
This won't work in this case. We need to distinguish no-page-in-page-cache from failed-to-lock-page. We take different routes depending on this.
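That is because pagecache_get_page() folds the two cases into the same return value: with FGP_LOCK|FGP_NOWAIT it returns NULL both when there is no page in the cache and when the trylock fails. Roughly (paraphrased from mm/filemap.c of that era, not a verbatim quote):

	if (page && (fgp_flags & FGP_LOCK)) {
		if (fgp_flags & FGP_NOWAIT) {
			if (!trylock_page(page)) {
				put_page(page);
				/* Caller cannot tell this from "no page". */
				return NULL;
			}
		} else {
			lock_page(page);
		}

		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
	}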
Kirill A. Shutemov wrote:
On Fri, Mar 16, 2018 at 10:14:24PM +0900, Tetsuo Handa wrote:
f2fs is doing
page = f2fs_pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
which calls
pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
. Then, can't we define
static inline struct page *find_trylock_page(struct address_space *mapping,
					     pgoff_t offset)
{
	return pagecache_get_page(mapping, offset, FGP_LOCK|FGP_NOWAIT, 0);
}
and replace find_lock_page() with find_trylock_page() ?
This won't work in this case. We need to distinguish no-page-in-page-cache from failed-to-lock-page. We take different routes depending on this.
OK. Then, I think we should avoid reordering trylock_page() and PageTransHuge() without explaining in the patch description why it is safe. The patch below preserves the ordering and sounds safer for stable. But with either patch, please explain in the patch description why it is safe to omit the "/* Has the page been truncated? */" check which would have been done for FGP_LOCK.
---
 mm/shmem.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 8ead6cb..5e94ca4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -493,16 +493,27 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 		info = list_entry(pos, struct shmem_inode_info, shrinklist);
 		inode = &info->vfs_inode;
 
-		if (nr_to_split && split >= nr_to_split) {
-			iput(inode);
-			continue;
-		}
+		if (nr_to_split && split >= nr_to_split)
+			goto leave;
 
-		page = find_lock_page(inode->i_mapping,
+		page = find_get_page(inode->i_mapping,
 				(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
 		if (!page)
 			goto drop;
 
+		/*
+		 * Leave the inode on the list if we failed to lock
+		 * the page at this time.
+		 *
+		 * Waiting for the lock may lead to deadlock in the
+		 * reclaim path.
+		 */
+		if (!trylock_page(page)) {
+			put_page(page);
+			goto leave;
+		}
+
+		/* No huge page at the end of the file: nothing to split */
 		if (!PageTransHuge(page)) {
 			unlock_page(page);
 			put_page(page);
@@ -513,16 +524,15 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
 		unlock_page(page);
 		put_page(page);
 
-		if (ret) {
-			/* split failed: leave it on the list */
-			iput(inode);
-			continue;
-		}
+		/* If split failed leave the inode on the list */
+		if (ret)
+			goto leave;
 
 		split++;
 drop:
 		list_del_init(&info->shrinklist);
 		removed++;
+leave:
 		iput(inode);
 	}