This patch fixes a data race in commit 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure").
Here are call traces:
Call Trace 1: shmem_unused_huge_shrink+0x3ae/0x410 ? __list_lru_walk_one.isra.5+0x33/0x160 super_cache_scan+0x17c/0x190 shrink_slab.part.55+0x1ef/0x3f0 shrink_node+0x10e/0x330 kswapd+0x380/0x740 kthread+0xfc/0x130 ? mem_cgroup_shrink_node+0x170/0x170 ? kthread_create_on_node+0x70/0x70 ret_from_fork+0x1f/0x30
Call Trace 2: shmem_evict_inode+0xd8/0x190 evict+0xbe/0x1c0 do_unlinkat+0x137/0x330 do_syscall_64+0x76/0x120 entry_SYSCALL_64_after_hwframe+0x3d/0xa2
The simultaneous deletion of adjacent elements in the local list (@list) by shmem_unused_huge_shrink and shmem_evict_inode will break the list.
Imagine there are 3 items in the local list (@list). In the first traversal, A is not deleted from @list.
1)    A->B->C
      ^
      |
     pos (leave)
In the second traversal, B is deleted from @list. Concurrently, A is deleted from @list through shmem_evict_inode() because the last reference to the inode is dropped by another thread. Then @list is corrupted.
2)    A->B->C
      ^  ^
      |  |
   evict pos (drop)
Fix:
We should make sure the item is either on the global list or deleted from any local list before iput().
Fix this by moving inodes that are on @list and will not be deleted back to the global list before iput().
Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure")
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
---
Changes in v3: - Add more comments. - Use list_move(&info->shrinklist, &sbinfo->shrinklist) instead of list_move(pos, &sbinfo->shrinklist) for consistency.
Changes in v2: https://lore.kernel.org/all/20211124030840.88455-1-ligang.bdlg@bytedance.com... - Move spinlock to the front of iput instead of changing lock type since iput will call evict which may cause deadlock by requesting shrinklist_lock. - Add call trace in commit message.
v1: https://lore.kernel.org/lkml/20211122064126.76734-1-ligang.bdlg@bytedance.co...
--- mm/shmem.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index 9023103ee7d8..ab2df692bd58 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -569,7 +569,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; }
@@ -577,15 +576,16 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; }
list_move(&info->shrinklist, &list); next: + removed++; if (!--batch) break; } + sbinfo->shrinklist_len -= removed; spin_unlock(&sbinfo->shrinklist_lock);
list_for_each_safe(pos, next, &to_remove) { @@ -602,7 +602,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back;
page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); @@ -616,38 +616,43 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, }
/* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!trylock_page(page)) { put_page(page); - goto leave; + goto move_back; }
ret = split_huge_page(page); unlock_page(page); put_page(page);
- /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back;
split++; drop: list_del_init(&info->shrinklist); - removed++; -leave: + goto put; +move_back: + /* inodes that are on @list and will not be deleted must be moved back to + * global list before iput for two reasons: + * 1. iput in lock: iput call shmem_evict_inode, then cause deadlock. + * 2. iput before lock: shmem_evict_inode may grab the inode on @list, + * which will cause race. + */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: iput(inode); }
- spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; }
On Wed, Nov 24, 2021 at 05:43:16PM +0800, Gang Li wrote:
+move_back:
/* inodes that are on @list and will not be deleted must be moved back to
* global list before iput for two reasons:
* 1. iput in lock: iput call shmem_evict_inode, then cause deadlock.
* 2. iput before lock: shmem_evict_inode may grab the inode on @list,
* which will cause race.
*/
spin_lock(&sbinfo->shrinklist_lock);
list_move(&info->shrinklist, &sbinfo->shrinklist);
sbinfo->shrinklist_len++;
spin_unlock(&sbinfo->shrinklist_lock);
+put: iput(inode); }
- spin_lock(&sbinfo->shrinklist_lock);
- list_splice_tail(&list, &sbinfo->shrinklist);
- sbinfo->shrinklist_len -= removed;
- spin_unlock(&sbinfo->shrinklist_lock);
- return split;
}
Okay, I guess it works. Locking is not pretty, but well..
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
On Wed, Nov 24, 2021 at 5:43 PM Gang Li <ligang.bdlg@bytedance.com> wrote:
This patch fixes a data race in commit 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure").
Here are call traces:
Call Trace 1: shmem_unused_huge_shrink+0x3ae/0x410 ? __list_lru_walk_one.isra.5+0x33/0x160 super_cache_scan+0x17c/0x190 shrink_slab.part.55+0x1ef/0x3f0 shrink_node+0x10e/0x330 kswapd+0x380/0x740 kthread+0xfc/0x130 ? mem_cgroup_shrink_node+0x170/0x170 ? kthread_create_on_node+0x70/0x70 ret_from_fork+0x1f/0x30
Call Trace 2: shmem_evict_inode+0xd8/0x190 evict+0xbe/0x1c0 do_unlinkat+0x137/0x330 do_syscall_64+0x76/0x120 entry_SYSCALL_64_after_hwframe+0x3d/0xa2
The simultaneous deletion of adjacent elements in the local list (@list) by shmem_unused_huge_shrink and shmem_evict_inode will break the list.
Imagine there are 3 items in the local list (@list). In the first traversal, A is not deleted from @list.
1)    A->B->C
      ^
      |
     pos (leave)
In the second traversal, B is deleted from @list. Concurrently, A is deleted from @list through shmem_evict_inode() because the last reference to the inode is dropped by another thread. Then @list is corrupted.
2)    A->B->C
      ^  ^
      |  |
   evict pos (drop)
Fix:
We should make sure the item is either on the global list or deleted from any local list before iput().
Fix this by moving inodes that are on @list and will not be deleted back to the global list before iput().
Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure")
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Changes in v3:
- Add more comments.
- Use list_move(&info->shrinklist, &sbinfo->shrinklist) instead of list_move(pos, &sbinfo->shrinklist) for consistency.
Changes in v2: https://lore.kernel.org/all/20211124030840.88455-1-ligang.bdlg@bytedance.com...
- Move spinlock to the front of iput instead of changing lock type since iput will call evict which may cause deadlock by requesting shrinklist_lock.
- Add call trace in commit message.
v1: https://lore.kernel.org/lkml/20211122064126.76734-1-ligang.bdlg@bytedance.co...
mm/shmem.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index 9023103ee7d8..ab2df692bd58 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -569,7 +569,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist);
removed++; goto next; }
@@ -577,15 +576,16 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove);
removed++; goto next; } list_move(&info->shrinklist, &list);
next:
removed++; if (!--batch) break; }
sbinfo->shrinklist_len -= removed; spin_unlock(&sbinfo->shrinklist_lock); list_for_each_safe(pos, next, &to_remove) {
@@ -602,7 +602,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split)
goto leave;
goto move_back; page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
@@ -616,38 +616,43 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, }
/*
* Leave the inode on the list if we failed to lock
* the page at this time.
* Move the inode on the list back to shrinklist if we failed
* to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!trylock_page(page)) { put_page(page);
goto leave;
goto move_back; } ret = split_huge_page(page); unlock_page(page); put_page(page);
/* If split failed leave the inode on the list */
/* If split failed move the inode on the list back to shrinklist */ if (ret)
goto leave;
goto move_back; split++;
drop: list_del_init(&info->shrinklist);
removed++;
-leave:
goto put;
+move_back:
/* inodes that are on @list and will not be deleted must be moved back to
* global list before iput for two reasons:
* 1. iput in lock: iput call shmem_evict_inode, then cause deadlock.
* 2. iput before lock: shmem_evict_inode may grab the inode on @list,
* which will cause race.
*/
Multi-line comments should use the following format:
/*
 * Comment here.
 */
And I also suggest reworking the comments here. Something like:
/*
 * Make sure the inode is either on the global list or deleted from
 * any local list before iput() since it could be deleted in another
 * thread once we put the inode (then the local list is corrupted).
 */
With that.
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
spin_lock(&sbinfo->shrinklist_lock);
list_move(&info->shrinklist, &sbinfo->shrinklist);
sbinfo->shrinklist_len++;
spin_unlock(&sbinfo->shrinklist_lock);
+put: iput(inode); }
spin_lock(&sbinfo->shrinklist_lock);
list_splice_tail(&list, &sbinfo->shrinklist);
sbinfo->shrinklist_len -= removed;
spin_unlock(&sbinfo->shrinklist_lock);
return split;
}
-- 2.20.1
linux-stable-mirror@lists.linaro.org