The patch titled
Subject: mm/z3fold.c: lock z3fold page before __SetPageMovable()
has been removed from the -mm tree. Its filename was
mm-z3foldc-lock-z3fold-page-before-__setpagemovable.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Henry Burns <henryburns(a)google.com>
Subject: mm/z3fold.c: lock z3fold page before __SetPageMovable()
Following zsmalloc.c's example, we call trylock_page() and unlock_page().
Also make z3fold_page_migrate() assert that newpage is passed in locked,
as per the documentation.
[akpm(a)linux-foundation.org: fix trylock_page return value test, per Shakeel]
Link: http://lkml.kernel.org/r/20190702005122.41036-1-henryburns@google.com
Link: http://lkml.kernel.org/r/20190702233538.52793-1-henryburns@google.com
Signed-off-by: Henry Burns <henryburns(a)google.com>
Suggested-by: Vitaly Wool <vitalywool(a)gmail.com>
Acked-by: Vitaly Wool <vitalywool(a)gmail.com>
Acked-by: David Rientjes <rientjes(a)google.com>
Reviewed-by: Shakeel Butt <shakeelb(a)google.com>
Cc: Vitaly Vul <vitaly.vul(a)sony.com>
Cc: Mike Rapoport <rppt(a)linux.vnet.ibm.com>
Cc: Xidong Wang <wangxidong_97(a)163.com>
Cc: Jonathan Adams <jwadams(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/z3fold.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
--- a/mm/z3fold.c~mm-z3foldc-lock-z3fold-page-before-__setpagemovable
+++ a/mm/z3fold.c
@@ -924,7 +924,16 @@ retry:
set_bit(PAGE_HEADLESS, &page->private);
goto headless;
}
- __SetPageMovable(page, pool->inode->i_mapping);
+ if (can_sleep) {
+ lock_page(page);
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
+ } else {
+ if (trylock_page(page)) {
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
+ }
+ }
z3fold_page_lock(zhdr);
found:
@@ -1331,6 +1340,7 @@ static int z3fold_page_migrate(struct ad
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
zhdr = page_address(page);
pool = zhdr_to_pool(zhdr);
_
Patches currently in -mm which might be from henryburns(a)google.com are
mm-z3foldc-remove-z3fold_migration-trylock.patch
The patch titled
Subject: mm/memcontrol: fix wrong statistics in memory.stat
has been removed from the -mm tree. Its filename was
mm-memcontrol-fix-wrong-statistics-in-memorystat.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Yafang Shao <laoar.shao(a)gmail.com>
Subject: mm/memcontrol: fix wrong statistics in memory.stat
When we calculate total statistics for memcg1_stats and memcg1_events, we
use the index 'i' in the for loop as the events index. Actually we should
use memcg1_stats[i] and memcg1_events[i] as the events index.
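As an illustration only (the arrays and values below are made up; they are
not the kernel's memcg tables), a minimal userspace sketch of the indexing
mistake: the loop index selects a position in the reporting table, and that
table entry, not the loop index itself, is the key into the counter space.
---8<---
#include <stdio.h>

enum vm_item { ITEM_A, ITEM_B, ITEM_C, ITEM_D, NR_ITEMS };

/* hypothetical per-item counters, indexed by enum vm_item */
static const unsigned long counters[NR_ITEMS] = { 10, 20, 30, 40 };

/* the subset (and order) the v1 interface reports, like memcg1_stats[] */
static const enum vm_item subset[] = { ITEM_C, ITEM_A };

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(subset) / sizeof(subset[0]); i++) {
		/* buggy: uses the loop index directly -> reads ITEM_A, ITEM_B */
		printf("wrong: %lu\n", counters[i]);
		/* fixed: translate through the table -> reads ITEM_C, ITEM_A */
		printf("right: %lu\n", counters[subset[i]]);
	}
	return 0;
}
---8<---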
Link: http://lkml.kernel.org/r/1562116978-19539-1-git-send-email-laoar.shao@gmail…
Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty")
Signed-off-by: Yafang Shao <laoar.shao(a)gmail.com>
Reviewed-by: Shakeel Butt <shakeelb(a)google.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Yafang Shao <shaoyafang(a)didiglobal.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memcontrol.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/mm/memcontrol.c~mm-memcontrol-fix-wrong-statistics-in-memorystat
+++ a/mm/memcontrol.c
@@ -3523,12 +3523,13 @@ static int memcg_stat_show(struct seq_fi
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)memcg_page_state(memcg, i) * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, memcg1_stats[i]) *
+ PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
- (u64)memcg_events(memcg, i));
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
_
Patches currently in -mm which might be from laoar.shao(a)gmail.com are
mm-vmscan-expose-cgroup_ino-for-memcg-reclaim-tracepoints.patch
mm-memcontrol-keep-local-vm-counters-in-sync-with-the-hierarchical-ones.patch
mm-vmscan-add-a-new-member-reclaim_state-in-struct-shrink_control.patch
mm-vmscan-add-a-new-member-reclaim_state-in-struct-shrink_control-fix.patch
mm-vmscan-calculate-reclaimed-slab-caches-in-all-reclaim-paths.patch
The patch titled
Subject: mm/nvdimm: add is_ioremap_addr and use that to check ioremap address
has been removed from the -mm tree. Its filename was
mm-nvdimm-add-is_ioremap_addr-and-use-that-to-check-ioremap-address.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: "Aneesh Kumar K.V" <aneesh.kumar(a)linux.ibm.com>
Subject: mm/nvdimm: add is_ioremap_addr and use that to check ioremap address
Architectures like powerpc use different address ranges to map the ioremap
and vmalloc areas. The memunmap() check used by the nvdimm layer was
wrongly using is_vmalloc_addr() to check for the ioremap range, which fails
for ppc64. As a result, ppc64 does not free the ioremap mapping. The side
effect of this is an unbind failure during module unload with the papr_scm
nvdimm driver.
Link: http://lkml.kernel.org/r/20190701134038.14165-1-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
Fixes: b5beae5e224f ("powerpc/pseries: Add driver for PAPR SCM regions")
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/powerpc/include/asm/pgtable.h | 14 ++++++++++++++
include/linux/mm.h | 5 +++++
kernel/iomem.c | 2 +-
3 files changed, 20 insertions(+), 1 deletion(-)
--- a/arch/powerpc/include/asm/pgtable.h~mm-nvdimm-add-is_ioremap_addr-and-use-that-to-check-ioremap-address
+++ a/arch/powerpc/include/asm/pgtable.h
@@ -140,6 +140,20 @@ static inline void pte_frag_set(mm_conte
}
#endif
+#ifdef CONFIG_PPC64
+#define is_ioremap_addr is_ioremap_addr
+static inline bool is_ioremap_addr(const void *x)
+{
+#ifdef CONFIG_MMU
+ unsigned long addr = (unsigned long)x;
+
+ return addr >= IOREMAP_BASE && addr < IOREMAP_END;
+#else
+ return false;
+#endif
+}
+#endif /* CONFIG_PPC64 */
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_POWERPC_PGTABLE_H */
--- a/include/linux/mm.h~mm-nvdimm-add-is_ioremap_addr-and-use-that-to-check-ioremap-address
+++ a/include/linux/mm.h
@@ -633,6 +633,11 @@ static inline bool is_vmalloc_addr(const
return false;
#endif
}
+
+#ifndef is_ioremap_addr
+#define is_ioremap_addr(x) is_vmalloc_addr(x)
+#endif
+
#ifdef CONFIG_MMU
extern int is_vmalloc_or_module_addr(const void *x);
#else
--- a/kernel/iomem.c~mm-nvdimm-add-is_ioremap_addr-and-use-that-to-check-ioremap-address
+++ a/kernel/iomem.c
@@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap);
void memunmap(void *addr)
{
- if (is_vmalloc_addr(addr))
+ if (is_ioremap_addr(addr))
iounmap((void __iomem *) addr);
}
EXPORT_SYMBOL(memunmap);
_
Patches currently in -mm which might be from aneesh.kumar(a)linux.ibm.com are
mm-move-map_sync-to-asm-generic-mman-commonh.patch
mm-mmap-move-common-defines-to-mman-commonh.patch
The patch titled
Subject: mm: vmscan: scan anonymous pages on file refaults
has been removed from the -mm tree. Its filename was
mm-vmscan-scan-anonymous-pages-on-file-refaults.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Kuo-Hsin Yang <vovoy(a)chromium.org>
Subject: mm: vmscan: scan anonymous pages on file refaults
When file refaults are detected and there are many inactive file pages,
the system never reclaims anonymous pages and keeps dropping file pages
aggressively even though there are still a lot of cold anonymous pages, so
the system thrashes. This issue impacts the performance of applications
with large executables, e.g. chrome.
With this patch, when file refault is detected, inactive_list_is_low()
always returns true for file pages in get_scan_count() to enable scanning
anonymous pages.
The problem can be reproduced by the following test program.
---8<---
void fallocate_file(const char *filename, off_t size)
{
struct stat st;
int fd;
if (!stat(filename, &st) && st.st_size >= size)
return;
fd = open(filename, O_WRONLY | O_CREAT, 0600);
if (fd < 0) {
perror("create file");
exit(1);
}
if (posix_fallocate(fd, 0, size)) {
perror("fallocate");
exit(1);
}
close(fd);
}
long *alloc_anon(long size)
{
long *start = malloc(size);
memset(start, 1, size);
return start;
}
long access_file(const char *filename, long size, long rounds)
{
int fd, i;
volatile char *start1, *end1, *start2;
const int page_size = getpagesize();
long sum = 0;
fd = open(filename, O_RDONLY);
if (fd == -1) {
perror("open");
exit(1);
}
/*
* Some applications, e.g. chrome, use a lot of executable file
* pages, map some of the pages with PROT_EXEC flag to simulate
* the behavior.
*/
start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED,
fd, 0);
if (start1 == MAP_FAILED) {
perror("mmap");
exit(1);
}
end1 = start1 + size / 2;
start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2);
if (start2 == MAP_FAILED) {
perror("mmap");
exit(1);
}
for (i = 0; i < rounds; ++i) {
struct timeval before, after;
volatile char *ptr1 = start1, *ptr2 = start2;
gettimeofday(&before, NULL);
for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
sum += *ptr1 + *ptr2;
gettimeofday(&after, NULL);
printf("File access time, round %d: %f (sec)
", i,
(after.tv_sec - before.tv_sec) +
(after.tv_usec - before.tv_usec) / 1000000.0);
}
return sum;
}
int main(int argc, char *argv[])
{
const long MB = 1024 * 1024;
long anon_mb, file_mb, file_rounds;
const char filename[] = "large";
long *ret1;
long ret2;
if (argc != 4) {
printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS
");
exit(0);
}
anon_mb = atoi(argv[1]);
file_mb = atoi(argv[2]);
file_rounds = atoi(argv[3]);
fallocate_file(filename, file_mb * MB);
printf("Allocate %ld MB anonymous pages
", anon_mb);
ret1 = alloc_anon(anon_mb * MB);
printf("Access %ld MB file pages
", file_mb);
ret2 = access_file(filename, file_mb * MB, file_rounds);
printf("Print result to prevent optimization: %ld
",
*ret1 + ret2);
return 0;
}
---8<---
Running the test program on a 2GB RAM VM with kernel 5.2.0-rc5, the program
fills RAM with 2048 MB of anonymous memory and accesses a 200 MB file 10
times. Without this patch, the file cache is dropped aggressively and every
access to the file is served from disk.
$ ./thrash 2048 200 10
Allocate 2048 MB anonymous pages
Access 200 MB file pages
File access time, round 0: 2.489316 (sec)
File access time, round 1: 2.581277 (sec)
File access time, round 2: 2.487624 (sec)
File access time, round 3: 2.449100 (sec)
File access time, round 4: 2.420423 (sec)
File access time, round 5: 2.343411 (sec)
File access time, round 6: 2.454833 (sec)
File access time, round 7: 2.483398 (sec)
File access time, round 8: 2.572701 (sec)
File access time, round 9: 2.493014 (sec)
With this patch, these file pages can be cached.
$ ./thrash 2048 200 10
Allocate 2048 MB anonymous pages
Access 200 MB file pages
File access time, round 0: 2.475189 (sec)
File access time, round 1: 2.440777 (sec)
File access time, round 2: 2.411671 (sec)
File access time, round 3: 1.955267 (sec)
File access time, round 4: 0.029924 (sec)
File access time, round 5: 0.000808 (sec)
File access time, round 6: 0.000771 (sec)
File access time, round 7: 0.000746 (sec)
File access time, round 8: 0.000738 (sec)
File access time, round 9: 0.000747 (sec)
Checking the swap-out stats during the test [1]: 19006 pages are swapped
out with this patch, versus 3418 pages without it. There is more swap-out,
but I think it is within a reasonable range when the file-backed data set
doesn't fit into memory.
$ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS PROCESSES
Allocate 2000 MB anonymous pages
active_anon: 1613644, inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB)
pswpout: 7972443, pgpgin: 478615246
Access 100 MB executable file pages
Access 2100 MB regular file pages
File access time, round 0: 12.165, (sec)
active_anon: 1433788, inactive_anon: 478116, active_file: 17896, inactive_file: 24328 (kB)
File access time, round 1: 11.493, (sec)
active_anon: 1430576, inactive_anon: 477144, active_file: 25440, inactive_file: 26172 (kB)
File access time, round 2: 11.455, (sec)
active_anon: 1427436, inactive_anon: 476060, active_file: 21112, inactive_file: 28808 (kB)
File access time, round 3: 11.454, (sec)
active_anon: 1420444, inactive_anon: 473632, active_file: 23216, inactive_file: 35036 (kB)
File access time, round 4: 11.479, (sec)
active_anon: 1413964, inactive_anon: 471460, active_file: 31728, inactive_file: 32224 (kB)
pswpout: 7991449 (+ 19006), pgpgin: 489924366 (+ 11309120)
With 4 processes accessing non-overlapping parts of a large file, 30316
pages swapped out with this patch, 5152 pages swapped out without this
patch. The swapout number is small compared to pgpgin.
[1]: https://github.com/vovo/testing/blob/master/mem_thrash.c
Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com
Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty")
Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty")
Signed-off-by: Kuo-Hsin Yang <vovoy(a)chromium.org>
Acked-by: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Sonny Rao <sonnyrao(a)chromium.org>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Rik van Riel <riel(a)redhat.com>
Cc: Vladimir Davydov <vdavydov.dev(a)gmail.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: <stable(a)vger.kernel.org> [4.12+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmscan.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
--- a/mm/vmscan.c~mm-vmscan-scan-anonymous-pages-on-file-refaults
+++ a/mm/vmscan.c
@@ -2125,7 +2125,7 @@ static void shrink_active_list(unsigned
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct scan_control *sc, bool actual_reclaim)
+ struct scan_control *sc, bool trace)
{
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2151,7 +2151,7 @@ static bool inactive_list_is_low(struct
* rid of the stale workingset quickly.
*/
refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
- if (file && actual_reclaim && lruvec->refaults != refaults) {
+ if (file && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2161,7 +2161,7 @@ static bool inactive_list_is_low(struct
inactive_ratio = 1;
}
- if (actual_reclaim)
+ if (trace)
trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
_
Patches currently in -mm which might be from vovoy(a)chromium.org are
Hole punching currently evicts pages from the page cache and then goes on
to remove blocks from the inode. This happens under both XFS_IOLOCK_EXCL
and XFS_MMAPLOCK_EXCL, which provides appropriate serialization with
racing reads or page faults. However, there is currently nothing that
prevents readahead triggered by fadvise() or madvise() from racing with
the hole punch and instantiating a page cache page after hole punching has
evicted the page cache in xfs_flush_unmap_range() but before it has removed
blocks from the inode. This page cache page will then map a soon-to-be-freed
block, which can lead to returning stale data to userspace or even to
filesystem corruption.
Fix the problem by protecting the handling of readahead requests with
XFS_IOLOCK_SHARED, similarly to how we protect reads.
CC: stable(a)vger.kernel.org
Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxjQNmxqmtA_VbYW0Su9rKRk2zobJmah…
Reported-by: Amir Goldstein <amir73il(a)gmail.com>
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/xfs/xfs_file.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 76748255f843..88fe3dbb3ba2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -33,6 +33,7 @@
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
+#include <linux/fadvise.h>
static const struct vm_operations_struct xfs_file_vm_ops;
@@ -939,6 +940,24 @@ xfs_file_fallocate(
return error;
}
+STATIC int
+xfs_file_fadvise(
+ struct file *file,
+ loff_t start,
+ loff_t end,
+ int advice)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ int ret;
+
+ /* Readahead needs protection from hole punching and similar ops */
+ if (advice == POSIX_FADV_WILLNEED)
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = generic_fadvise(file, start, end, advice);
+ if (advice == POSIX_FADV_WILLNEED)
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ return ret;
+}
STATIC loff_t
xfs_file_remap_range(
@@ -1235,6 +1254,7 @@ const struct file_operations xfs_file_operations = {
.fsync = xfs_file_fsync,
.get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
+ .fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
};
--
2.16.4
In Linux 4.4, a 32-bit process may fail to allocate a 64M hugepage memory
segment via shmat() even though there is a 64M memory gap in the process
address space.
The adjusted search length causes the problem; it was introduced by commit
db4fbfb9523c935 ("mm: vm_unmapped_area() lookup function"). To account for
the worst-case alignment overhead, unmapped_area() and
unmapped_area_topdown() adjust the search length before searching for an
available VMA gap. This is an estimated length, the sum of the desired
length and the longest alignment offset, which can cause misjudgement when
the system has very little virtual memory left. For example, if the longest
available memory gap is 64M, we can't obtain it by allocating 64M of
hugepage memory via shmat(), because the search requires a longer length:
the sum of the desired length (64M) and the longest alignment offset.
To fix this error, we can calculate the alignment offset of gap_start or
gap_end to get the desired gap_start or gap_end value before searching for
an available gap. This way, we don't need to adjust the search length.
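For illustration (the 2M alignment mask and zero align_offset below are
assumed values, not taken from the real hugetlb/shmat path), here is a small
userspace calculation showing why the worst-case length adjustment rejects
the 64M gap from the maps in the reproduction below, while aligning the gap
start directly accepts it:
---8<---
#include <stdio.h>

int main(void)
{
	unsigned long gap_start    = 0xf7a00000UL;      /* from the example maps */
	unsigned long gap_end      = 0xfba00000UL;      /* a 64M gap */
	unsigned long length       = 64UL << 20;        /* desired size: 64M */
	unsigned long align_mask   = (2UL << 20) - 1;   /* assumed 2M alignment */
	unsigned long align_offset = 0;                 /* assumed */

	/* old scheme: inflate the search length by the worst-case overhead */
	unsigned long adj_length = length + align_mask;
	printf("old: need %#lx but gap is %#lx -> %s\n",
	       adj_length, gap_end - gap_start,
	       adj_length > gap_end - gap_start ? "ENOMEM" : "ok");

	/* new scheme: align the gap start first, then test the real length */
	unsigned long aligned_start = gap_start +
			((align_offset - gap_start) & align_mask);
	printf("new: aligned start %#lx, end %#lx -> %s\n",
	       aligned_start, aligned_start + length,
	       aligned_start + length <= gap_end ? "fits" : "ENOMEM");
	return 0;
}
---8<---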
Problem reproduces procedure:
1. allocate a lot of virtual memory segments via shmat and malloc
2. release one of the biggest memory segment via shmdt
3. attach the biggest memory segment via shmat
e.g.
process maps:
00008000-00009000 r-xp 00000000 00:12 3385 /tmp/memory_mmap
00011000-00012000 rw-p 00001000 00:12 3385 /tmp/memory_mmap
27536000-f756a000 rw-p 00000000 00:00 0
f756a000-f7691000 r-xp 00000000 01:00 560 /lib/libc-2.11.1.so
f7691000-f7699000 ---p 00127000 01:00 560 /lib/libc-2.11.1.so
f7699000-f769b000 r--p 00127000 01:00 560 /lib/libc-2.11.1.so
f769b000-f769c000 rw-p 00129000 01:00 560 /lib/libc-2.11.1.so
f769c000-f769f000 rw-p 00000000 00:00 0
f769f000-f76c0000 r-xp 00000000 01:00 583 /lib/libgcc_s.so.1
f76c0000-f76c7000 ---p 00021000 01:00 583 /lib/libgcc_s.so.1
f76c7000-f76c8000 rw-p 00020000 01:00 583 /lib/libgcc_s.so.1
f76c8000-f76e5000 r-xp 00000000 01:00 543 /lib/ld-2.11.1.so
f76e9000-f76ea000 rw-p 00000000 00:00 0
f76ea000-f76ec000 rw-p 00000000 00:00 0
f76ec000-f76ed000 r--p 0001c000 01:00 543 /lib/ld-2.11.1.so
f76ed000-f76ee000 rw-p 0001d000 01:00 543 /lib/ld-2.11.1.so
f7800000-f7a00000 rw-s 00000000 00:0e 0 /SYSV000000ea (deleted)
fba00000-fca00000 rw-s 00000000 00:0e 65538 /SYSV000000ec (deleted)
fca00000-fce00000 rw-s 00000000 00:0e 98307 /SYSV000000ed (deleted)
fce00000-fd800000 rw-s 00000000 00:0e 131076 /SYSV000000ee (deleted)
ff913000-ff934000 rw-p 00000000 00:00 0 [stack]
ffff0000-ffff1000 r-xp 00000000 00:00 0 [vectors]
From 0xf7a00000 to 0xfba00000 there is a 64M memory gap, but we can't get
it from the kernel.
Signed-off-by: jianhong chen <chenjianhong2(a)huawei.com>
Cc: stable(a)vger.kernel.org
---
mm/mmap.c | 43 +++++++++++++++++++++++++++++--------------
1 file changed, 29 insertions(+), 14 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index bd7b9f2..c5a5782 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1865,6 +1865,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return error;
}
+static inline unsigned long gap_start_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_start offset to adjust gap address to the
+ * desired alignment
+ */
+ return (info->align_offset - addr) & info->align_mask;
+}
+
+static inline unsigned long gap_end_offset(struct vm_unmapped_area_info *info,
+ unsigned long addr)
+{
+ /* get gap_end offset to adjust gap address to the desired alignment */
+ return (addr - info->align_offset) & info->align_mask;
+}
+
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
/*
@@ -1879,10 +1895,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/* Adjust search limits by the desired length */
if (info->high_limit < length)
@@ -1914,6 +1927,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
}
gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
+ gap_start += gap_start_offset(info, gap_start);
check_current:
/* Check if current node has a suitable gap */
if (gap_start > high_limit)
@@ -1942,6 +1956,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
struct vm_area_struct, vm_rb);
if (prev == vma->vm_rb.rb_left) {
gap_start = vm_end_gap(vma->vm_prev);
+ gap_start += gap_start_offset(info, gap_start);
gap_end = vm_start_gap(vma);
goto check_current;
}
@@ -1951,17 +1966,17 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
check_highest:
/* Check highest gap, which does not precede any rbtree node */
gap_start = mm->highest_vm_end;
+ gap_start += gap_start_offset(info, gap_start);
gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
if (gap_start > high_limit)
return -ENOMEM;
found:
/* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
+ if (gap_start < info->low_limit) {
gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
+ gap_start += gap_start_offset(info, gap_start);
+ }
VM_BUG_ON(gap_start + info->length > info->high_limit);
VM_BUG_ON(gap_start + info->length > gap_end);
@@ -1974,16 +1989,14 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
struct vm_area_struct *vma;
unsigned long length, low_limit, high_limit, gap_start, gap_end;
- /* Adjust search length to account for worst case alignment overhead */
- length = info->length + info->align_mask;
- if (length < info->length)
- return -ENOMEM;
+ length = info->length;
/*
* Adjust search limits by the desired length.
* See implementation comment at top of unmapped_area().
*/
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < length)
return -ENOMEM;
high_limit = gap_end - length;
@@ -2020,6 +2033,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
check_current:
/* Check if current node has a suitable gap */
gap_end = vm_start_gap(vma);
+ gap_end -= gap_end_offset(info, gap_end);
if (gap_end < low_limit)
return -ENOMEM;
if (gap_start <= high_limit &&
@@ -2054,13 +2068,14 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
found:
/* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
+ if (gap_end > info->high_limit) {
gap_end = info->high_limit;
+ gap_end -= gap_end_offset(info, gap_end);
+ }
found_highest:
/* Compute highest gap address at the desired alignment */
gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
VM_BUG_ON(gap_end < info->low_limit);
VM_BUG_ON(gap_end < gap_start);
--
1.8.5.6
This reverts commit be4c2d4723a4a637f0d1b4f7c66447141a4b3564.
That commit caused a severe memory leak in nfs_readdir_make_qstr().
When listing a directory with more than 100 files (this is how many
struct nfs_cache_array_entry elements fit in one 4kB page), all
allocated file name strings past those 100 leak.
The root of the leakage is that those string pointers are managed in
pages which are never linked into the page cache.
fs/nfs/dir.c puts pages into the page cache by calling
read_cache_page(); the callback function nfs_readdir_filler() will
then fill the given page struct which was passed to it, which is
already linked in the page cache (by do_read_cache_page() calling
add_to_page_cache_lru()).
Commit be4c2d4723a4 added another (local) array of allocated pages, to
be filled with more data, instead of discarding excess items received
from the NFS server. Those additional pages can be used by the next
nfs_readdir_filler() call (from within the same nfs_readdir() call).
The leak happens when some of those additional pages are never used
(copied to the page cache using copy_highpage()). The pages will be
freed by nfs_readdir_free_pages(), but their contents will not be. The
commit did not invoke nfs_readdir_clear_array() (and doing so would
have been dangerous, because it did not track which of those pages
were already copied to the page cache, risking double free bugs).
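As a simplified userspace analogy (this is not the kernel code; struct
fake_page and its helpers below are made-up stand-ins for the readahead
pages, nfs_readdir_make_qstr() and nfs_readdir_clear_array()), the ownership
problem is the classic one of freeing a container without releasing what its
entries point to:
---8<---
#include <stdlib.h>
#include <string.h>

struct entry { char *name; };
struct fake_page { size_t size; struct entry array[100]; };

/* like nfs_readdir_make_qstr(): each entry owns a heap-allocated name */
static void add_entry(struct fake_page *p, const char *name)
{
	p->array[p->size++].name = strdup(name);
}

/* rough analogue of nfs_readdir_clear_array(): release the names first */
static void clear_array(struct fake_page *p)
{
	size_t i;

	for (i = 0; i < p->size; i++)
		free(p->array[i].name);
	p->size = 0;
}

int main(void)
{
	struct fake_page *p = calloc(1, sizeof(*p));

	if (!p)
		return 1;
	add_entry(p, "101_0123456789abcdef");
	/*
	 * The leak pattern of the reverted commit: freeing only the page
	 * (as nfs_readdir_free_pages() does) drops the array but not the
	 * names it points to; they must be released explicitly first.
	 */
	clear_array(p);
	free(p);
	return 0;
}
---8<---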
How to reproduce the leak:
- Use a kernel with CONFIG_SLUB_DEBUG_ON.
- Create a directory on a NFS mount with more than 100 files with
names long enough to use the "kmalloc-32" slab (so we can easily
look up the allocation counts):
for i in `seq 110`; do touch ${i}_0123456789abcdef; done
- Drop all caches:
echo 3 >/proc/sys/vm/drop_caches
- Check the allocation counter:
grep nfs_readdir /sys/kernel/slab/kmalloc-32/alloc_calls
30564391 nfs_readdir_add_to_array+0x73/0xd0 age=534558/4791307/6540952 pid=370-1048386 cpus=0-47 nodes=0-1
- Request a directory listing and check the allocation counters again:
ls
[...]
grep nfs_readdir /sys/kernel/slab/kmalloc-32/alloc_calls
30564511 nfs_readdir_add_to_array+0x73/0xd0 age=207/4792999/6542663 pid=370-1048386 cpus=0-47 nodes=0-1
There are now 120 new allocations.
- Drop all caches and check the counters again:
echo 3 >/proc/sys/vm/drop_caches
grep nfs_readdir /sys/kernel/slab/kmalloc-32/alloc_calls
30564401 nfs_readdir_add_to_array+0x73/0xd0 age=735/4793524/6543176 pid=370-1048386 cpus=0-47 nodes=0-1
110 allocations are gone, but 10 have leaked and will never be freed.
Unhelpfully, those allocations are explicitly excluded from KMEMLEAK, which
is why my initial attempts with KMEMLEAK were not successful:
/*
* Avoid a kmemleak false positive. The pointer to the name is stored
* in a page cache page which kmemleak does not scan.
*/
kmemleak_not_leak(string->name);
It would be possible to solve this bug without reverting the whole
commit:
- keep track of which pages were not used, and call
nfs_readdir_clear_array() on them, or
- manually link those pages into the page cache
But for now I have decided to just revert the commit, because the real
fix would require complex considerations, risking more dangerous
(crash) bugs, which may seem unsuitable for the stable branches.
Signed-off-by: Max Kellermann <mk(a)cm4all.com>
Cc: stable(a)vger.kernel.org
---
fs/nfs/dir.c | 90 ++++-------------------------------------------
fs/nfs/internal.h | 3 +-
2 files changed, 7 insertions(+), 86 deletions(-)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 57b6a45576ad..9f44ddc34c7b 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -140,19 +140,12 @@ struct nfs_cache_array {
struct nfs_cache_array_entry array[0];
};
-struct readdirvec {
- unsigned long nr;
- unsigned long index;
- struct page *pages[NFS_MAX_READDIR_RAPAGES];
-};
-
typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
typedef struct {
struct file *file;
struct page *page;
struct dir_context *ctx;
unsigned long page_index;
- struct readdirvec pvec;
u64 *dir_cookie;
u64 last_cookie;
loff_t current_index;
@@ -532,10 +525,6 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
struct nfs_cache_array *array;
unsigned int count = 0;
int status;
- int max_rapages = NFS_MAX_READDIR_RAPAGES;
-
- desc->pvec.index = desc->page_index;
- desc->pvec.nr = 0;
scratch = alloc_page(GFP_KERNEL);
if (scratch == NULL)
@@ -560,40 +549,20 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
if (desc->plus)
nfs_prime_dcache(file_dentry(desc->file), entry);
- status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
- if (status == -ENOSPC) {
- desc->pvec.nr++;
- if (desc->pvec.nr == max_rapages)
- break;
- status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]);
- }
+ status = nfs_readdir_add_to_array(entry, page);
if (status != 0)
break;
} while (!entry->eof);
- /*
- * page and desc->pvec.pages[0] are valid, don't need to check
- * whether or not to be NULL.
- */
- copy_highpage(page, desc->pvec.pages[0]);
-
out_nopages:
if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
- array = kmap_atomic(desc->pvec.pages[desc->pvec.nr]);
+ array = kmap(page);
array->eof_index = array->size;
status = 0;
- kunmap_atomic(array);
+ kunmap(page);
}
put_page(scratch);
-
- /*
- * desc->pvec.nr > 0 means at least one page was completely filled,
- * we should return -ENOSPC. Otherwise function
- * nfs_readdir_xdr_to_array will enter infinite loop.
- */
- if (desc->pvec.nr > 0)
- return -ENOSPC;
return status;
}
@@ -627,24 +596,6 @@ int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
return -ENOMEM;
}
-/*
- * nfs_readdir_rapages_init initialize rapages by nfs_cache_array structure.
- */
-static
-void nfs_readdir_rapages_init(nfs_readdir_descriptor_t *desc)
-{
- struct nfs_cache_array *array;
- int max_rapages = NFS_MAX_READDIR_RAPAGES;
- int index;
-
- for (index = 0; index < max_rapages; index++) {
- array = kmap_atomic(desc->pvec.pages[index]);
- memset(array, 0, sizeof(struct nfs_cache_array));
- array->eof_index = -1;
- kunmap_atomic(array);
- }
-}
-
static
int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
{
@@ -655,12 +606,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
int status = -ENOMEM;
unsigned int array_size = ARRAY_SIZE(pages);
- /*
- * This means we hit readdir rdpages miss, the preallocated rdpages
- * are useless, the preallocate rdpages should be reinitialized.
- */
- nfs_readdir_rapages_init(desc);
-
entry.prev_cookie = 0;
entry.cookie = desc->last_cookie;
entry.eof = 0;
@@ -721,24 +666,9 @@ int nfs_readdir_filler(void *data, struct page* page)
struct inode *inode = file_inode(desc->file);
int ret;
- /*
- * If desc->page_index in range desc->pvec.index and
- * desc->pvec.index + desc->pvec.nr, we get readdir cache hit.
- */
- if (desc->page_index >= desc->pvec.index &&
- desc->page_index < (desc->pvec.index + desc->pvec.nr)) {
- /*
- * page and desc->pvec.pages[x] are valid, don't need to check
- * whether or not to be NULL.
- */
- copy_highpage(page, desc->pvec.pages[desc->page_index - desc->pvec.index]);
- ret = 0;
- } else {
- ret = nfs_readdir_xdr_to_array(desc, page, inode);
- if (ret < 0)
- goto error;
- }
-
+ ret = nfs_readdir_xdr_to_array(desc, page, inode);
+ if (ret < 0)
+ goto error;
SetPageUptodate(page);
if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
@@ -903,7 +833,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
*desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
int res = 0;
- int max_rapages = NFS_MAX_READDIR_RAPAGES;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
file, (long long)ctx->pos);
@@ -923,12 +852,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->decode = NFS_PROTO(inode)->decode_dirent;
desc->plus = nfs_use_readdirplus(inode, ctx);
- res = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages);
- if (res < 0)
- return -ENOMEM;
-
- nfs_readdir_rapages_init(desc);
-
if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
@@ -964,7 +887,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
} while (!desc->eof);
out:
- nfs_readdir_free_pages(desc->pvec.pages, max_rapages);
if (res > 0)
res = 0;
dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 498fab72f70b..81e2fdff227e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -69,8 +69,7 @@ struct nfs_clone_mount {
* Maximum number of pages that readdir can use for creating
* a vmapped array of pages.
*/
-#define NFS_MAX_READDIR_PAGES 64
-#define NFS_MAX_READDIR_RAPAGES 8
+#define NFS_MAX_READDIR_PAGES 8
struct nfs_client_initdata {
unsigned long init_flags;
--
2.20.1