The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: dfb3911c3692e45b027f13c7dca3230921533953
Gitweb: https://git.kernel.org/tip/dfb3911c3692e45b027f13c7dca3230921533953
Author: Thomas Gleixner <tglx(a)linutronix.de>
AuthorDate: Wed, 14 Aug 2024 00:29:36 +02:00
Committer: Thomas Gleixner <tglx(a)linutronix.de>
CommitterDate: Fri, 16 Aug 2024 11:33:33 +02:00
x86/kaslr: Expose and use the end of the physical memory address space
iounmap() on x86 occasionally fails to unmap because the provided valid
ioremap address is not below high_memory. It turned out that this
happens due to KASLR.
KASLR uses the full address space between PAGE_OFFSET and vaddr_end to
randomize the starting points of the direct map, vmalloc and vmemmap
regions. It thereby limits the size of the direct map by using the
installed memory size plus an extra configurable margin for hot-plug
memory. The direct map is limited this way to gain more randomization
space because otherwise only the holes between the direct map, vmalloc,
vmemmap and vaddr_end would be usable for randomization.
The limited direct map size is not exposed to the rest of the kernel, so
the memory hot-plug and resource management related code paths still
operate under the assumption that the available address space can be
determined with MAX_PHYSMEM_BITS.
request_free_mem_region() allocates from (1 << MAX_PHYSMEM_BITS) - 1
downwards. That means the first allocation happens past the end of the
direct map and, if unlucky, this address is in the vmalloc space, which
causes high_memory to become greater than VMALLOC_START and consequently
causes iounmap() to fail for valid ioremap addresses.
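A rough user-space model of that collision is sketched below. The boundary
values, the region size and the gfr_start()-style arithmetic are made-up
examples for illustration only, not taken from the kernel sources or from a
real boot:
    /* Illustrative only: all numbers below are hypothetical. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long physmem_bits_end = (1ULL << 46) - 1; /* (1 << MAX_PHYSMEM_BITS) - 1 */
            unsigned long long direct_map_end   = (1ULL << 40) - 1; /* KASLR-trimmed end (example) */
            unsigned long long size             = 1ULL << 30;       /* 1G free region request */

            /* request_free_mem_region() style: first candidate comes from the top, downwards */
            unsigned long long first = physmem_bits_end - size + 1;

            if (first > direct_map_end)
                    printf("first candidate 0x%llx lies beyond the trimmed direct map\n", first);
            return 0;
    }
With these example numbers the first candidate sits far above the trimmed
direct map, which is exactly the situation the PHYSMEM_END change described
below avoids.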
MAX_PHYSMEM_BITS cannot be changed for that because the randomization
does not align with address bit boundaries and there are other places
which actually need to know the maximum number of address bits. All
remaining usage sites of MAX_PHYSMEM_BITS have been analyzed and found
to be correct.
Cure this by exposing the end of the direct map via PHYSMEM_END and using
it in the memory hot-plug and resource management related places instead
of relying on MAX_PHYSMEM_BITS. In the KASLR case PHYSMEM_END maps to a
variable which is initialized by the KASLR initialization code; otherwise
it is based on MAX_PHYSMEM_BITS as before.
To prevent future hiccups, add a check to add_pages() to catch callers
trying to add memory above PHYSMEM_END.
Fixes: 0483e1fa6e09 ("x86/mm: Implement ASLR for kernel memory regions")
Reported-by: Max Ramanouski <max8rr8(a)gmail.com>
Reported-by: Alistair Popple <apopple(a)nvidia.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Tested-by: Max Ramanouski <max8rr8(a)gmail.com>
Tested-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Alistair Popple <apopple(a)nvidia.com>
Reviewed-by: Kees Cook <kees(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/87ed6soy3z.ffs@tglx
---
arch/x86/include/asm/page_64.h | 1 +-
arch/x86/include/asm/pgtable_64_types.h | 4 ++++-
arch/x86/mm/init_64.c | 4 ++++-
arch/x86/mm/kaslr.c | 26 +++++++++++++++++++++---
include/linux/mm.h | 4 ++++-
kernel/resource.c | 6 ++----
mm/memory_hotplug.c | 2 +-
mm/sparse.c | 2 +-
8 files changed, 40 insertions(+), 9 deletions(-)
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index af4302d..f3d257c 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -17,6 +17,7 @@ extern unsigned long phys_base;
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
extern unsigned long vmemmap_base;
+extern unsigned long physmem_end;
static __always_inline unsigned long __phys_addr_nodebug(unsigned long x)
{
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 9053dfe..a98e534 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -140,6 +140,10 @@ extern unsigned int ptrs_per_p4d;
# define VMEMMAP_START __VMEMMAP_BASE_L4
#endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
+#ifdef CONFIG_RANDOMIZE_MEMORY
+# define PHYSMEM_END physmem_end
+#endif
+
/*
* End of the region for which vmalloc page tables are pre-allocated.
* For non-KMSAN builds, this is the same as VMALLOC_END.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d8dbeac..ff25364 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -958,8 +958,12 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
struct mhp_params *params)
{
+ unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1;
int ret;
+ if (WARN_ON_ONCE(end > PHYSMEM_END))
+ return -ERANGE;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
WARN_ON_ONCE(ret);
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 37db264..0f2a3a4 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -47,13 +47,24 @@ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
*/
static __initdata struct kaslr_memory_region {
unsigned long *base;
+ unsigned long *end;
unsigned long size_tb;
} kaslr_regions[] = {
- { &page_offset_base, 0 },
- { &vmalloc_base, 0 },
- { &vmemmap_base, 0 },
+ {
+ .base = &page_offset_base,
+ .end = &physmem_end,
+ },
+ {
+ .base = &vmalloc_base,
+ },
+ {
+ .base = &vmemmap_base,
+ },
};
+/* The end of the possible address space for physical memory */
+unsigned long physmem_end __ro_after_init;
+
/* Get size in bytes used by the memory region */
static inline unsigned long get_padding(struct kaslr_memory_region *region)
{
@@ -82,6 +93,8 @@ void __init kernel_randomize_memory(void)
BUILD_BUG_ON(vaddr_end != CPU_ENTRY_AREA_BASE);
BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+ /* Preset the end of the possible address space for physical memory */
+ physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1);
if (!kaslr_memory_enabled())
return;
@@ -134,6 +147,13 @@ void __init kernel_randomize_memory(void)
*/
vaddr += get_padding(&kaslr_regions[i]);
vaddr = round_up(vaddr + 1, PUD_SIZE);
+
+ /*
+ * KASLR trims the maximum possible size of the
+ * direct-map. Update the physmem_end boundary.
+ */
+ if (kaslr_regions[i].end)
+ *kaslr_regions[i].end = __pa(vaddr) - 1;
remain_entropy -= entropy;
}
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c4b238a..b386415 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -97,6 +97,10 @@ extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif
+#ifndef PHYSMEM_END
+# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1)
+#endif
+
#include <asm/page.h>
#include <asm/processor.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 14777af..a83040f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1826,8 +1826,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size,
if (flags & GFR_DESCENDING) {
resource_size_t end;
- end = min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ end = min_t(resource_size_t, base->end, PHYSMEM_END);
return end - size + 1;
}
@@ -1844,8 +1843,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr,
* @size did not wrap 0.
*/
return addr > addr - size &&
- addr <= min_t(resource_size_t, base->end,
- (1ULL << MAX_PHYSMEM_BITS) - 1);
+ addr <= min_t(resource_size_t, base->end, PHYSMEM_END);
}
static resource_size_t gfr_next(resource_size_t addr, resource_size_t size,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 66267c2..951878a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void)
struct range mhp_get_pluggable_range(bool need_mapping)
{
- const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ const u64 max_phys = PHYSMEM_END;
struct range mhp_range;
if (need_mapping) {
diff --git a/mm/sparse.c b/mm/sparse.c
index e4b8300..0c3bff8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section)
static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
- unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
+ unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT;
/*
* Sanity checks - do not allow an architecture to pass
The quilt patch titled
Subject: alloc_tag: mark pages reserved during CMA activation as not tagged
has been removed from the -mm tree. Its filename was
alloc_tag-mark-pages-reserved-during-cma-activation-as-not-tagged.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Suren Baghdasaryan <surenb(a)google.com>
Subject: alloc_tag: mark pages reserved during CMA activation as not tagged
Date: Tue, 13 Aug 2024 08:07:57 -0700
During CMA activation, pages in the CMA area are prepared and then freed
without being allocated. This triggers warnings when the memory allocation
debug config (CONFIG_MEM_ALLOC_PROFILING_DEBUG) is enabled. Fix this by
marking these pages as not tagged before freeing them.
Link: https://lkml.kernel.org/r/20240813150758.855881-2-surenb@google.com
Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty")
Signed-off-by: Suren Baghdasaryan <surenb(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Cc: Sourav Panda <souravpanda(a)google.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org> [6.10]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mm_init.c | 2 ++
1 file changed, 2 insertions(+)
--- a/mm/mm_init.c~alloc_tag-mark-pages-reserved-during-cma-activation-as-not-tagged
+++ a/mm/mm_init.c
@@ -2244,6 +2244,8 @@ void __init init_cma_reserved_pageblock(
set_pageblock_migratetype(page, MIGRATE_CMA);
set_page_refcounted(page);
+ /* pages were reserved and not allocated */
+ clear_page_tag_ref(page);
__free_pages(page, pageblock_order);
adjust_managed_page_count(page, pageblock_nr_pages);
_
Patches currently in -mm which might be from surenb(a)google.com are
The quilt patch titled
Subject: alloc_tag: introduce clear_page_tag_ref() helper function
has been removed from the -mm tree. Its filename was
alloc_tag-introduce-clear_page_tag_ref-helper-function.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Suren Baghdasaryan <surenb(a)google.com>
Subject: alloc_tag: introduce clear_page_tag_ref() helper function
Date: Tue, 13 Aug 2024 08:07:56 -0700
In several cases we are freeing pages which were not allocated using the
common page allocators. For such cases, in order to keep allocation
accounting correct, we should clear the page tag to indicate that the page
being freed is not expected to have a valid allocation tag. Introduce a
clear_page_tag_ref() helper function to be used for this.
Link: https://lkml.kernel.org/r/20240813150758.855881-1-surenb@google.com
Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty")
Signed-off-by: Suren Baghdasaryan <surenb(a)google.com>
Suggested-by: David Hildenbrand <david(a)redhat.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin(a)soleen.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: Sourav Panda <souravpanda(a)google.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org> [6.10]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/pgalloc_tag.h | 13 +++++++++++++
mm/mm_init.c | 10 +---------
mm/page_alloc.c | 9 +--------
3 files changed, 15 insertions(+), 17 deletions(-)
--- a/include/linux/pgalloc_tag.h~alloc_tag-introduce-clear_page_tag_ref-helper-function
+++ a/include/linux/pgalloc_tag.h
@@ -43,6 +43,18 @@ static inline void put_page_tag_ref(unio
page_ext_put(page_ext_from_codetag_ref(ref));
}
+static inline void clear_page_tag_ref(struct page *page)
+{
+ if (mem_alloc_profiling_enabled()) {
+ union codetag_ref *ref = get_page_tag_ref(page);
+
+ if (ref) {
+ set_codetag_empty(ref);
+ put_page_tag_ref(ref);
+ }
+ }
+}
+
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr)
{
@@ -126,6 +138,7 @@ static inline void pgalloc_tag_sub_pages
static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; }
static inline void put_page_tag_ref(union codetag_ref *ref) {}
+static inline void clear_page_tag_ref(struct page *page) {}
static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
unsigned int nr) {}
static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
--- a/mm/mm_init.c~alloc_tag-introduce-clear_page_tag_ref-helper-function
+++ a/mm/mm_init.c
@@ -2459,15 +2459,7 @@ void __init memblock_free_pages(struct p
}
/* pages were reserved and not allocated */
- if (mem_alloc_profiling_enabled()) {
- union codetag_ref *ref = get_page_tag_ref(page);
-
- if (ref) {
- set_codetag_empty(ref);
- put_page_tag_ref(ref);
- }
- }
-
+ clear_page_tag_ref(page);
__free_pages_core(page, order, MEMINIT_EARLY);
}
--- a/mm/page_alloc.c~alloc_tag-introduce-clear_page_tag_ref-helper-function
+++ a/mm/page_alloc.c
@@ -5815,14 +5815,7 @@ unsigned long free_reserved_area(void *s
void free_reserved_page(struct page *page)
{
- if (mem_alloc_profiling_enabled()) {
- union codetag_ref *ref = get_page_tag_ref(page);
-
- if (ref) {
- set_codetag_empty(ref);
- put_page_tag_ref(ref);
- }
- }
+ clear_page_tag_ref(page);
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
_
Patches currently in -mm which might be from surenb(a)google.com are
The quilt patch titled
Subject: selftests: memfd_secret: don't build memfd_secret test on unsupported arches
has been removed from the -mm tree. Its filename was
selftests-memfd_secret-dont-build-memfd_secret-test-on-unsupported-arches.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Muhammad Usama Anjum <usama.anjum(a)collabora.com>
Subject: selftests: memfd_secret: don't build memfd_secret test on unsupported arches
Date: Fri, 9 Aug 2024 12:56:42 +0500
[1] mentions that memfd_secret is only supported on arm64, riscv, x86 and
x86_64 for now. It doesn't support other architectures. I found the
build error on arm and decided to send the fix as it was creating noise on
KernelCI:
memfd_secret.c: In function 'memfd_secret':
memfd_secret.c:42:24: error: '__NR_memfd_secret' undeclared (first use in this function);
did you mean 'memfd_secret'?
42 | return syscall(__NR_memfd_secret, flags);
| ^~~~~~~~~~~~~~~~~
| memfd_secret
Hence I'm adding a condition so that memfd_secret is only compiled on
supported architectures.
Also check in the run_vmtests script whether the memfd_secret binary is
present before executing it.
Link: https://lkml.kernel.org/r/20240812061522.1933054-1-usama.anjum@collabora.com
Link: https://lore.kernel.org/all/20210518072034.31572-7-rppt@kernel.org/ [1]
Link: https://lkml.kernel.org/r/20240809075642.403247-1-usama.anjum@collabora.com
Fixes: 76fe17ef588a ("secretmem: test: add basic selftest for memfd_secret(2)")
Signed-off-by: Muhammad Usama Anjum <usama.anjum(a)collabora.com>
Reviewed-by: Shuah Khan <skhan(a)linuxfoundation.org>
Acked-by: Mike Rapoport (Microsoft) <rppt(a)kernel.org>
Cc: Albert Ou <aou(a)eecs.berkeley.edu>
Cc: James Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Mike Rapoport (Microsoft) <rppt(a)kernel.org>
Cc: Palmer Dabbelt <palmer(a)dabbelt.com>
Cc: Paul Walmsley <paul.walmsley(a)sifive.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/mm/Makefile | 2 ++
tools/testing/selftests/mm/run_vmtests.sh | 3 +++
2 files changed, 5 insertions(+)
--- a/tools/testing/selftests/mm/Makefile~selftests-memfd_secret-dont-build-memfd_secret-test-on-unsupported-arches
+++ a/tools/testing/selftests/mm/Makefile
@@ -53,7 +53,9 @@ TEST_GEN_FILES += madv_populate
TEST_GEN_FILES += map_fixed_noreplace
TEST_GEN_FILES += map_hugetlb
TEST_GEN_FILES += map_populate
+ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64))
TEST_GEN_FILES += memfd_secret
+endif
TEST_GEN_FILES += migration
TEST_GEN_FILES += mkdirty
TEST_GEN_FILES += mlock-random-test
--- a/tools/testing/selftests/mm/run_vmtests.sh~selftests-memfd_secret-dont-build-memfd_secret-test-on-unsupported-arches
+++ a/tools/testing/selftests/mm/run_vmtests.sh
@@ -374,8 +374,11 @@ CATEGORY="hmm" run_test bash ./test_hmm.
# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests
CATEGORY="madv_populate" run_test ./madv_populate
+if [ -x ./memfd_secret ]
+then
(echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix
CATEGORY="memfd_secret" run_test ./memfd_secret
+fi
# KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100
CATEGORY="ksm" run_test ./ksm_tests -H -s 100
_
Patches currently in -mm which might be from usama.anjum(a)collabora.com are
selftests-mm-fix-build-errors-on-armhf.patch
The quilt patch titled
Subject: mm: fix endless reclaim on machines with unaccepted memory
has been removed from the -mm tree. Its filename was
mm-fix-endless-reclaim-on-machines-with-unaccepted-memory.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Subject: mm: fix endless reclaim on machines with unaccepted memory
Date: Fri, 9 Aug 2024 14:48:47 +0300
Unaccepted memory is considered unusable free memory, which is not counted
as free in the zone watermark check. This causes get_page_from_freelist()
to accept more memory to hit the high watermark, but it creates problems
in the reclaim path.
The reclaim path encounters a failed zone watermark check and attempts to
reclaim memory. This is usually successful, but if there is little or no
reclaimable memory, it can result in endless reclaim with little to no
progress. This can occur early in the boot process, just after the start
of the init process, when the only reclaimable memory is the page cache of
the init executable and its libraries.
Treat unaccepted memory as free from the watermark check's point of view.
This way unaccepted memory will never be the trigger of memory reclaim.
Accept more memory in get_page_from_freelist() if needed.
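The arithmetic behind "accept more memory if needed" can be modelled in a
few lines. The numbers and the plain loop below are hypothetical stand-ins
for illustration, not the kernel implementation:
    /* Hypothetical example values, in pages. */
    #include <stdio.h>

    #define ACCEPT_BATCH 1024 /* assumed pages accepted per step (MAX_ORDER_NR_PAGES-like) */

    int main(void)
    {
            long high_wmark = 1000;
            long free_pages = 1500; /* now includes still-unaccepted pages */
            long unusable   = 100;  /* highatomic reserves, CMA, ... */
            long unaccepted = 800;

            /* how much to accept to get back above the high watermark */
            long to_accept = high_wmark - (free_pages - unusable - unaccepted);

            while (to_accept > 0) {
                    printf("accept one batch of %d pages\n", ACCEPT_BATCH);
                    to_accept -= ACCEPT_BATCH;
            }
            return 0;
    }
The point is that unaccepted pages are no longer subtracted in the watermark
check itself, so their presence can never push the allocator into reclaim;
they are accepted on demand instead.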
Link: https://lkml.kernel.org/r/20240809114854.3745464-2-kirill.shutemov@linux.in…
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reported-by: Jianxiong Gao <jxgao(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Tested-by: Jianxiong Gao <jxgao(a)google.com>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Mike Rapoport (Microsoft) <rppt(a)kernel.org>
Cc: Tom Lendacky <thomas.lendacky(a)amd.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org> [6.5+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 42 ++++++++++++++++++++----------------------
1 file changed, 20 insertions(+), 22 deletions(-)
--- a/mm/page_alloc.c~mm-fix-endless-reclaim-on-machines-with-unaccepted-memory
+++ a/mm/page_alloc.c
@@ -287,7 +287,7 @@ EXPORT_SYMBOL(nr_online_nodes);
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
-static bool try_to_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);
@@ -3072,9 +3072,6 @@ static inline long __zone_watermark_unus
if (!(alloc_flags & ALLOC_CMA))
unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
-#ifdef CONFIG_UNACCEPTED_MEMORY
- unusable_free += zone_page_state(z, NR_UNACCEPTED);
-#endif
return unusable_free;
}
@@ -3368,6 +3365,8 @@ retry:
}
}
+ cond_accept_memory(zone, order);
+
/*
* Detect whether the number of free pages is below high
* watermark. If so, we will decrease pcp->high and free
@@ -3393,10 +3392,8 @@ check_alloc_wmark:
gfp_mask)) {
int ret;
- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order))
+ goto try_this_zone;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
@@ -3450,10 +3447,8 @@ try_this_zone:
return page;
} else {
- if (has_unaccepted_memory()) {
- if (try_to_accept_memory(zone, order))
- goto try_this_zone;
- }
+ if (cond_accept_memory(zone, order))
+ goto try_this_zone;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
@@ -6950,9 +6945,6 @@ static bool try_to_accept_memory_one(str
struct page *page;
bool last;
- if (list_empty(&zone->unaccepted_pages))
- return false;
-
spin_lock_irqsave(&zone->lock, flags);
page = list_first_entry_or_null(&zone->unaccepted_pages,
struct page, lru);
@@ -6978,23 +6970,29 @@ static bool try_to_accept_memory_one(str
return true;
}
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
long to_accept;
- int ret = false;
+ bool ret = false;
+
+ if (!has_unaccepted_memory())
+ return false;
+
+ if (list_empty(&zone->unaccepted_pages))
+ return false;
/* How much to accept to get to high watermark? */
to_accept = high_wmark_pages(zone) -
(zone_page_state(zone, NR_FREE_PAGES) -
- __zone_watermark_unusable_free(zone, order, 0));
+ __zone_watermark_unusable_free(zone, order, 0) -
+ zone_page_state(zone, NR_UNACCEPTED));
- /* Accept at least one page */
- do {
+ while (to_accept > 0) {
if (!try_to_accept_memory_one(zone))
break;
ret = true;
to_accept -= MAX_ORDER_NR_PAGES;
- } while (to_accept > 0);
+ }
return ret;
}
@@ -7037,7 +7035,7 @@ static void accept_page(struct page *pag
{
}
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
return false;
}
_
Patches currently in -mm which might be from kirill.shutemov(a)linux.intel.com are
mm-reduce-deferred-struct-page-init-ifdeffery.patch
mm-accept-memory-in-__alloc_pages_bulk.patch
mm-introduce-pageunaccepted-page-type.patch
mm-rework-accept-memory-helpers.patch
mm-add-a-helper-to-accept-page.patch
mm-page_isolation-handle-unaccepted-memory-isolation.patch
mm-accept-to-promo-watermark.patch
The quilt patch titled
Subject: mm/numa: no task_numa_fault() call if PMD is changed
has been removed from the -mm tree. Its filename was
mm-numa-no-task_numa_fault-call-if-pmd-is-changed.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zi Yan <ziy(a)nvidia.com>
Subject: mm/numa: no task_numa_fault() call if PMD is changed
Date: Fri, 9 Aug 2024 10:59:05 -0400
When handling a numa page fault, task_numa_fault() should be called by a
process that restores the page table of the faulted folio to avoid
duplicated stats counting. Commit c5b5a3dd2c1f ("mm: thp: refactor NUMA
fault handling") restructured do_huge_pmd_numa_page() and did not avoid
task_numa_fault() call in the second page table check after a numa
migration failure. Fix it by making all !pmd_same() return immediately.
This issue can cause task_numa_fault() being called more than necessary
and lead to unexpected numa balancing results (It is hard to tell whether
the issue will cause positive or negative performance impact due to
duplicated numa fault counting).
Link: https://lkml.kernel.org/r/20240809145906.1513458-3-ziy@nvidia.com
Fixes: c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling")
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.inte…
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 29 +++++++++++++----------------
1 file changed, 13 insertions(+), 16 deletions(-)
--- a/mm/huge_memory.c~mm-numa-no-task_numa_fault-call-if-pmd-is-changed
+++ a/mm/huge_memory.c
@@ -1685,7 +1685,7 @@ vm_fault_t do_huge_pmd_numa_page(struct
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
spin_unlock(vmf->ptl);
- goto out;
+ return 0;
}
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
@@ -1728,22 +1728,16 @@ vm_fault_t do_huge_pmd_numa_page(struct
if (!migrate_misplaced_folio(folio, vma, target_nid)) {
flags |= TNF_MIGRATED;
nid = target_nid;
- } else {
- flags |= TNF_MIGRATE_FAIL;
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
- spin_unlock(vmf->ptl);
- goto out;
- }
- goto out_map;
- }
-
-out:
- if (nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
+ return 0;
+ }
- return 0;
-
+ flags |= TNF_MIGRATE_FAIL;
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
out_map:
/* Restore the PMD */
pmd = pmd_modify(oldpmd, vma->vm_page_prot);
@@ -1753,7 +1747,10 @@ out_map:
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
- goto out;
+
+ if (nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
+ return 0;
}
/*
_
Patches currently in -mm which might be from ziy(a)nvidia.com are
memory-tiering-read-last_cpupid-correctly-in-do_huge_pmd_numa_page.patch
memory-tiering-introduce-folio_use_access_time-check.patch
memory-tiering-count-pgpromote_success-when-mem-tiering-is-enabled.patch
mm-migrate-move-common-code-to-numa_migrate_check-was-numa_migrate_prep.patch
The quilt patch titled
Subject: mm/numa: no task_numa_fault() call if PTE is changed
has been removed from the -mm tree. Its filename was
mm-numa-no-task_numa_fault-call-if-pte-is-changed.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zi Yan <ziy(a)nvidia.com>
Subject: mm/numa: no task_numa_fault() call if PTE is changed
Date: Fri, 9 Aug 2024 10:59:04 -0400
When handling a numa page fault, task_numa_fault() should be called by a
process that restores the page table of the faulted folio to avoid
duplicated stats counting. Commit b99a342d4f11 ("NUMA balancing: reduce
TLB flush via delaying mapping on hint page fault") restructured
do_numa_page() and did not avoid the task_numa_fault() call in the second
page table check after a numa migration failure. Fix it by making all
!pte_same() checks return immediately.
This issue can cause task_numa_fault() to be called more than necessary
and lead to unexpected numa balancing results (it is hard to tell whether
the issue will cause a positive or negative performance impact due to the
duplicated numa fault counting).
Link: https://lkml.kernel.org/r/20240809145906.1513458-2-ziy@nvidia.com
Fixes: b99a342d4f11 ("NUMA balancing: reduce TLB flush via delaying mapping on hint page fault")
Signed-off-by: Zi Yan <ziy(a)nvidia.com>
Reported-by: "Huang, Ying" <ying.huang(a)intel.com>
Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.inte…
Acked-by: David Hildenbrand <david(a)redhat.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Mel Gorman <mgorman(a)suse.de>
Cc: Yang Shi <shy828301(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 33 ++++++++++++++++-----------------
1 file changed, 16 insertions(+), 17 deletions(-)
--- a/mm/memory.c~mm-numa-no-task_numa_fault-call-if-pte-is-changed
+++ a/mm/memory.c
@@ -5295,7 +5295,7 @@ static vm_fault_t do_numa_page(struct vm
if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- goto out;
+ return 0;
}
pte = pte_modify(old_pte, vma->vm_page_prot);
@@ -5358,23 +5358,19 @@ static vm_fault_t do_numa_page(struct vm
if (!migrate_misplaced_folio(folio, vma, target_nid)) {
nid = target_nid;
flags |= TNF_MIGRATED;
- } else {
- flags |= TNF_MIGRATE_FAIL;
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
- vmf->address, &vmf->ptl);
- if (unlikely(!vmf->pte))
- goto out;
- if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- goto out;
- }
- goto out_map;
+ task_numa_fault(last_cpupid, nid, nr_pages, flags);
+ return 0;
}
-out:
- if (nid != NUMA_NO_NODE)
- task_numa_fault(last_cpupid, nid, nr_pages, flags);
- return 0;
+ flags |= TNF_MIGRATE_FAIL;
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!vmf->pte))
+ return 0;
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
out_map:
/*
* Make it present again, depending on how arch implements
@@ -5387,7 +5383,10 @@ out_map:
numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
writable);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- goto out;
+
+ if (nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, nid, nr_pages, flags);
+ return 0;
}
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
_
Patches currently in -mm which might be from ziy(a)nvidia.com are
memory-tiering-read-last_cpupid-correctly-in-do_huge_pmd_numa_page.patch
memory-tiering-introduce-folio_use_access_time-check.patch
memory-tiering-count-pgpromote_success-when-mem-tiering-is-enabled.patch
mm-migrate-move-common-code-to-numa_migrate_check-was-numa_migrate_prep.patch
The quilt patch titled
Subject: mm/vmalloc: fix page mapping if vm_area_alloc_pages() with high order fallback to order 0
has been removed from the -mm tree. Its filename was
mm-vmalloc-fix-page-mapping-if-vm_area_alloc_pages-with-high-order-fallback-to-order-0.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Hailong Liu <hailong.liu(a)oppo.com>
Subject: mm/vmalloc: fix page mapping if vm_area_alloc_pages() with high order fallback to order 0
Date: Thu, 8 Aug 2024 20:19:56 +0800
__vmap_pages_range_noflush() assumes that its pages** argument contains
pages with the same page shift. However, since commit e9c3cda4d86e ("mm,
vmalloc: fix high order __GFP_NOFAIL allocations"), if gfp_flags includes
__GFP_NOFAIL with a high order in vm_area_alloc_pages() and the high-order
page allocation fails, pages** may contain two different page shifts (high
order and order-0). This can lead __vmap_pages_range_noflush() to perform
incorrect mappings, potentially resulting in memory corruption.
Users might encounter this as follows (vmap_allow_huge = true, 2M is for
PMD_SIZE):
kvmalloc(2M, __GFP_NOFAIL|GFP_X)
    __vmalloc_node_range_noprof(vm_flags=VM_ALLOW_HUGE_VMAP)
        vm_area_alloc_pages(order=9) ---> order-9 allocation failed and fallback to order-0
    vmap_pages_range()
        vmap_pages_range_noflush()
            __vmap_pages_range_noflush(page_shift = 21) ----> wrong mapping happens
We can remove the fallback code because if a high-order allocation fails,
__vmalloc_node_range_noprof() will retry with order-0, so falling back to
order-0 here is unnecessary. Fix this by removing the fallback code.
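A simplified model of why the mixed page orders are dangerous follows. The
PFNs and the constant below are hypothetical and this is only the
arithmetic, not kernel code: with page_shift set to the 2M shift, the
mapping code treats every entry of pages** as the start of 512 physically
contiguous 4K pages, which an order-0 fallback page does not provide:
    /* Hypothetical PFNs; illustrates the contiguity assumption only. */
    #include <stdio.h>

    #define PAGES_PER_2M 512 /* 2M / 4K */

    int main(void)
    {
            /* the second entry stands in for an order-0 fallback page */
            unsigned long pfns[] = { 0x100000, 0x2345 };
            unsigned long i;

            for (i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++)
                    printf("entry %lu: mapped as PFNs 0x%lx..0x%lx (one 2M block)\n",
                           i, pfns[i], pfns[i] + PAGES_PER_2M - 1);

            printf("the order-0 entry does not own the 511 PFNs that follow it\n");
            return 0;
    }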
Link: https://lkml.kernel.org/r/20240808122019.3361-1-hailong.liu@oppo.com
Fixes: e9c3cda4d86e ("mm, vmalloc: fix high order __GFP_NOFAIL allocations")
Signed-off-by: Hailong Liu <hailong.liu(a)oppo.com>
Reported-by: Tangquan Zheng <zhengtangquan(a)oppo.com>
Reviewed-by: Baoquan He <bhe(a)redhat.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki(a)gmail.com>
Acked-by: Barry Song <baohua(a)kernel.org>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmalloc.c | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
--- a/mm/vmalloc.c~mm-vmalloc-fix-page-mapping-if-vm_area_alloc_pages-with-high-order-fallback-to-order-0
+++ a/mm/vmalloc.c
@@ -3584,15 +3584,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
page = alloc_pages_noprof(alloc_gfp, order);
else
page = alloc_pages_node_noprof(nid, alloc_gfp, order);
- if (unlikely(!page)) {
- if (!nofail)
- break;
-
- /* fall back to the zero order allocations */
- alloc_gfp |= __GFP_NOFAIL;
- order = 0;
- continue;
- }
+ if (unlikely(!page))
+ break;
/*
* Higher order allocations must be able to be treated as
_
Patches currently in -mm which might be from hailong.liu(a)oppo.com are