From: Steven Rostedt <rostedt(a)goodmis.org>
The following causes a vsnprintf fault:
# echo 's:wake_lat char[] wakee; u64 delta;' >> /sys/kernel/tracing/dynamic_events
# echo 'hist:keys=pid:ts=common_timestamp.usecs if !(common_flags & 0x18)' > /sys/kernel/tracing/events/sched/sched_waking/trigger
# echo 'hist:keys=next_pid:delta=common_timestamp.usecs-$ts:onmatch(sched.sched_waking).trace(wake_lat,next_comm,$delta)' > /sys/kernel/tracing/events/sched/sched_switch/trigger
This is because the synthetic event's "wakee" field is created as a
dynamic string (even though the string copied into it is not). The print
format used for the dynamic string changed from "%*s" to "%s" because
another location (__set_synth_event_print_fmt()) exported this format to
user space, and user space did not need the length. But the format is
still used in print_synth_event(), and the output looks like:
<idle>-0 [001] d..5. 193.428167: wake_lat: wakee=(efault)sshd-sessiondelta=155
sshd-session-879 [001] d..5. 193.811080: wake_lat: wakee=(efault)kworker/u34:5delta=58
<idle>-0 [002] d..5. 193.811198: wake_lat: wakee=(efault)bashdelta=91
bash-880 [002] d..5. 193.811371: wake_lat: wakee=(efault)kworker/u35:2delta=21
<idle>-0 [001] d..5. 193.811516: wake_lat: wakee=(efault)sshd-sessiondelta=129
sshd-session-879 [001] d..5. 193.967576: wake_lat: wakee=(efault)kworker/u34:5delta=50
The length isn't needed as the string is always nul terminated. Just print
the string without adding the length (which was hard coded to the max
string length anyway).
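For illustration, a minimal user-space sketch (plain printf, with 256
standing in for the hard-coded max length; not the kernel code) of why
"%*s" consumes an extra width argument that "%s" does not:
```
#include <stdio.h>

int main(void)
{
	const char *wakee = "sshd-session";

	/* Old pairing: "%*s" expects a width argument before the string. */
	printf("wakee=%*s\n", 256, wakee);

	/* New format: "%s" takes the string only, no width argument. */
	printf("wakee=%s\n", wakee);
	return 0;
}
```
Dropping the hard-coded length argument lets the "%s" conversions line up
with the string pointers again.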
Cc: stable(a)vger.kernel.org
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Tom Zanussi <zanussi(a)kernel.org>
Cc: Douglas Raillard <douglas.raillard(a)arm.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat(a)kernel.org>
Link: https://lore.kernel.org/20250407154139.69955768@gandalf.local.home
Fixes: 4d38328eb442d ("tracing: Fix synth event printk format for str fields")
Signed-off-by: Steven Rostedt (Google) <rostedt(a)goodmis.org>
---
kernel/trace/trace_events_synth.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 969f48742d72..33cfbd4ed76d 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -370,7 +370,6 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
union trace_synth_field *data = &entry->fields[n_u64];
trace_seq_printf(s, print_fmt, se->fields[i]->name,
- STR_VAR_LEN_MAX,
(char *)entry + data->as_dynamic.offset,
i == se->n_fields - 1 ? "" : " ");
n_u64++;
--
2.47.2
From: Jarkko Sakkinen <jarkko.sakkinen(a)opinsys.com>
Add an isolated list of unreferenced keys to be queued for deletion, and
try to pin the keys in the garbage collector before processing anything.
Skip unpinnable keys.
Use this list to block the reaping process during teardown:
1. First off, the keys added to `key_graveyard` are snapshotted, and the
list is flushed. This is the very last step in `key_put()`.
2. `key_put()` reaches zero. This will mark the key as busy for the
garbage collector.
3. `key_garbage_collector()` will try to increase the refcount, which
won't go above zero. Whenever this happens, the key will be skipped (see
the sketch after this list).
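As a rough user-space sketch of the pin-or-skip idea (a hypothetical
`fake_key` with a plain int standing in for refcount_t; not the kernel
implementation):
```
#include <stdbool.h>
#include <stdio.h>

struct fake_key {
	int usage;			/* stand-in for refcount_t */
};

/* Stand-in for refcount_inc_not_zero(): pin only while still referenced. */
static bool pin_key(struct fake_key *key)
{
	if (key->usage == 0)
		return false;		/* final put already ran: skip it */
	key->usage++;
	return true;
}

int main(void)
{
	struct fake_key live = { .usage = 1 };
	struct fake_key dead = { .usage = 0 };

	printf("live pinned: %d\n", pin_key(&live));	/* 1: process it */
	printf("dead pinned: %d\n", pin_key(&dead));	/* 0: GC skips it */
	return 0;
}
```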
Cc: stable(a)vger.kernel.org # v6.1+
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen(a)opinsys.com>
---
v8:
- One more rebasing error (2x list_splice_init, reported by Marek Szyprowski)
v7:
- Fixed multiple definitions (from rebasing).
v6:
- Rebase went wrong in v5.
v5:
- Rebased on top of v6.15-rc
- Updated commit message to explain how spin lock and refcount
isolate the time window in key_put().
v4:
- Pin the key while processing key type teardown. Skip dead keys.
- Revert key_gc_graveyard back to key_gc_unused_keys.
- Rewrote the commit message.
- The "unsigned long flags" declaration somehow made it into the previous
patch (sorry).
v3:
- Using spin_lock() fails since key_put() is executed inside IRQs.
Using spin_lock_irqsave() would not work either, given the lock is
acquired for /proc/keys. Therefore, use a separate lock for the
graveyard, and flush key_graveyard before reaping key_serial_tree.
v2:
- Rename key_gc_unused_keys as key_gc_graveyard, and re-document the
function.
---
include/linux/key.h | 7 ++-----
security/keys/gc.c | 36 ++++++++++++++++++++----------------
security/keys/internal.h | 3 +++
security/keys/key.c | 7 +++++--
4 files changed, 30 insertions(+), 23 deletions(-)
diff --git a/include/linux/key.h b/include/linux/key.h
index ba05de8579ec..c50659184bdf 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -195,10 +195,8 @@ enum key_state {
struct key {
refcount_t usage; /* number of references */
key_serial_t serial; /* key serial number */
- union {
- struct list_head graveyard_link;
- struct rb_node serial_node;
- };
+ struct list_head graveyard_link; /* key->usage == 0 */
+ struct rb_node serial_node;
#ifdef CONFIG_KEY_NOTIFICATIONS
struct watch_list *watchers; /* Entities watching this key for changes */
#endif
@@ -236,7 +234,6 @@ struct key {
#define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */
#define KEY_FLAG_KEEP 8 /* set if key should not be removed */
#define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */
-#define KEY_FLAG_FINAL_PUT 10 /* set if final put has happened on key */
/* the key type and key description string
* - the desc is used to match a key against search criteria
diff --git a/security/keys/gc.c b/security/keys/gc.c
index f27223ea4578..9ccd8ee6fcdb 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -189,6 +189,7 @@ static void key_garbage_collector(struct work_struct *work)
struct rb_node *cursor;
struct key *key;
time64_t new_timer, limit, expiry;
+ unsigned long flags;
kenter("[%lx,%x]", key_gc_flags, gc_state);
@@ -206,21 +207,35 @@ static void key_garbage_collector(struct work_struct *work)
new_timer = TIME64_MAX;
+ spin_lock_irqsave(&key_graveyard_lock, flags);
+ list_splice_init(&key_graveyard, &graveyard);
+ spin_unlock_irqrestore(&key_graveyard_lock, flags);
+
+ list_for_each_entry(key, &graveyard, graveyard_link) {
+ spin_lock(&key_serial_lock);
+ kdebug("unrefd key %d", key->serial);
+ rb_erase(&key->serial_node, &key_serial_tree);
+ spin_unlock(&key_serial_lock);
+ }
+
/* As only this function is permitted to remove things from the key
* serial tree, if cursor is non-NULL then it will always point to a
* valid node in the tree - even if lock got dropped.
*/
spin_lock(&key_serial_lock);
+ key = NULL;
cursor = rb_first(&key_serial_tree);
continue_scanning:
+ key_put(key);
while (cursor) {
key = rb_entry(cursor, struct key, serial_node);
cursor = rb_next(cursor);
-
- if (test_bit(KEY_FLAG_FINAL_PUT, &key->flags)) {
- smp_mb(); /* Clobber key->user after FINAL_PUT seen. */
- goto found_unreferenced_key;
+ /* key_get(), unless zero: */
+ if (!refcount_inc_not_zero(&key->usage)) {
+ key = NULL;
+ gc_state |= KEY_GC_REAP_AGAIN;
+ goto skip_dead_key;
}
if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) {
@@ -274,6 +289,7 @@ static void key_garbage_collector(struct work_struct *work)
spin_lock(&key_serial_lock);
goto continue_scanning;
}
+ key_put(key);
/* We've completed the pass. Set the timer if we need to and queue a
* new cycle if necessary. We keep executing cycles until we find one
@@ -328,18 +344,6 @@ static void key_garbage_collector(struct work_struct *work)
kleave(" [end %x]", gc_state);
return;
- /* We found an unreferenced key - once we've removed it from the tree,
- * we can safely drop the lock.
- */
-found_unreferenced_key:
- kdebug("unrefd key %d", key->serial);
- rb_erase(&key->serial_node, &key_serial_tree);
- spin_unlock(&key_serial_lock);
-
- list_add_tail(&key->graveyard_link, &graveyard);
- gc_state |= KEY_GC_REAP_AGAIN;
- goto maybe_resched;
-
/* We found a restricted keyring and need to update the restriction if
* it is associated with the dead key type.
*/
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 2cffa6dc8255..4e3d9b322390 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -63,9 +63,12 @@ struct key_user {
int qnbytes; /* number of bytes allocated to this user */
};
+extern struct list_head key_graveyard;
+extern spinlock_t key_graveyard_lock;
+
extern struct rb_root key_user_tree;
extern spinlock_t key_user_lock;
extern struct key_user root_key_user;
extern struct key_user *key_user_lookup(kuid_t uid);
extern void key_user_put(struct key_user *user);
diff --git a/security/keys/key.c b/security/keys/key.c
index 7198cd2ac3a3..7511f2017b6b 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -22,6 +22,8 @@ DEFINE_SPINLOCK(key_serial_lock);
struct rb_root key_user_tree; /* tree of quota records indexed by UID */
DEFINE_SPINLOCK(key_user_lock);
+LIST_HEAD(key_graveyard);
+DEFINE_SPINLOCK(key_graveyard_lock);
unsigned int key_quota_root_maxkeys = 1000000; /* root's key count quota */
unsigned int key_quota_root_maxbytes = 25000000; /* root's key space quota */
@@ -658,8 +660,9 @@ void key_put(struct key *key)
key->user->qnbytes -= key->quotalen;
spin_unlock_irqrestore(&key->user->lock, flags);
}
- smp_mb(); /* key->user before FINAL_PUT set. */
- set_bit(KEY_FLAG_FINAL_PUT, &key->flags);
+ spin_lock_irqsave(&key_graveyard_lock, flags);
+ list_add_tail(&key->graveyard_link, &key_graveyard);
+ spin_unlock_irqrestore(&key_graveyard_lock, flags);
schedule_work(&key_gc_work);
}
}
--
2.39.5
Hi,
Can I report an issue with 6.12 LTS?
This backport in 6.12.23
[805e3ce5e0e32b31dcecc0774c57c17a1f13cef6][1]
also needs this upstream commit:
[22cc5ca5de52bbfc36a7d4a55323f91fb4492264][2]
If it is missing and you don't have XEN enabled, the build fails:
```
arch/x86/coco/tdx/tdx.c:1080:13: error: no member named 'safe_halt' in
'struct pv_irq_ops'
1080 | pv_ops.irq.safe_halt = tdx_safe_halt;
| ~~~~~~~~~~ ^
arch/x86/coco/tdx/tdx.c:1081:13: error: no member named 'halt' in
'struct pv_irq_ops'
1081 | pv_ops.irq.halt = tdx_halt;
| ~~~~~~~~~~ ^
2 errors generated.
make[5]: *** [scripts/Makefile.build:229: arch/x86/coco/tdx/tdx.o] Error 1
make[4]: *** [scripts/Makefile.build:478: arch/x86/coco/tdx] Error 2
make[3]: *** [scripts/Makefile.build:478: arch/x86/coco] Error 2
make[2]: *** [scripts/Makefile.build:478: arch/x86] Error 2
```
To make it work, I have added a backport of
[22cc5ca5de52bbfc36a7d4a55323f91fb4492264][2] as a patch in my local build[3].
Best regards,
Ike
[1]:
https://web.git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit…
[2]:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
[3]:
https://gitlab.com/herecura/packages/linux-bede-lts/-/blob/0d7e313f13fdae9b…
Hello,
This series disables the "serdes_wiz0" and "serdes_wiz1" device-tree
nodes in the J722S SoC file and enables them in the board files where
they are required along with "serdes0" and "serdes1". There are two
reasons behind this change:
1. To follow the existing convention of disabling nodes in the SoC file
and enabling them in the board file as required.
2. To address situations where a board file hasn't explicitly disabled
"serdes_wiz0" and "serdes_wiz1" (example: am67a-beagley-ai.dts),
as a result of which booting the board displays the following errors:
wiz bus@f0000:phy@f000000: probe with driver wiz failed with error -12
...
wiz bus@f0000:phy@f010000: probe with driver wiz failed with error -12
Series is based on linux-next tagged next-20250408.
v1 of this series is at:
https://lore.kernel.org/r/20250408060636.3413856-1-s-vadapalli@ti.com/
Changes since v1:
- Added "Fixes" tag and updated commit message accordingly.
Regards,
Siddharth.
Siddharth Vadapalli (2):
arm64: dts: ti: k3-j722s-evm: Enable "serdes_wiz0" and "serdes_wiz1"
arm64: dts: ti: k3-j722s-main: Disable "serdes_wiz0" and "serdes_wiz1"
arch/arm64/boot/dts/ti/k3-j722s-evm.dts | 8 ++++++++
arch/arm64/boot/dts/ti/k3-j722s-main.dtsi | 4 ++++
2 files changed, 12 insertions(+)
--
2.34.1
The quilt patch titled
Subject: mm: fix apply_to_existing_page_range()
has been removed from the -mm tree. Its filename was
mm-fix-apply_to_existing_page_range.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Subject: mm: fix apply_to_existing_page_range()
Date: Wed, 9 Apr 2025 12:40:43 +0300
In the case of apply_to_existing_page_range(), apply_to_pte_range() is
reached with 'create' set to false. When !create, the loop over the PTE
page table is broken.
apply_to_pte_range() will only move to the next PTE entry if 'create' is
true or if the current entry is not pte_none().
This means that the user of apply_to_existing_page_range() will not have
'fn' called for any entries after the first pte_none() in the PTE page
table.
Fix the loop logic in apply_to_pte_range().
There are no known runtime issues from this, but the fix is trivial enough
for stable@ even without a known buggy user.
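As a minimal illustration of the loop shape being fixed (a user-space
sketch where a plain int array stands in for the PTE page table and zero
entries play the role of pte_none()):
```
#include <stdio.h>

#define N 8

int main(void)
{
	int table[N] = { 1, 0, 3, 0, 5, 6, 0, 8 };	/* 0 ~ pte_none() */
	int *p = table;
	int buggy = 0, fixed = 0;

	/* Buggy shape: the cursor only advances for non-empty entries,
	 * so the walk stalls at the first zero. */
	for (int i = 0; i < N; i++) {
		if (*p) {
			buggy++;
			p++;
		}
	}

	/* Fixed shape: the cursor advances on every iteration. */
	p = table;
	for (int i = 0; i < N; i++, p++) {
		if (*p)
			fixed++;
	}

	printf("buggy visited %d entries, fixed visited %d\n", buggy, fixed);
	return 0;
}
```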
Link: https://lkml.kernel.org/r/20250409094043.1629234-1-kirill.shutemov@linux.in…
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Fixes: be1db4753ee6 ("mm/memory.c: add apply_to_existing_page_range() helper")
Cc: Daniel Axtens <dja(a)axtens.net>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/mm/memory.c~mm-fix-apply_to_existing_page_range
+++ a/mm/memory.c
@@ -2938,11 +2938,11 @@ static int apply_to_pte_range(struct mm_
if (fn) {
do {
if (create || !pte_none(ptep_get(pte))) {
- err = fn(pte++, addr, data);
+ err = fn(pte, addr, data);
if (err)
break;
}
- } while (addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
_
Patches currently in -mm which might be from kirill.shutemov(a)linux.intel.com are
mm-page_alloc-fix-deadlock-on-cpu_hotplug_lock-in-__accept_page.patch
The quilt patch titled
Subject: alloc_tag: handle incomplete bulk allocations in vm_module_tags_populate
has been removed from the -mm tree. Its filename was
alloc_tag-handle-incomplete-bulk-allocations-in-vm_module_tags_populate.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "T.J. Mercier" <tjmercier(a)google.com>
Subject: alloc_tag: handle incomplete bulk allocations in vm_module_tags_populate
Date: Wed, 9 Apr 2025 22:51:11 +0000
alloc_pages_bulk_node() may partially succeed and allocate fewer than the
requested nr_pages. There are several conditions under which this can
occur, but we have encountered the case where CONFIG_PAGE_OWNER is
enabled, causing all bulk allocations to always fall back to single page
allocations
due to commit 187ad460b841 ("mm/page_alloc: avoid page allocator recursion
with pagesets.lock held").
Currently vm_module_tags_populate() immediately fails when
alloc_pages_bulk_node() returns fewer than the requested number of pages.
When this happens memory allocation profiling gets disabled, for example
[ 14.297583] [9: modprobe: 465] Failed to allocate memory for allocation tags in the module scsc_wlan. Memory allocation profiling is disabled!
[ 14.299339] [9: modprobe: 465] modprobe: Failed to insmod '/vendor/lib/modules/scsc_wlan.ko' with args '': Out of memory
This patch causes vm_module_tags_populate() to retry bulk allocations for
the remaining memory instead of failing immediately, which avoids
disabling memory allocation profiling.
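A small sketch of that retry loop (user space, with a hypothetical
try_bulk_alloc() standing in for alloc_pages_bulk_node(); only a return
of zero aborts):
```
#include <stddef.h>
#include <stdio.h>

/* Hypothetical allocator that may satisfy only part of a request. */
static size_t try_bulk_alloc(size_t want)
{
	return want > 4 ? 4 : want;	/* pretend at most 4 pages per call */
}

int main(void)
{
	size_t more_pages = 10, nr = 0;

	while (nr < more_pages) {
		size_t allocated = try_bulk_alloc(more_pages - nr);

		if (!allocated)
			break;		/* genuine failure: give up */
		nr += allocated;
	}

	printf("allocated %zu of %zu pages\n", nr, more_pages);
	return 0;
}
```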
Link: https://lkml.kernel.org/r/20250409225111.3770347-1-tjmercier@google.com
Fixes: 0f9b685626da ("alloc_tag: populate memory for module tags as needed")
Signed-off-by: T.J. Mercier <tjmercier(a)google.com>
Reported-by: Janghyuck Kim <janghyuck.kim(a)samsung.com>
Acked-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: Kent Overstreet <kent.overstreet(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
lib/alloc_tag.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
--- a/lib/alloc_tag.c~alloc_tag-handle-incomplete-bulk-allocations-in-vm_module_tags_populate
+++ a/lib/alloc_tag.c
@@ -422,11 +422,20 @@ static int vm_module_tags_populate(void)
unsigned long old_shadow_end = ALIGN(phys_end, MODULE_ALIGN);
unsigned long new_shadow_end = ALIGN(new_end, MODULE_ALIGN);
unsigned long more_pages;
- unsigned long nr;
+ unsigned long nr = 0;
more_pages = ALIGN(new_end - phys_end, PAGE_SIZE) >> PAGE_SHIFT;
- nr = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN,
- NUMA_NO_NODE, more_pages, next_page);
+ while (nr < more_pages) {
+ unsigned long allocated;
+
+ allocated = alloc_pages_bulk_node(GFP_KERNEL | __GFP_NOWARN,
+ NUMA_NO_NODE, more_pages - nr, next_page + nr);
+
+ if (!allocated)
+ break;
+ nr += allocated;
+ }
+
if (nr < more_pages ||
vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL,
next_page, PAGE_SHIFT) < 0) {
_
Patches currently in -mm which might be from tjmercier(a)google.com are
The quilt patch titled
Subject: mm: fix filemap_get_folios_contig returning batches of identical folios
has been removed from the -mm tree. Its filename was
mm-fix-filemap_get_folios_contig-returning-batches-of-identical-folios.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Vishal Moola (Oracle)" <vishal.moola(a)gmail.com>
Subject: mm: fix filemap_get_folios_contig returning batches of identical folios
Date: Thu, 3 Apr 2025 16:54:17 -0700
filemap_get_folios_contig() is supposed to return distinct folios found
within [start, end]. Large folios in the Xarray become multi-index
entries. xas_next() can iterate through the sub-indexes before finding a
sibling entry and breaking out of the loop.
This can result in a returned folio_batch containing an indeterminate
number of duplicate folios, which forces the callers to skeptically handle
the returned batch. This is inefficient and incurs a large maintenance
overhead.
We can fix this by calling xas_advance() after we have successfully added
a folio to the batch to ensure our Xarray is positioned such that it will
correctly find the next folio - similar to filemap_get_read_batch().
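For illustration only (plain arrays and a hypothetical fake_folio, not
the XArray API): stepping one index at a time reports the same large
folio for every sub-index it covers, while jumping the cursor to the
folio's last index, which is what the xas_advance() call does, reports it
once:
```
#include <stdio.h>

struct fake_folio {
	unsigned long index;	/* first page index covered */
	unsigned long nr;	/* number of pages covered */
};

int main(void)
{
	/* One order-2 folio covering indices 4..7. */
	struct fake_folio folio = { .index = 4, .nr = 4 };
	unsigned long pos;

	/* Naive walk: the folio is reported for every sub-index. */
	for (pos = 4; pos < 8; pos++)
		printf("naive: index %lu -> folio at %lu\n", pos, folio.index);

	/* Fixed walk: report once, then skip to the folio's last index. */
	for (pos = 4; pos < 8; pos++) {
		printf("fixed: index %lu -> folio at %lu\n", pos, folio.index);
		pos = folio.index + folio.nr - 1;	/* like xas_advance() */
	}
	return 0;
}
```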
Link: https://lkml.kernel.org/r/Z-8s1-kiIDkzgRbc@fedora
Fixes: 35b471467f88 ("filemap: add filemap_get_folios_contig()")
Signed-off-by: Vishal Moola (Oracle) <vishal.moola(a)gmail.com>
Reported-by: Qu Wenruo <quwenruo.btrfs(a)gmx.com>
Closes: https://lkml.kernel.org/r/b714e4de-2583-4035-b829-72cfb5eb6fc6@gmx.com
Tested-by: Qu Wenruo <quwenruo.btrfs(a)gmx.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Vivek Kasireddy <vivek.kasireddy(a)intel.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/filemap.c | 1 +
1 file changed, 1 insertion(+)
--- a/mm/filemap.c~mm-fix-filemap_get_folios_contig-returning-batches-of-identical-folios
+++ a/mm/filemap.c
@@ -2244,6 +2244,7 @@ unsigned filemap_get_folios_contig(struc
*start = folio->index + nr;
goto out;
}
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
_
Patches currently in -mm which might be from vishal.moola(a)gmail.com are
mm-compaction-use-folio-in-hugetlb-pathway.patch
The quilt patch titled
Subject: mm: page_alloc: speed up fallbacks in rmqueue_bulk()
has been removed from the -mm tree. Its filename was
mm-page_alloc-speed-up-fallbacks-in-rmqueue_bulk.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Johannes Weiner <hannes(a)cmpxchg.org>
Subject: mm: page_alloc: speed up fallbacks in rmqueue_bulk()
Date: Mon, 7 Apr 2025 14:01:53 -0400
The test robot identified c2f6ea38fc1b ("mm: page_alloc: don't steal
single pages from biggest buddy") as the root cause of a 56.4% regression
in vm-scalability::lru-file-mmap-read.
Carlos reports an earlier patch, c0cd6f557b90 ("mm: page_alloc: fix
freelist movement during block conversion"), as the root cause for a
regression in worst-case zone->lock+irqoff hold times.
Both of these patches modify the page allocator's fallback path to be less
greedy in an effort to stave off fragmentation. The flip side of this is
that fallbacks are also less productive each time around, which means the
fallback search can run much more frequently.
Carlos' traces point to rmqueue_bulk() specifically, which tries to refill
the percpu cache by allocating a large batch of pages in a loop. It
highlights how once the native freelists are exhausted, the fallback code
first scans orders top-down for whole blocks to claim, then falls back to
a bottom-up search for the smallest buddy to steal. For the next batch
page, it goes through the same thing again.
This can be made more efficient. Since rmqueue_bulk() holds the
zone->lock over the entire batch, the freelists are not subject to outside
changes; when the search for a block to claim has already failed, there is
no point in trying again for the next page.
Modify __rmqueue() to remember the last successful fallback mode, and
restart directly from there on the next rmqueue_bulk() iteration.
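A simplified sketch of the remember-the-mode idea (user space, with a
hypothetical sources[] standing in for the freelists; the real
__rmqueue() additionally resets to normal mode after a successful claim,
since that refills the preferred freelist):
```
#include <stdio.h>

enum rmqueue_mode { MODE_NORMAL, MODE_CLAIM, MODE_STEAL };

static int sources[3] = { 1, 0, 5 };	/* items available per level */

/* Take one item, starting from the last level that worked. */
static int take_one(enum rmqueue_mode *mode)
{
	switch (*mode) {
	case MODE_NORMAL:
		if (sources[MODE_NORMAL] > 0) {
			sources[MODE_NORMAL]--;
			return 1;
		}
		/* fall through */
	case MODE_CLAIM:
		if (sources[MODE_CLAIM] > 0) {
			sources[MODE_CLAIM]--;
			*mode = MODE_CLAIM;	/* remember for the next call */
			return 1;
		}
		/* fall through */
	case MODE_STEAL:
		if (sources[MODE_STEAL] > 0) {
			sources[MODE_STEAL]--;
			*mode = MODE_STEAL;
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	enum rmqueue_mode mode = MODE_NORMAL;
	int got = 0;

	/* A batch of 5: once STEAL succeeds, later calls skip the
	 * earlier, already-exhausted levels. */
	for (int i = 0; i < 5; i++)
		got += take_one(&mode);

	printf("got %d items, final mode %d\n", got, (int)mode);
	return 0;
}
```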
Oliver confirms that this improves beyond the regression that the test
robot reported against c2f6ea38fc1b:
commit:
f3b92176f4 ("tools/selftests: add guard region test for /proc/$pid/pagemap")
c2f6ea38fc ("mm: page_alloc: don't steal single pages from biggest buddy")
acc4d5ff0b ("Merge tag 'net-6.15-rc0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net")
2c847f27c3 ("mm: page_alloc: speed up fallbacks in rmqueue_bulk()") <--- your patch
f3b92176f4f7100f c2f6ea38fc1b640aa7a2e155cc1 acc4d5ff0b61eb1715c498b6536 2c847f27c37da65a93d23c237c5
---------------- --------------------------- --------------------------- ---------------------------
         %stddev     %change     %stddev     %change     %stddev     %change     %stddev
             \          |           \           |           \           |           \
  25525364 ± 3%      -56.4%    11135467      -57.8%    10779336      +31.6%    33581409    vm-scalability.throughput
Carlos confirms that worst-case times are almost fully recovered
compared to before the earlier culprit patch:
2dd482ba627d (before freelist hygiene): 1ms
c0cd6f557b90 (after freelist hygiene): 90ms
next-20250319 (steal smallest buddy): 280ms
this patch : 8ms
[jackmanb(a)google.com: comment updates]
Link: https://lkml.kernel.org/r/D92AC0P9594X.3BML64MUKTF8Z@google.com
[hannes(a)cmpxchg.org: reset rmqueue_mode in rmqueue_buddy() error loop, per Yunsheng Lin]
Link: https://lkml.kernel.org/r/20250409140023.GA2313@cmpxchg.org
Link: https://lkml.kernel.org/r/20250407180154.63348-1-hannes@cmpxchg.org
Fixes: c0cd6f557b90 ("mm: page_alloc: fix freelist movement during block conversion")
Fixes: c2f6ea38fc1b ("mm: page_alloc: don't steal single pages from biggest buddy")
Signed-off-by: Johannes Weiner <hannes(a)cmpxchg.org>
Signed-off-by: Brendan Jackman <jackmanb(a)google.com>
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Reported-by: Carlos Song <carlos.song(a)nxp.com>
Tested-by: Carlos Song <carlos.song(a)nxp.com>
Tested-by: kernel test robot <oliver.sang(a)intel.com>
Closes: https://lore.kernel.org/oe-lkp/202503271547.fc08b188-lkp@intel.com
Reviewed-by: Brendan Jackman <jackmanb(a)google.com>
Tested-by: Shivank Garg <shivankg(a)amd.com>
Acked-by: Zi Yan <ziy(a)nvidia.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org> [6.10+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 113 ++++++++++++++++++++++++++++++++--------------
1 file changed, 79 insertions(+), 34 deletions(-)
--- a/mm/page_alloc.c~mm-page_alloc-speed-up-fallbacks-in-rmqueue_bulk
+++ a/mm/page_alloc.c
@@ -2183,23 +2183,15 @@ try_to_claim_block(struct zone *zone, st
}
/*
- * Try finding a free buddy page on the fallback list.
- *
- * This will attempt to claim a whole pageblock for the requested type
- * to ensure grouping of such requests in the future.
- *
- * If a whole block cannot be claimed, steal an individual page, regressing to
- * __rmqueue_smallest() logic to at least break up as little contiguity as
- * possible.
+ * Try to allocate from some fallback migratetype by claiming the entire block,
+ * i.e. converting it to the allocation's start migratetype.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
- *
- * Return the stolen page, or NULL if none can be found.
*/
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2237,14 +2229,29 @@ __rmqueue_fallback(struct zone *zone, in
page = try_to_claim_block(zone, page, current_order, order,
start_migratetype, fallback_mt,
alloc_flags);
- if (page)
- goto got_one;
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
- if (alloc_flags & ALLOC_NOFRAGMENT)
- return NULL;
+ return NULL;
+}
+
+/*
+ * Try to steal a single page from some fallback migratetype. Leave the rest of
+ * the block as its current migratetype, potentially causing fragmentation.
+ */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
+ bool claim_block;
- /* No luck claiming pageblock. Find the smallest fallback page */
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2254,25 +2261,28 @@ __rmqueue_fallback(struct zone *zone, in
page = get_page_from_free_area(area, fallback_mt);
page_del_and_expand(zone, page, order, current_order, fallback_mt);
- goto got_one;
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
}
return NULL;
-
-got_one:
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
-
- return page;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2291,16 +2301,48 @@ __rmqueue(struct zone *zone, unsigned in
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * First try the freelists of the requested migratetype, then try
+ * fallbacks modes with increasing levels of fragmentation risk.
+ *
+ * The fallback logic is expensive and rmqueue_bulk() calls in
+ * a loop with the zone->lock held, meaning the freelists are
+ * not subject to any outside changes. Remember in *mode where
+ * we found pay dirt, to save us the search on the next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished preferred freelist, back to normal mode. */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+ return NULL;
}
/*
@@ -2312,6 +2354,7 @@ static int rmqueue_bulk(struct zone *zon
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
@@ -2323,7 +2366,7 @@ static int rmqueue_bulk(struct zone *zon
}
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2948,7 +2991,9 @@ struct page *rmqueue_buddy(struct zone *
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
+
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
* If the allocation fails, allow OOM handling and
_
Patches currently in -mm which might be from hannes(a)cmpxchg.org are
mm-page_alloc-tighten-up-find_suitable_fallback.patch
The quilt patch titled
Subject: mm/vma: add give_up_on_oom option on modify/merge, use in uffd release
has been removed from the -mm tree. Its filename was
mm-vma-add-give_up_on_oom-option-on-modify-merge-use-in-uffd-release.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Subject: mm/vma: add give_up_on_oom option on modify/merge, use in uffd release
Date: Fri, 21 Mar 2025 10:09:37 +0000
Currently, if a VMA merge fails due to an OOM condition arising on commit
merge or a failure to duplicate anon_vma's, we report this so the caller
can handle it.
However there are cases where the caller is only ostensibly trying a
merge, and doesn't mind if it fails due to this condition.
Since we do not want to introduce an implicit assumption that we only
actually modify VMAs after OOM conditions might arise, add a 'give up on
oom' option and make an explicit contract that, should this flag be set, we
absolutely will not modify any VMAs should OOM arise and just bail out.
Since it'd be very unusual for a user to try to vma_modify() with this flag
set but be specifying a range within a VMA which ends up being split (which
can fail due to rlimit issues, not only OOM), we add a debug warning for
this condition.
The motivating reason for this is uffd release - syzkaller (and Pedro
Falcato's VERY astute analysis) found a way in which an injected fault on
allocation, triggering an OOM condition on commit merge, would result in
uffd code becoming confused and treating an error value as if it were a VMA
pointer.
To avoid this, we make use of this new VMG flag to ensure that this never
occurs, utilising the fact that, should we be clearing entire VMAs, we do
not wish an OOM event to be reported to us.
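A loose sketch of that failure mode (a user-space re-creation of the
ERR_PTR()/IS_ERR() encoding with a hypothetical try_merge(); with the
opt-in flag, an OOM is reported as "nothing merged" rather than as an
encoded error that a NULL-checking caller would mistake for a valid
pointer):
```
#include <stdbool.h>
#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

struct vma_like { unsigned long start, end; };

/* With give_up_on_oom, an OOM during the merge attempt is reported as
 * "no merge happened" (NULL) rather than as an encoded error pointer. */
static struct vma_like *try_merge(bool give_up_on_oom)
{
	bool oom = true;	/* pretend the preallocation failed */

	if (oom)
		return give_up_on_oom ? NULL : ERR_PTR(-12 /* -ENOMEM */);
	return NULL;
}

int main(void)
{
	struct vma_like *v = try_merge(false);

	/* A caller that only checks for NULL would dereference this value. */
	printf("no opt-in: NULL=%d IS_ERR=%d\n", v == NULL, (int)IS_ERR(v));

	v = try_merge(true);
	printf("opt-in:    NULL=%d IS_ERR=%d\n", v == NULL, (int)IS_ERR(v));
	return 0;
}
```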
Many thanks to Pedro Falcato for his excellent analysis and Jann Horn for
his insightful and intelligent analysis of the situation, both of whom were
instrumental in this fix.
Link: https://lkml.kernel.org/r/20250321100937.46634-1-lorenzo.stoakes@oracle.com
Reported-by: syzbot+20ed41006cf9d842c2b5(a)syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/67dc67f0.050a0220.25ae54.001e.GAE@google.com/
Fixes: 47b16d0462a4 ("mm: abort vma_modify() on merge out of memory failure")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Suggested-by: Pedro Falcato <pfalcato(a)suse.de>
Suggested-by: Jann Horn <jannh(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/userfaultfd.c | 13 +++++++++--
mm/vma.c | 51 +++++++++++++++++++++++++++++++++++++++++----
mm/vma.h | 9 +++++++
3 files changed, 66 insertions(+), 7 deletions(-)
--- a/mm/userfaultfd.c~mm-vma-add-give_up_on_oom-option-on-modify-merge-use-in-uffd-release
+++ a/mm/userfaultfd.c
@@ -1902,6 +1902,14 @@ struct vm_area_struct *userfaultfd_clear
unsigned long end)
{
struct vm_area_struct *ret;
+ bool give_up_on_oom = false;
+
+ /*
+ * If we are modifying only and not splitting, just give up on the merge
+ * if OOM prevents us from merging successfully.
+ */
+ if (start == vma->vm_start && end == vma->vm_end)
+ give_up_on_oom = true;
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
@@ -1909,7 +1917,7 @@ struct vm_area_struct *userfaultfd_clear
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
vma->vm_flags & ~__VM_UFFD_FLAGS,
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, give_up_on_oom);
/*
* In the vma_merge() successful mprotect-like case 8:
@@ -1960,7 +1968,8 @@ int userfaultfd_register_range(struct us
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
new_flags,
- (struct vm_userfaultfd_ctx){ctx});
+ (struct vm_userfaultfd_ctx){ctx},
+ /* give_up_on_oom = */false);
if (IS_ERR(vma))
return PTR_ERR(vma);
--- a/mm/vma.c~mm-vma-add-give_up_on_oom-option-on-modify-merge-use-in-uffd-release
+++ a/mm/vma.c
@@ -666,6 +666,9 @@ static void vmg_adjust_set_range(struct
/*
* Actually perform the VMA merge operation.
*
+ * IMPORTANT: If vmg->give_up_on_oom is set, we guarantee not to modify any
+ * VMAs or cause inconsistent state should an OOM condition arise.
+ *
* Returns 0 on success, or an error value on failure.
*/
static int commit_merge(struct vma_merge_struct *vmg)
@@ -685,6 +688,12 @@ static int commit_merge(struct vma_merge
init_multi_vma_prep(&vp, vma, vmg);
+ /*
+ * If vmg->give_up_on_oom is set, we're safe, because we don't actually
+ * manipulate any VMAs until we succeed at preallocation.
+ *
+ * Past this point, we will not return an error.
+ */
if (vma_iter_prealloc(vmg->vmi, vma))
return -ENOMEM;
@@ -915,7 +924,13 @@ static __must_check struct vm_area_struc
if (anon_dup)
unlink_anon_vmas(anon_dup);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+ /*
+ * We've cleaned up any cloned anon_vma's, no VMAs have been
+ * modified, no harm no foul if the user requests that we not
+ * report this and just give up, leaving the VMAs unmerged.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -926,7 +941,15 @@ static __must_check struct vm_area_struc
abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+
+ /*
+ * This means we have failed to clone anon_vma's correctly, but no
+ * actual changes to VMAs have occurred, so no harm no foul - if the
+ * user doesn't want this reported and instead just wants to give up on
+ * the merge, allow it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -1068,6 +1091,10 @@ int vma_expand(struct vma_merge_struct *
/* This should already have been checked by this point. */
VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
vma_start_write(next);
+ /*
+ * In this case we don't report OOM, so vmg->give_up_on_oom is
+ * safe.
+ */
ret = dup_anon_vma(middle, next, &anon_dup);
if (ret)
return ret;
@@ -1090,9 +1117,15 @@ int vma_expand(struct vma_merge_struct *
return 0;
nomem:
- vmg->state = VMA_MERGE_ERROR_NOMEM;
if (anon_dup)
unlink_anon_vmas(anon_dup);
+ /*
+ * If the user requests that we just give up on OOM, we are safe to do so
+ * here, as commit merge provides this contract to us. Nothing has been
+ * changed - no harm no foul, just don't report it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return -ENOMEM;
}
@@ -1534,6 +1567,13 @@ static struct vm_area_struct *vma_modify
if (vmg_nomem(vmg))
return ERR_PTR(-ENOMEM);
+ /*
+ * Split can fail for reasons other than OOM, so if the user requests
+ * this it's probably a mistake.
+ */
+ VM_WARN_ON(vmg->give_up_on_oom &&
+ (vma->vm_start != start || vma->vm_end != end));
+
/* Split any preceding portion of the VMA. */
if (vma->vm_start < start) {
int err = split_vma(vmg->vmi, vma, start, 1);
@@ -1602,12 +1642,15 @@ struct vm_area_struct
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
vmg.flags = new_flags;
vmg.uffd_ctx = new_ctx;
+ if (give_up_on_oom)
+ vmg.give_up_on_oom = true;
return vma_modify(&vmg);
}
--- a/mm/vma.h~mm-vma-add-give_up_on_oom-option-on-modify-merge-use-in-uffd-release
+++ a/mm/vma.h
@@ -114,6 +114,12 @@ struct vma_merge_struct {
*/
bool just_expand :1;
+ /*
+ * If a merge is possible, but an OOM error occurs, give up and don't
+ * execute the merge, returning NULL.
+ */
+ bool give_up_on_oom :1;
+
/* Internal flags set during merge process: */
/*
@@ -255,7 +261,8 @@ __must_check struct vm_area_struct
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx);
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom);
__must_check struct vm_area_struct
*vma_merge_new_range(struct vma_merge_struct *vmg);
_
Patches currently in -mm which might be from lorenzo.stoakes(a)oracle.com are
maintainers-add-memory-advice-section.patch
mm-vma-fix-incorrectly-disallowed-anonymous-vma-merges.patch
tools-testing-add-procmap_query-helper-functions-in-mm-self-tests.patch
tools-testing-selftests-assert-that-anon-merge-cases-behave-as-expected.patch