The patch titled
Subject: mm/page_alloc: fix race condition in unaccepted memory handling
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Subject: mm/page_alloc: fix race condition in unaccepted memory handling
Date: Tue, 6 May 2025 16:32:07 +0300
The page allocator tracks the number of zones that have unaccepted memory
using static_branch_inc/dec() and uses that static branch in hot paths to
determine if it needs to deal with unaccepted memory.
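For illustration, the pattern being removed looks roughly like this (a
simplified sketch reconstructed from the diff below; the inc/dec call
sites are condensed here):

    /* Counts number of zones with unaccepted pages. */
    static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);

    /* First unaccepted page queued on a zone: enable the hot-path test. */
    static_branch_inc(&zones_with_unaccepted_pages);

    /* Last unaccepted page drained from a zone: disable it again. */
    static_branch_dec(&zones_with_unaccepted_pages);

    /* Hot path: a patched-in branch instead of touching the zone. */
    static inline bool has_unaccepted_memory(void)
    {
        return static_branch_unlikely(&zones_with_unaccepted_pages);
    }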
Borislav and Thomas pointed out that the tracking is racy: operations on
static_branch are not serialized against adding/removing unaccepted pages
to/from the zone.
Sanity checks inside the static_branch machinery detect it:
WARNING: CPU: 0 PID: 10 at kernel/jump_label.c:276 __static_key_slow_dec_cpuslocked+0x8e/0xa0
The comment around the WARN() explains the problem:
/*
* Warn about the '-1' case though; since that means a
* decrement is concurrent with a first (0->1) increment. IOW
* people are trying to disable something that wasn't yet fully
* enabled. This suggests an ordering problem on the user side.
*/
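Concretely, the list_empty() check that decides whether to flip the
static branch happens under zone->lock, but the static_branch update
itself does not, so the two can interleave. A simplified reconstruction
from the code removed below (the workqueue indirection around the
decrement is omitted):

    CPU 0: __free_unaccepted()          CPU 1: __accept_page()
    --------------------------          ----------------------
    spin_lock(&zone->lock)
    first = list_empty()   /* true */
    list_add_tail(page)
    spin_unlock(&zone->lock)
                                        spin_lock(&zone->lock)
                                        list_del(page)
                                        last = list_empty()  /* true */
                                        spin_unlock(&zone->lock)
                                        static_branch_dec()  /* 0 -> -1 */
    static_branch_inc()    /* too late */

The decrement runs before the first increment completes, which is
exactly the ordering problem the WARN() above describes.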
The effect of this static_branch optimization is only visible in
microbenchmarks.
Instead of adding more complexity around it, remove it altogether.
Link: https://lkml.kernel.org/r/20250506133207.1009676-1-kirill.shutemov@linux.in…
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Link: https://lore.kernel.org/all/20250506092445.GBaBnVXXyvnazly6iF@fat_crate.loc…
Reported-by: Borislav Petkov <bp(a)alien8.de>
Tested-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Reported-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Brendan Jackman <jackmanb(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: <stable(a)vger.kernel.org> [6.5+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/internal.h | 1
mm/mm_init.c | 1
mm/page_alloc.c | 47 ----------------------------------------------
3 files changed, 49 deletions(-)
--- a/mm/internal.h~mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling
+++ a/mm/internal.h
@@ -1590,7 +1590,6 @@ unsigned long move_page_tables(struct pa
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
-void unaccepted_cleanup_work(struct work_struct *work);
#else /* CONFIG_UNACCEPTED_MEMORY */
static inline void accept_page(struct page *page)
{
--- a/mm/mm_init.c~mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling
+++ a/mm/mm_init.c
@@ -1441,7 +1441,6 @@ static void __meminit zone_init_free_lis
#ifdef CONFIG_UNACCEPTED_MEMORY
INIT_LIST_HEAD(&zone->unaccepted_pages);
- INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work);
#endif
}
--- a/mm/page_alloc.c~mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling
+++ a/mm/page_alloc.c
@@ -7180,16 +7180,8 @@ bool has_managed_dma(void)
#ifdef CONFIG_UNACCEPTED_MEMORY
-/* Counts number of zones with unaccepted pages. */
-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
-
static bool lazy_accept = true;
-void unaccepted_cleanup_work(struct work_struct *work)
-{
- static_branch_dec(&zones_with_unaccepted_pages);
-}
-
static int __init accept_memory_parse(char *p)
{
if (!strcmp(p, "lazy")) {
@@ -7214,11 +7206,7 @@ static bool page_contains_unaccepted(str
static void __accept_page(struct zone *zone, unsigned long *flags,
struct page *page)
{
- bool last;
-
list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
-
account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
__ClearPageUnaccepted(page);
@@ -7227,28 +7215,6 @@ static void __accept_page(struct zone *z
accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
-
- if (last) {
- /*
- * There are two corner cases:
- *
- * - If allocation occurs during the CPU bring up,
- * static_branch_dec() cannot be used directly as
- * it causes a deadlock on cpu_hotplug_lock.
- *
- * Instead, use schedule_work() to prevent deadlock.
- *
- * - If allocation occurs before workqueues are initialized,
- * static_branch_dec() should be called directly.
- *
- * Workqueues are initialized before CPU bring up, so this
- * will not conflict with the first scenario.
- */
- if (system_wq)
- schedule_work(&zone->unaccepted_cleanup);
- else
- unaccepted_cleanup_work(&zone->unaccepted_cleanup);
- }
}
void accept_page(struct page *page)
@@ -7285,20 +7251,12 @@ static bool try_to_accept_memory_one(str
return true;
}
-static inline bool has_unaccepted_memory(void)
-{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
static bool cond_accept_memory(struct zone *zone, unsigned int order,
int alloc_flags)
{
long to_accept, wmark;
bool ret = false;
- if (!has_unaccepted_memory())
- return false;
-
if (list_empty(&zone->unaccepted_pages))
return false;
@@ -7336,22 +7294,17 @@ static bool __free_unaccepted(struct pag
{
struct zone *zone = page_zone(page);
unsigned long flags;
- bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
- first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
__SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
- if (first)
- static_branch_inc(&zones_with_unaccepted_pages);
-
return true;
}
_
Patches currently in -mm which might be from kirill.shutemov(a)linux.intel.com are
mm-page_alloc-ensure-try_alloc_pages-plays-well-with-unaccepted-memory.patch
mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch
After a recent change [1] in clang's randstruct implementation to
randomize structures that only contain function pointers, there is an
error because qede_ll_ops gets randomized but does not use a designated
initializer for the first member:
drivers/net/ethernet/qlogic/qede/qede_main.c:206:2: error: a randomized struct can only be initialized with a designated initializer
206 | {
| ^
Explicitly initialize the common member using a designated initializer
to fix the build.
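For illustration, a minimal sketch of the difference (these are not the
real qed/qede types):

    struct common_ops {
        void (*setup)(void);
    };

    struct eth_cb_ops {
        struct common_ops common;   /* first member */
        void (*link_update)(void);
    };

    static void my_setup(void) { }

    /*
     * Under randstruct the member order is shuffled at build time, so a
     * positional initializer such as
     *
     *     static struct eth_cb_ops ops = { { .setup = my_setup } };
     *
     * no longer matches a known layout and clang rejects it.  Naming
     * the member is layout-independent:
     */
    static struct eth_cb_ops ops = {
        .common = {
            .setup = my_setup,
        },
    };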
Cc: stable(a)vger.kernel.org
Fixes: 035f7f87b729 ("randstruct: Enable Clang support")
Link: https://github.com/llvm/llvm-project/commit/04364fb888eea6db9811510607bed4b… [1]
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 99df00c30b8c..b5d744d2586f 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -203,7 +203,7 @@ static struct pci_driver qede_pci_driver = {
};
static struct qed_eth_cb_ops qede_ll_ops = {
- {
+ .common = {
#ifdef CONFIG_RFS_ACCEL
.arfs_filter_op = qede_arfs_filter_op,
#endif
---
base-commit: 9540984da649d46f699c47f28c68bbd3c9d99e4c
change-id: 20250507-qede-fix-clang-randstruct-011a3119f5d6
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
On 07/05/2025 16:45, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> spi: tegra114: Don't fail set_cs_timing when delays are zero
>
> to the 5.15-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> spi-tegra114-don-t-fail-set_cs_timing-when-delays-ar.patch
> and it can be found in the queue-5.15 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
Please drop this from the stable queue because another fix is pending
to correct this one.
Jon
Commit 7ab4f0e37a0f ("ACPI PPTT: Fix coding mistakes in a couple of
sizeof() calls") corrects the processor entry size but unmasks a
longer-standing bug where the last entry in the structure can get
skipped due to an off-by-one mistake if that entry ends exactly at the
end of the ACPI subtable.
The error manifests for instance on EC2 Graviton Metal instances with
ACPI PPTT: PPTT table found, but unable to locate core 63 (63)
[...]
ACPI: SPE must be homogeneous
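A standalone demonstration of the boundary condition (illustrative
sizes, not real ACPI values):

    #include <stdio.h>

    int main(void)
    {
        unsigned long table_end = 4096;
        unsigned long proc_sz = 20;
        /* The last entry ends exactly at the end of the subtable. */
        unsigned long entry = table_end - proc_sz;

        /* Old check: skips a fully in-bounds final entry. */
        printf("<  : %s\n", entry + proc_sz < table_end ? "visit" : "skip");
        /* Fixed check: visits it. */
        printf("<= : %s\n", entry + proc_sz <= table_end ? "visit" : "skip");
        return 0;
    }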
Fixes: 2bd00bcd73e5 ("ACPI/PPTT: Add Processor Properties Topology Table parsing")
Cc: stable(a)vger.kernel.org
Signed-off-by: Maximilian Heyne <mheyne(a)amazon.de>
---
drivers/acpi/pptt.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c
index f73ce6e13065d..4364da90902e5 100644
--- a/drivers/acpi/pptt.c
+++ b/drivers/acpi/pptt.c
@@ -231,7 +231,7 @@ static int acpi_pptt_leaf_node(struct acpi_table_header *table_hdr,
sizeof(struct acpi_table_pptt));
proc_sz = sizeof(struct acpi_pptt_processor);
- while ((unsigned long)entry + proc_sz < table_end) {
+ while ((unsigned long)entry + proc_sz <= table_end) {
cpu_node = (struct acpi_pptt_processor *)entry;
if (entry->type == ACPI_PPTT_TYPE_PROCESSOR &&
cpu_node->parent == node_entry)
@@ -273,7 +273,7 @@ static struct acpi_pptt_processor *acpi_find_processor_node(struct acpi_table_he
proc_sz = sizeof(struct acpi_pptt_processor);
/* find the processor structure associated with this cpuid */
- while ((unsigned long)entry + proc_sz < table_end) {
+ while ((unsigned long)entry + proc_sz <= table_end) {
cpu_node = (struct acpi_pptt_processor *)entry;
if (entry->length == 0) {
--
2.47.1
On 07/05/2025 16:43, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> spi: tegra114: Don't fail set_cs_timing when delays are zero
>
> to the 6.1-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> spi-tegra114-don-t-fail-set_cs_timing-when-delays-ar.patch
> and it can be found in the queue-6.1 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
Please don't queue this up for stable yet. This fix is not correct,
and another change is pending to correct it.
Jon
From: Chuck Lever <chuck.lever(a)oracle.com>
RFC 7862 states that if an NFS server implements a CLONE operation,
it MUST also implement FATTR4_CLONE_BLKSIZE. NFSD implements CLONE,
but does not implement FATTR4_CLONE_BLKSIZE.
Note that in Section 12.2, RFC 7862 claims that
FATTR4_CLONE_BLKSIZE is RECOMMENDED, not REQUIRED. Likely this is
because a minor version is not permitted to add a REQUIRED
attribute. Confusing.
We assume this attribute reports a block size as a count of bytes,
as RFC 7862 does not specify a unit.
Reported-by: Roland Mainz <roland.mainz(a)nrubsig.org>
Suggested-by: Christoph Hellwig <hch(a)infradead.org>
Reviewed-by: Roland Mainz <roland.mainz(a)nrubsig.org>
Cc: stable(a)vger.kernel.org # v6.7+
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
---
fs/nfsd/nfs4xdr.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e67420729ecd..9eb8e5704622 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3391,6 +3391,23 @@ static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr,
return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
}
+/*
+ * Copied from generic_remap_checks/generic_remap_file_range_prep.
+ *
+ * These generic functions use the file system's s_blocksize, but
+ * individual file systems aren't required to use
+ * generic_remap_file_range_prep. Until there is a mechanism for
+ * determining a particular file system's (or file's) clone block
+ * size, this is the best NFSD can do.
+ */
+static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct inode *inode = d_inode(args->dentry);
+
+ return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize);
+}
+
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr,
const struct nfsd4_fattr_args *args)
@@ -3545,7 +3562,7 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
[FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop,
[FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat,
[FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop,
- [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize,
[FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop,
[FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop,
--
2.49.0