The quilt patch titled
Subject: watchdog: fix watchdog may detect false positive of softlockup
has been removed from the -mm tree. Its filename was
watchdog-fix-watchdog-may-detect-false-positive-of-softlockup.patch
This patch was dropped because it was merged into the mm-nonmm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Luo Gengkun <luogengkun(a)huaweicloud.com>
Subject: watchdog: fix watchdog may detect false positive of softlockup
Date: Mon, 21 Apr 2025 03:50:21 +0000
When updating `watchdog_thresh`, there is a race condition between writing
the new `watchdog_thresh` value and stopping the old watchdog timer. If
the old timer triggers during this window, it may falsely detect a
softlockup due to the old interval and the new `watchdog_thresh` value
being used. The problem can be described as follows:
# We assume the previous watchdog_thresh is 60, so the watchdog timer
# fires every 24s.
echo 10 > /proc/sys/kernel/watchdog_thresh (User space)
|
+------>+ update watchdog_thresh (We are in kernel now)
|
| # using old interval and new `watchdog_thresh`
+------>+ watchdog hrtimer (irq context: detect softlockup)
|
|
+-------+
|
|
+ softlockup_stop_all
To fix this problem, introduce a shadow variable for `watchdog_thresh`.
The update to the actual `watchdog_thresh` is delayed until after the old
timer is stopped, preventing false positives.
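To see why the ordering matters, here is a minimal userspace sketch of the
shadow-variable pattern (illustrative only, not the kernel code; the
stop/start helpers simulate softlockup_stop_all()/softlockup_start_all(),
and the 24s figure follows from sample_period = watchdog_thresh * 2 / 5):
---------------------------------------------
#include <stdbool.h>
#include <stdio.h>

/* Live value consulted by the (simulated) watchdog timer. */
static int watchdog_thresh = 60;
/* Shadow value written by the (simulated) sysctl handler. */
static int watchdog_thresh_next = 10;

static void stop_timers(void)  { puts("old timers stopped"); }
static void start_timers(void) { puts("new timers started"); }

static void reconfigure(bool thresh_changed)
{
	stop_timers();
	/*
	 * Only now is it safe to publish the new threshold: no timer
	 * armed with the old 24s interval (60 * 2 / 5) can still fire
	 * and compare against the new, smaller threshold.
	 */
	if (thresh_changed)
		watchdog_thresh = watchdog_thresh_next;
	start_timers();
}

int main(void)
{
	reconfigure(true);
	printf("watchdog_thresh = %d\n", watchdog_thresh);
	return 0;
}
---------------------------------------------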
The following testcase may help to understand this problem.
---------------------------------------------
echo RT_RUNTIME_SHARE > /sys/kernel/debug/sched/features
echo -1 > /proc/sys/kernel/sched_rt_runtime_us
echo 0 > /sys/kernel/debug/sched/fair_server/cpu3/runtime
echo 60 > /proc/sys/kernel/watchdog_thresh
taskset -c 3 chrt -r 99 /bin/bash -c "while true;do true; done" &
echo 10 > /proc/sys/kernel/watchdog_thresh &
---------------------------------------------
The test case above first removes the throttling restrictions for
real-time tasks. It then sets watchdog_thresh to 60 and runs a
real-time task, a simple while(1) loop, on cpu3. Consequently, the final
command blocks because the presence of this real-time thread prevents
kworker:3 from being selected by the scheduler. This eventually
triggers a softlockup detection on cpu3, because watchdog_timer_fn
operates with inconsistent values, using both the old interval and the
updated watchdog_thresh simultaneously.
[nysal(a)linux.ibm.com: fix the SOFTLOCKUP_DETECTOR=n case]
Link: https://lkml.kernel.org/r/20250502111120.282690-1-nysal@linux.ibm.com
Link: https://lkml.kernel.org/r/20250421035021.3507649-1-luogengkun@huaweicloud.c…
Signed-off-by: Luo Gengkun <luogengkun(a)huaweicloud.com>
Signed-off-by: Nysal Jan K.A. <nysal(a)linux.ibm.com>
Cc: Doug Anderson <dianders(a)chromium.org>
Cc: Joel Granados <joel.granados(a)kernel.org>
Cc: Song Liu <song(a)kernel.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: "Nysal Jan K.A." <nysal(a)linux.ibm.com>
Cc: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/watchdog.c | 41 +++++++++++++++++++++++++++--------------
1 file changed, 27 insertions(+), 14 deletions(-)
--- a/kernel/watchdog.c~watchdog-fix-watchdog-may-detect-false-positive-of-softlockup
+++ a/kernel/watchdog.c
@@ -47,6 +47,7 @@ int __read_mostly watchdog_user_enabled
static int __read_mostly watchdog_hardlockup_user_enabled = WATCHDOG_HARDLOCKUP_DEFAULT;
static int __read_mostly watchdog_softlockup_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_thresh_next;
static int __read_mostly watchdog_hardlockup_available;
struct cpumask watchdog_cpumask __read_mostly;
@@ -870,12 +871,20 @@ int lockup_detector_offline_cpu(unsigned
return 0;
}
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
softlockup_stop_all();
+ /*
+ * To prevent watchdog_timer_fn from using the old interval and
+ * the new watchdog_thresh at the same time, which could lead to
+ * false softlockup reports, it is necessary to update the
+ * watchdog_thresh after the softlockup is completed.
+ */
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
set_sample_period();
lockup_detector_update_enable();
if (watchdog_enabled && watchdog_thresh)
@@ -888,7 +897,7 @@ static void __lockup_detector_reconfigur
void lockup_detector_reconfigure(void)
{
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
mutex_unlock(&watchdog_mutex);
}
@@ -908,27 +917,29 @@ static __init void lockup_detector_setup
return;
mutex_lock(&watchdog_mutex);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
softlockup_initialized = true;
mutex_unlock(&watchdog_mutex);
}
#else /* CONFIG_SOFTLOCKUP_DETECTOR */
-static void __lockup_detector_reconfigure(void)
+static void __lockup_detector_reconfigure(bool thresh_changed)
{
cpus_read_lock();
watchdog_hardlockup_stop();
+ if (thresh_changed)
+ watchdog_thresh = READ_ONCE(watchdog_thresh_next);
lockup_detector_update_enable();
watchdog_hardlockup_start();
cpus_read_unlock();
}
void lockup_detector_reconfigure(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
static inline void lockup_detector_setup(void)
{
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(false);
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
@@ -946,11 +957,11 @@ void lockup_detector_soft_poweroff(void)
#ifdef CONFIG_SYSCTL
/* Propagate any changes to the watchdog infrastructure */
-static void proc_watchdog_update(void)
+static void proc_watchdog_update(bool thresh_changed)
{
/* Remove impossible cpus to keep sysctl output clean. */
cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask);
- __lockup_detector_reconfigure();
+ __lockup_detector_reconfigure(thresh_changed);
}
/*
@@ -984,7 +995,7 @@ static int proc_watchdog_common(int whic
} else {
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!err && old != READ_ONCE(*param))
- proc_watchdog_update();
+ proc_watchdog_update(false);
}
mutex_unlock(&watchdog_mutex);
return err;
@@ -1035,11 +1046,13 @@ static int proc_watchdog_thresh(const st
mutex_lock(&watchdog_mutex);
- old = READ_ONCE(watchdog_thresh);
+ watchdog_thresh_next = READ_ONCE(watchdog_thresh);
+
+ old = watchdog_thresh_next;
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (!err && write && old != READ_ONCE(watchdog_thresh))
- proc_watchdog_update();
+ if (!err && write && old != READ_ONCE(watchdog_thresh_next))
+ proc_watchdog_update(true);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1060,7 +1073,7 @@ static int proc_watchdog_cpumask(const s
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write)
- proc_watchdog_update();
+ proc_watchdog_update(false);
mutex_unlock(&watchdog_mutex);
return err;
@@ -1080,7 +1093,7 @@ static const struct ctl_table watchdog_s
},
{
.procname = "watchdog_thresh",
- .data = &watchdog_thresh,
+ .data = &watchdog_thresh_next,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog_thresh,
_
Patches currently in -mm which might be from luogengkun(a)huaweicloud.com are
The quilt patch titled
Subject: mm: fix ratelimit_pages update error in dirty_ratio_handler()
has been removed from the -mm tree. Its filename was
mm-fix-ratelimit_pages-update-error-in-dirty_ratio_handler.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Jinliang Zheng <alexjlzheng(a)tencent.com>
Subject: mm: fix ratelimit_pages update error in dirty_ratio_handler()
Date: Tue, 15 Apr 2025 17:02:32 +0800
In dirty_ratio_handler(), vm_dirty_bytes must be set to zero before
calling writeback_set_ratelimit(), as global_dirty_limits() always
prioritizes the value of vm_dirty_bytes.
It's domain_dirty_limits() that's relevant here, not node_dirty_ok:
dirty_ratio_handler
writeback_set_ratelimit
global_dirty_limits(&dirty_thresh) <- ratelimit_pages based on dirty_thresh
domain_dirty_limits
if (bytes) <- bytes = vm_dirty_bytes <--------+
thresh = f1(bytes) <- prioritizes vm_dirty_bytes |
else |
thresh = f2(ratio) |
ratelimit_pages = f3(dirty_thresh) |
vm_dirty_bytes = 0 <- it's late! ---------------------+
This causes ratelimit_pages to still use the value calculated based on
vm_dirty_bytes, which is wrong now.
The impact visible to userspace is difficult to capture directly because
there is no procfs/sysfs interface exported to user space. However, it
will have a real impact on the balance of dirty pages.
For example:
1. On default, we have vm_dirty_ratio=40, vm_dirty_bytes=0
2. echo 8192 > dirty_bytes, then vm_dirty_bytes=8192,
vm_dirty_ratio=0, and ratelimit_pages is calculated based on
vm_dirty_bytes now.
3. echo 20 > dirty_ratio, then since vm_dirty_bytes is not reset to
zero when writeback_set_ratelimit() -> global_dirty_limits() ->
domain_dirty_limits() is called, ratelimit_pages is still calculated
based on vm_dirty_bytes instead of vm_dirty_ratio. This does not
conform to the actual intent of the user.
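A minimal sketch of the priority logic described above (simplified userspace
C; names and rounding only approximate domain_dirty_limits(), this is not
the kernel code):
---------------------------------------------
#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned long vm_dirty_bytes;	/* 0 means "use the ratio" */
static int vm_dirty_ratio = 40;

static unsigned long dirty_thresh(unsigned long available_pages)
{
	if (vm_dirty_bytes)	/* a non-zero byte limit always wins */
		return vm_dirty_bytes / PAGE_SIZE;
	return available_pages * vm_dirty_ratio / 100;
}

int main(void)
{
	vm_dirty_bytes = 8192;	/* step 2: echo 8192 > dirty_bytes */
	vm_dirty_ratio = 20;	/* step 3: echo 20 > dirty_ratio... */
	/* ...but until vm_dirty_bytes is zeroed, the ratio is ignored: */
	printf("thresh = %lu pages\n", dirty_thresh(1UL << 20)); /* 2 */
	vm_dirty_bytes = 0;	/* must happen before recomputing */
	printf("thresh = %lu pages\n", dirty_thresh(1UL << 20)); /* 209715 */
	return 0;
}
---------------------------------------------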
Link: https://lkml.kernel.org/r/20250415090232.7544-1-alexjlzheng@tencent.com
Fixes: 9d823e8f6b1b ("writeback: per task dirty rate limit")
Signed-off-by: Jinliang Zheng <alexjlzheng(a)tencent.com>
Reviewed-by: MengEn Sun <mengensun(a)tencent.com>
Cc: Andrea Righi <andrea(a)betterlinux.com>
Cc: Fengguang Wu <fengguang.wu(a)intel.com>
Cc: Jinliang Zheng <alexjlzheng(a)tencent.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page-writeback.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/page-writeback.c~mm-fix-ratelimit_pages-update-error-in-dirty_ratio_handler
+++ a/mm/page-writeback.c
@@ -520,8 +520,8 @@ static int dirty_ratio_handler(const str
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- writeback_set_ratelimit();
vm_dirty_bytes = 0;
+ writeback_set_ratelimit();
}
return ret;
}
_
Patches currently in -mm which might be from alexjlzheng(a)tencent.com are
The quilt patch titled
Subject: mm: userfaultfd: correct dirty flags set for both present and swap pte
has been removed from the -mm tree. Its filename was
mm-userfaultfd-correct-dirty-flags-set-for-both-present-and-swap-pte.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Barry Song <v-songbaohua(a)oppo.com>
Subject: mm: userfaultfd: correct dirty flags set for both present and swap pte
Date: Fri, 9 May 2025 10:09:12 +1200
As David pointed out, what truly matters for mremap and userfaultfd move
operations is the soft dirty bit. The current comment and
implementation, which always set the dirty bit for present PTEs and
fail to set the soft dirty bit for swap PTEs, are incorrect. This could
break features like Checkpoint-Restore in Userspace (CRIU).
This patch updates the behavior to correctly set the soft dirty bit for
both present and swap PTEs in accordance with mremap.
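For context, bit 55 of a /proc/<pid>/pagemap entry exposes the soft-dirty
flag to userspace; a CRIU-like tool detects moved or written pages roughly
like this (illustrative helper, not CRIU code):
---------------------------------------------
#include <stdint.h>
#include <unistd.h>

/* Return 1 if the PTE mapping vaddr is soft-dirty, 0 if not, -1 on error. */
static int pte_soft_dirty(int pagemap_fd, unsigned long vaddr)
{
	uint64_t entry;
	off_t off = (off_t)(vaddr / sysconf(_SC_PAGESIZE)) * sizeof(entry);

	if (pread(pagemap_fd, &entry, sizeof(entry), off) != sizeof(entry))
		return -1;
	return (int)((entry >> 55) & 1);
}
---------------------------------------------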
Link: https://lkml.kernel.org/r/20250508220912.7275-1-21cnbao@gmail.com
Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI")
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Reported-by: David Hildenbrand <david(a)redhat.com>
Closes: https://lore.kernel.org/linux-mm/02f14ee1-923f-47e3-a994-4950afb9afcc@redha…
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: Suren Baghdasaryan <surenb(a)google.com>
Cc: Lokesh Gidra <lokeshgidra(a)google.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/userfaultfd.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
--- a/mm/userfaultfd.c~mm-userfaultfd-correct-dirty-flags-set-for-both-present-and-swap-pte
+++ a/mm/userfaultfd.c
@@ -1064,8 +1064,13 @@ static int move_present_pte(struct mm_st
src_folio->index = linear_page_index(dst_vma, dst_addr);
orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
- /* Follow mremap() behavior and treat the entry dirty after the move */
- orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
+ /* Set soft dirty bit so userspace can notice the pte was moved */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_dst_pte = pte_mksoft_dirty(orig_dst_pte);
+#endif
+ if (pte_dirty(orig_src_pte))
+ orig_dst_pte = pte_mkdirty(orig_dst_pte);
+ orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma);
set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte);
out:
@@ -1100,6 +1105,9 @@ static int move_swap_pte(struct mm_struc
}
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte);
+#endif
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
_
Patches currently in -mm which might be from v-songbaohua(a)oppo.com are
The quilt patch titled
Subject: mm/page_alloc: fix race condition in unaccepted memory handling
has been removed from the -mm tree. Its filename was
mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Subject: mm/page_alloc: fix race condition in unaccepted memory handling
Date: Tue, 6 May 2025 16:32:07 +0300
The page allocator tracks the number of zones that have unaccepted memory
using static_branch_inc/dec() and uses that static branch in hot paths to
determine if it needs to deal with unaccepted memory.
Borislav and Thomas pointed out that the tracking is racy: operations on
static_branch are not serialized against adding/removing unaccepted pages
to/from the zone.
Sanity checks inside the static_branch machinery detect it:
WARNING: CPU: 0 PID: 10 at kernel/jump_label.c:276 __static_key_slow_dec_cpuslocked+0x8e/0xa0
The comment around the WARN() explains the problem:
/*
* Warn about the '-1' case though; since that means a
* decrement is concurrent with a first (0->1) increment. IOW
* people are trying to disable something that wasn't yet fully
* enabled. This suggests an ordering problem on the user side.
*/
The effect of this static_branch optimization is only visible in
microbenchmarks.
Instead of adding more complexity around it, remove it altogether.
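The racy pattern being removed, reduced to its essence (a sketch with plain
counters standing in for the zone list and the static branch; not the
kernel code):
---------------------------------------------
static int nr_unaccepted;		/* stand-in for zone->unaccepted_pages */
static int zones_with_unaccepted;	/* stand-in for the static branch */

static void add_unaccepted_page(void)
{
	int first = (nr_unaccepted == 0);

	nr_unaccepted++;
	if (first)
		zones_with_unaccepted++;	/* static_branch_inc() */
}

static void accept_one_page(void)
{
	nr_unaccepted--;
	if (nr_unaccepted == 0)
		zones_with_unaccepted--;	/* static_branch_dec() */
}

/*
 * The emptiness check and the counter update are separate steps with no
 * common lock, so accept_one_page() on one CPU can observe "last page"
 * and decrement before a racing add_unaccepted_page() has finished its
 * 0 -> 1 increment -- exactly the -1 case the jump-label WARN() flags.
 */
---------------------------------------------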
Link: https://lkml.kernel.org/r/20250506133207.1009676-1-kirill.shutemov@linux.in…
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Link: https://lore.kernel.org/all/20250506092445.GBaBnVXXyvnazly6iF@fat_crate.loc…
Reported-by: Borislav Petkov <bp(a)alien8.de>
Tested-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Reported-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Brendan Jackman <jackmanb(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: <stable(a)vger.kernel.org> [6.5+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/internal.h | 1
mm/mm_init.c | 1
mm/page_alloc.c | 47 ----------------------------------------------
3 files changed, 49 deletions(-)
--- a/mm/internal.h~mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling
+++ a/mm/internal.h
@@ -1590,7 +1590,6 @@ unsigned long move_page_tables(struct pa
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
-void unaccepted_cleanup_work(struct work_struct *work);
#else /* CONFIG_UNACCEPTED_MEMORY */
static inline void accept_page(struct page *page)
{
--- a/mm/mm_init.c~mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling
+++ a/mm/mm_init.c
@@ -1441,7 +1441,6 @@ static void __meminit zone_init_free_lis
#ifdef CONFIG_UNACCEPTED_MEMORY
INIT_LIST_HEAD(&zone->unaccepted_pages);
- INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work);
#endif
}
--- a/mm/page_alloc.c~mm-page_alloc-fix-race-condition-in-unaccepted-memory-handling
+++ a/mm/page_alloc.c
@@ -7172,16 +7172,8 @@ bool has_managed_dma(void)
#ifdef CONFIG_UNACCEPTED_MEMORY
-/* Counts number of zones with unaccepted pages. */
-static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
-
static bool lazy_accept = true;
-void unaccepted_cleanup_work(struct work_struct *work)
-{
- static_branch_dec(&zones_with_unaccepted_pages);
-}
-
static int __init accept_memory_parse(char *p)
{
if (!strcmp(p, "lazy")) {
@@ -7206,11 +7198,7 @@ static bool page_contains_unaccepted(str
static void __accept_page(struct zone *zone, unsigned long *flags,
struct page *page)
{
- bool last;
-
list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
-
account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
__ClearPageUnaccepted(page);
@@ -7219,28 +7207,6 @@ static void __accept_page(struct zone *z
accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
-
- if (last) {
- /*
- * There are two corner cases:
- *
- * - If allocation occurs during the CPU bring up,
- * static_branch_dec() cannot be used directly as
- * it causes a deadlock on cpu_hotplug_lock.
- *
- * Instead, use schedule_work() to prevent deadlock.
- *
- * - If allocation occurs before workqueues are initialized,
- * static_branch_dec() should be called directly.
- *
- * Workqueues are initialized before CPU bring up, so this
- * will not conflict with the first scenario.
- */
- if (system_wq)
- schedule_work(&zone->unaccepted_cleanup);
- else
- unaccepted_cleanup_work(&zone->unaccepted_cleanup);
- }
}
void accept_page(struct page *page)
@@ -7277,20 +7243,12 @@ static bool try_to_accept_memory_one(str
return true;
}
-static inline bool has_unaccepted_memory(void)
-{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
static bool cond_accept_memory(struct zone *zone, unsigned int order,
int alloc_flags)
{
long to_accept, wmark;
bool ret = false;
- if (!has_unaccepted_memory())
- return false;
-
if (list_empty(&zone->unaccepted_pages))
return false;
@@ -7328,22 +7286,17 @@ static bool __free_unaccepted(struct pag
{
struct zone *zone = page_zone(page);
unsigned long flags;
- bool first = false;
if (!lazy_accept)
return false;
spin_lock_irqsave(&zone->lock, flags);
- first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
__SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
- if (first)
- static_branch_inc(&zones_with_unaccepted_pages);
-
return true;
}
_
Patches currently in -mm which might be from kirill.shutemov(a)linux.intel.com are
The quilt patch titled
Subject: mm/page_alloc: ensure try_alloc_pages() plays well with unaccepted memory
has been removed from the -mm tree. Its filename was
mm-page_alloc-ensure-try_alloc_pages-plays-well-with-unaccepted-memory.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Kirill A. Shutemov" <kirill.shutemov(a)linux.intel.com>
Subject: mm/page_alloc: ensure try_alloc_pages() plays well with unaccepted memory
Date: Tue, 6 May 2025 14:25:08 +0300
try_alloc_pages() will not attempt to allocate memory if the system has
*any* unaccepted memory. Memory is accepted as needed and can remain in
the system indefinitely, causing the interface to always fail.
Rather than immediately giving up, attempt to use already accepted memory
on free lists.
Pass 'alloc_flags' to cond_accept_memory() and do not accept new memory
for ALLOC_TRYLOCK requests.
Found via code inspection - only BPF uses this at present and the
runtime effects are unclear.
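The resulting decision reduces to a short sketch (simplified from the diff
below; the helper name and bool parameters are illustrative):
---------------------------------------------
#include <stdbool.h>

/* Trylock allocations may still consume already-accepted free pages;
 * they just never accept new memory, since that path takes zone->lock. */
static bool cond_accept_memory_sketch(bool zone_has_unaccepted, bool trylock)
{
	if (!zone_has_unaccepted)
		return false;	/* nothing left to accept in this zone */
	if (trylock)
		return false;	/* ALLOC_TRYLOCK: must stay lock-free */
	/* ... accept a chunk of memory under zone->lock ... */
	return true;
}
---------------------------------------------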
Link: https://lkml.kernel.org/r/20250506112509.905147-2-kirill.shutemov@linux.int…
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Fixes: 97769a53f117 ("mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation")
Cc: Alexei Starovoitov <ast(a)kernel.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Brendan Jackman <jackmanb(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 28 +++++++++++++++-------------
1 file changed, 15 insertions(+), 13 deletions(-)
--- a/mm/page_alloc.c~mm-page_alloc-ensure-try_alloc_pages-plays-well-with-unaccepted-memory
+++ a/mm/page_alloc.c
@@ -290,7 +290,8 @@ EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
-static bool cond_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
@@ -3611,7 +3612,7 @@ retry:
}
}
- cond_accept_memory(zone, order);
+ cond_accept_memory(zone, order, alloc_flags);
/*
* Detect whether the number of free pages is below high
@@ -3638,7 +3639,7 @@ check_alloc_wmark:
gfp_mask)) {
int ret;
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/*
@@ -3691,7 +3692,7 @@ try_this_zone:
return page;
} else {
- if (cond_accept_memory(zone, order))
+ if (cond_accept_memory(zone, order, alloc_flags))
goto try_this_zone;
/* Try again if zone has deferred pages */
@@ -4844,7 +4845,7 @@ unsigned long alloc_pages_bulk_noprof(gf
goto failed;
}
- cond_accept_memory(zone, 0);
+ cond_accept_memory(zone, 0, alloc_flags);
retry_this_zone:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
@@ -4853,7 +4854,7 @@ retry_this_zone:
break;
}
- if (cond_accept_memory(zone, 0))
+ if (cond_accept_memory(zone, 0, alloc_flags))
goto retry_this_zone;
/* Try again if zone has deferred pages */
@@ -7281,7 +7282,8 @@ static inline bool has_unaccepted_memory
return static_branch_unlikely(&zones_with_unaccepted_pages);
}
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
long to_accept, wmark;
bool ret = false;
@@ -7292,6 +7294,10 @@ static bool cond_accept_memory(struct zo
if (list_empty(&zone->unaccepted_pages))
return false;
+ /* Bailout, since try_to_accept_memory_one() needs to take a lock */
+ if (alloc_flags & ALLOC_TRYLOCK)
+ return false;
+
wmark = promo_wmark_pages(zone);
/*
@@ -7348,7 +7354,8 @@ static bool page_contains_unaccepted(str
return false;
}
-static bool cond_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order,
+ int alloc_flags)
{
return false;
}
@@ -7419,11 +7426,6 @@ struct page *try_alloc_pages_noprof(int
if (!pcp_allowed_order(order))
return NULL;
-#ifdef CONFIG_UNACCEPTED_MEMORY
- /* Bailout, since try_to_accept_memory_one() needs to take a lock */
- if (has_unaccepted_memory())
- return NULL;
-#endif
/* Bailout, since _deferred_grow_zone() needs to take a lock */
if (deferred_pages_enabled())
return NULL;
_
Patches currently in -mm which might be from kirill.shutemov(a)linux.intel.com are
The quilt patch titled
Subject: mm: hugetlb: fix incorrect fallback for subpool
has been removed from the -mm tree. Its filename was
mm-hugetlb-fix-incorrect-fallback-for-subpool.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Wupeng Ma <mawupeng1(a)huawei.com>
Subject: mm: hugetlb: fix incorrect fallback for subpool
Date: Thu, 10 Apr 2025 14:26:33 +0800
During our testing with hugetlb subpool enabled, we observed that
hstate->resv_huge_pages may underflow into negative values. Root cause
analysis reveals a race condition in subpool reservation fallback handling,
as follows:
hugetlb_reserve_pages()
/* Attempt subpool reservation */
gbl_reserve = hugepage_subpool_get_pages(spool, chg);
/* Global reservation may fail after subpool allocation */
if (hugetlb_acct_memory(h, gbl_reserve) < 0)
goto out_put_pages;
out_put_pages:
/* This incorrectly restores reservation to subpool */
hugepage_subpool_put_pages(spool, chg);
When hugetlb_acct_memory() fails after subpool allocation, the current
implementation over-commits subpool reservations by returning the full
'chg' value instead of the actual allocated 'gbl_reserve' amount. This
discrepancy propagates to global reservations during subsequent releases,
eventually causing resv_huge_pages underflow.
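A hypothetical walk-through with invented numbers makes the imbalance
concrete:
---------------------------------------------
/*
 * Suppose chg = 10 pages are requested and the subpool covers 6 of them
 * from its min_size reserve:
 *
 *	gbl_reserve = hugepage_subpool_get_pages(spool, 10);  // returns 4
 *	hugetlb_acct_memory(h, 4);                            // fails
 *
 * The buggy fallback then does:
 *
 *	hugepage_subpool_put_pages(spool, 10);  // credits 10, but the
 *	                                        // subpool only provided 6
 *
 * leaving the subpool over-credited by 4 pages; later releases propagate
 * the excess to the global counter and resv_huge_pages underflows.
 * Returning only chg - gbl_reserve = 6 to the subpool, and uncharging
 * whatever it reports back via hugetlb_acct_memory(), keeps both
 * counters balanced.
 */
---------------------------------------------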
This problem can be triggered easily with the following steps:
1. reserve hugepages for hugetlb allocation
2. mount hugetlbfs with min_size to enable the hugetlb subpool
3. allocate hugepages from two tasks (make sure the second fails due to
an insufficient number of hugepages)
4. wait for a few seconds and repeat step 3, which will make
hstate->resv_huge_pages go below zero.
To fix this problem, return the correct number of pages to the subpool
during the fallback after hugepage_subpool_get_pages() is called.
Link: https://lkml.kernel.org/r/20250410062633.3102457-1-mawupeng1@huawei.com
Fixes: 1c5ecae3a93f ("hugetlbfs: add minimum size accounting to subpools")
Signed-off-by: Wupeng Ma <mawupeng1(a)huawei.com>
Tested-by: Joshua Hahn <joshua.hahnjy(a)gmail.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Ma Wupeng <mawupeng1(a)huawei.com>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/hugetlb.c | 28 ++++++++++++++++++++++------
1 file changed, 22 insertions(+), 6 deletions(-)
--- a/mm/hugetlb.c~mm-hugetlb-fix-incorrect-fallback-for-subpool
+++ a/mm/hugetlb.c
@@ -3010,7 +3010,7 @@ struct folio *alloc_hugetlb_folio(struct
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long retval, gbl_chg;
+ long retval, gbl_chg, gbl_reserve;
map_chg_state map_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
@@ -3163,8 +3163,16 @@ out_uncharge_cgroup_reservation:
hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
h_cg);
out_subpool_put:
- if (map_chg)
- hugepage_subpool_put_pages(spool, 1);
+ /*
+ * put page to subpool iff the quota of subpool's rsv_hpages is used
+ * during hugepage_subpool_get_pages.
+ */
+ if (map_chg && !gbl_chg) {
+ gbl_reserve = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -gbl_reserve);
+ }
+
+
out_end_reservation:
if (map_chg != MAP_CHG_ENFORCED)
vma_end_reservation(h, vma, addr);
@@ -7239,7 +7247,7 @@ bool hugetlb_reserve_pages(struct inode
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg = -1, add = -1;
+ long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -7374,8 +7382,16 @@ bool hugetlb_reserve_pages(struct inode
return true;
out_put_pages:
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
+ spool_resv = chg - gbl_reserve;
+ if (spool_resv) {
+ /* put sub pool's reservation back, chg - gbl_reserve */
+ gbl_resv = hugepage_subpool_put_pages(spool, spool_resv);
+ /*
+ * subpool's reserved pages can not be put back due to race,
+ * return to hstate.
+ */
+ hugetlb_acct_memory(h, -gbl_resv);
+ }
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
_
Patches currently in -mm which might be from mawupeng1(a)huawei.com are
A new on-by-default warning in clang [1] aims to flag instances where
const variables without static or thread-local storage, or const members
in aggregate types, are not initialized, because this can lead to an
indeterminate value. The warning is quite noisy for the kernel due to
instances originating from header files such as:
drivers/gpu/drm/i915/gt/intel_ring.h:62:2: error: default initialization of an object of type 'typeof (ring->size)' (aka 'const unsigned int') leaves the object uninitialized [-Werror,-Wdefault-const-init-var-unsafe]
62 | typecheck(typeof(ring->size), next);
| ^
include/linux/typecheck.h:10:9: note: expanded from macro 'typecheck'
10 | ({ type __dummy; \
| ^
include/net/ip.h:478:14: error: default initialization of an object of type 'typeof (rt->dst.expires)' (aka 'const unsigned long') leaves the object uninitialized [-Werror,-Wdefault-const-init-var-unsafe]
478 | if (mtu && time_before(jiffies, rt->dst.expires))
| ^
include/linux/jiffies.h:138:26: note: expanded from macro 'time_before'
138 | #define time_before(a,b) time_after(b,a)
| ^
include/linux/jiffies.h:128:3: note: expanded from macro 'time_after'
128 | (typecheck(unsigned long, a) && \
| ^
include/linux/typecheck.h:11:12: note: expanded from macro 'typecheck'
11 | typeof(x) __dummy2; \
| ^
include/linux/list.h:409:27: warning: default initialization of an object of type 'union (unnamed union at include/linux/list.h:409:27)' with const member leaves the object uninitialized [-Wdefault-const-init-field-unsafe]
409 | struct list_head *next = smp_load_acquire(&head->next);
| ^
include/asm-generic/barrier.h:176:29: note: expanded from macro 'smp_load_acquire'
176 | #define smp_load_acquire(p) __smp_load_acquire(p)
| ^
arch/arm64/include/asm/barrier.h:164:59: note: expanded from macro '__smp_load_acquire'
164 | union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u; \
| ^
include/linux/list.h:409:27: note: member '__val' declared 'const' here
crypto/scatterwalk.c:66:22: error: default initialization of an object of type 'struct scatter_walk' with const member leaves the object uninitialized [-Werror,-Wdefault-const-init-field-unsafe]
66 | struct scatter_walk walk;
| ^
include/crypto/algapi.h:112:15: note: member 'addr' declared 'const' here
112 | void *const addr;
| ^
fs/hugetlbfs/inode.c:733:24: error: default initialization of an object of type 'struct vm_area_struct' with const member leaves the object uninitialized [-Werror,-Wdefault-const-init-field-unsafe]
733 | struct vm_area_struct pseudo_vma;
| ^
include/linux/mm_types.h:803:20: note: member 'vm_flags' declared 'const' here
803 | const vm_flags_t vm_flags;
| ^
Silencing the instances from typecheck.h is difficult because '= {}' is
not available in older but supported compilers and '= {0}' would cause
warnings about a literal 0 being treated as NULL. While it might be
possible to come up with a local hack to silence the warning for
clang-21+, it may not be worth it since -Wuninitialized will still
trigger if an uninitialized const variable is actually used.
In all audited cases of the "field" variant of the warning, the members
are either not used in the particular call path, modified through other
means such as memset() / memcpy() because the containing object is not
const, or are within a union with other non-const members.
Since this warning does not appear to have a high signal-to-noise ratio,
just disable it.
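For reference, a minimal repro of both forms of the warning (my own example,
not taken from the reports; needs clang 21 or newer):
---------------------------------------------
/* on by default with clang 21: clang -fsyntax-only repro.c */
struct s {
	const int id;	/* const member */
	int val;
};

void f(void)
{
	const int x;	/* -Wdefault-const-init-var-unsafe */
	struct s y;	/* -Wdefault-const-init-field-unsafe */
}
---------------------------------------------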
Cc: stable(a)vger.kernel.org
Link: https://github.com/llvm/llvm-project/commit/576161cb6069e2c7656a8ef530727a0… [1]
Reported-by: Linux Kernel Functional Testing <lkft(a)linaro.org>
Closes: https://lore.kernel.org/CA+G9fYuNjKcxFKS_MKPRuga32XbndkLGcY-PVuoSwzv6VWbY=w…
Reported-by: Marcus Seyfarth <m.seyfarth(a)gmail.com>
Closes: https://github.com/ClangBuiltLinux/linux/issues/2088
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
Changes in v2:
- Disable -Wdefault-const-init-var-unsafe as well, as '= {}' does not
work in typecheck() for all supported compilers and it may not be
worth a local hack.
- Link to v1: https://lore.kernel.org/r/20250501-default-const-init-clang-v1-0-3d2c6c185d…
---
scripts/Makefile.extrawarn | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
index d88acdf40855..fd649c68e198 100644
--- a/scripts/Makefile.extrawarn
+++ b/scripts/Makefile.extrawarn
@@ -37,6 +37,18 @@ KBUILD_CFLAGS += -Wno-gnu
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111219
KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow-non-kprintf)
KBUILD_CFLAGS += $(call cc-disable-warning, format-truncation-non-kprintf)
+
+# Clang may emit a warning when a const variable, such as the dummy variables
+# in typecheck(), or const member of an aggregate type are not initialized,
+# which can result in unexpected behavior. However, in many audited cases of
+# the "field" variant of the warning, this is intentional because the field is
+# never used within a particular call path, the field is within a union with
+# other non-const members, or the containing object is not const so the field
+# can be modified via memcpy() / memset(). While the variable warning also gets
+# disabled with this same switch, there should not be too much coverage lost
+# because -Wuninitialized will still flag when an uninitialized const variable
+# is used.
+KBUILD_CFLAGS += $(call cc-disable-warning, default-const-init-unsafe)
else
# gcc inanely warns about local variables called 'main'
---
base-commit: 92a09c47464d040866cf2b4cd052bc60555185fb
change-id: 20250430-default-const-init-clang-b6e21b8d03b6
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
I found a data-race with my fuzzer:
==================================================================
BUG: KCSAN: data-race in rtc_dev_poll / rtc_handle_legacy_irq
write to 0xffff88800b307380 of 8 bytes by interrupt on cpu 1:
rtc_handle_legacy_irq+0x58/0xb0 drivers/rtc/interface.c:624
rtc_pie_update_irq+0x75/0x90 drivers/rtc/interface.c:672
__run_hrtimer kernel/time/hrtimer.c:1761 [inline]
__hrtimer_run_queues+0x2c4/0x5d0 kernel/time/hrtimer.c:1825
hrtimer_interrupt+0x214/0x4a0 kernel/time/hrtimer.c:1887
local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1038 [inline]
....
read to 0xffff88800b307380 of 8 bytes by task 11566 on cpu 0:
rtc_dev_poll+0x6c/0xa0 drivers/rtc/dev.c:198
vfs_poll include/linux/poll.h:82 [inline]
select_poll_one fs/select.c:480 [inline]
do_select+0x95f/0x1030 fs/select.c:536
core_sys_select+0x284/0x6d0 fs/select.c:677
do_pselect.constprop.0+0x118/0x150 fs/select.c:759
....
value changed: 0x00000000000801c0 -> 0x00000000000802c0
==================================================================
rtc_dev_poll() reads rtc->irq_data without holding rtc->irq_lock, while
the interrupt path updates it under that lock. This is a data race, so
take the spinlock around the read to fix it.
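For context, the two sides of the race, trimmed and paraphrased from
drivers/rtc/interface.c and drivers/rtc/dev.c:
---------------------------------------------
/* Writer, hrtimer/IRQ context (rtc_handle_legacy_irq), already locked: */
spin_lock_irqsave(&rtc->irq_lock, flags);
rtc->irq_data = (rtc->irq_data + (num << 8)) | (RTC_IRQF | mode);
spin_unlock_irqrestore(&rtc->irq_lock, flags);

/* Reader (rtc_dev_poll) before this patch, with no lock held: */
data = rtc->irq_data;
---------------------------------------------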
Cc: <stable(a)vger.kernel.org>
Fixes: e824290e5dcf ("[PATCH] RTC subsystem: dev interface")
Signed-off-by: Jeongjun Park <aha310510(a)gmail.com>
---
drivers/rtc/dev.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/rtc/dev.c b/drivers/rtc/dev.c
index 0eeae5bcc3aa..a6570a5a938a 100644
--- a/drivers/rtc/dev.c
+++ b/drivers/rtc/dev.c
@@ -195,7 +195,9 @@ static __poll_t rtc_dev_poll(struct file *file, poll_table *wait)
poll_wait(file, &rtc->irq_queue, wait);
+ spin_lock_irq(&rtc->irq_lock);
data = rtc->irq_data;
+ spin_unlock_irq(&rtc->irq_lock);
return (data != 0) ? (EPOLLIN | EPOLLRDNORM) : 0;
}
--
The patch titled
Subject: mm: vmscan: avoid signedness error for GCC 5.4
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-vmscan-avoid-signedness-error-for-gcc-54.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: WangYuli <wangyuli(a)uniontech.com>
Subject: mm: vmscan: avoid signedness error for GCC 5.4
Date: Wed, 7 May 2025 12:08:27 +0800
With GCC 5.4, (MAX_NR_TIERS - 1) (i.e., (4U - 1)) is unsigned, whereas
tier is a signed integer. The __types_ok check within the
__careful_cmp_once macro therefore fails and triggers the BUILD_BUG_ON.
Use min_t() instead of min() to circumvent this compiler error.
This fixes the following error with gcc 5.4:
mm/vmscan.c: In function `read_ctrl_pos':
mm/vmscan.c:3166:728: error: call to `__compiletime_assert_887' declared with attribute error: min(tier, 4U - 1) signedness error
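A minimal sketch of the type rule involved (kernel min()/min_t() semantics;
illustrative, not part of the patch):
---------------------------------------------
static void example(int tier)
{
	unsigned int i;

	/*
	 * i = min(tier, MAX_NR_TIERS - 1);
	 *	int vs unsigned int: __types_ok fails on GCC 5.4 and the
	 *	embedded BUILD_BUG_ON fires at compile time.
	 */

	/* Both operands are converted to int before comparing: accepted. */
	i = min_t(int, tier, MAX_NR_TIERS - 1);
	(void)i;
}
---------------------------------------------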
Link: https://lkml.kernel.org/r/62726950F697595A+20250507040827.1147510-1-wangyul…
Fixes: 37a260870f2c ("mm/mglru: rework type selection")
Signed-off-by: WangYuli <wangyuli(a)uniontech.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmscan.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/vmscan.c~mm-vmscan-avoid-signedness-error-for-gcc-54
+++ a/mm/vmscan.c
@@ -3163,7 +3163,7 @@ static void read_ctrl_pos(struct lruvec
pos->gain = gain;
pos->refaulted = pos->total = 0;
- for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) {
+ for (i = tier % MAX_NR_TIERS; i <= min_t(int, tier, MAX_NR_TIERS - 1); i++) {
pos->refaulted += lrugen->avg_refaulted[type][i] +
atomic_long_read(&lrugen->refaulted[hist][type][i]);
pos->total += lrugen->avg_total[type][i] +
_
Patches currently in -mm which might be from wangyuli(a)uniontech.com are
mm-vmscan-avoid-signedness-error-for-gcc-54.patch
ocfs2-o2net_idle_timer-rename-del_timer_sync-in-comment.patch
treewide-fix-typo-previlege.patch