The future move of pin-init to `syn` uncovers the following private
intra-doc link:
error: public documentation for `Devres` links to private item `Self::inner`
--> rust/kernel/devres.rs:106:7
|
106 | /// [`Self::inner`] is guaranteed to be initialized and is always accessed read-only.
| ^^^^^^^^^^^ this item is private
|
= note: this link will resolve properly if you pass `--document-private-items`
= note: `-D rustdoc::private-intra-doc-links` implied by `-D warnings`
= help: to override `-D warnings` add `#[allow(rustdoc::private_intra_doc_links)]`
Currently, when rendered, the link points to "nowhere" (a nonexistent
anchor for a "method").
Thus fix it.
Cc: stable(a)vger.kernel.org
Fixes: f5d3ef25d238 ("rust: devres: get rid of Devres' inner Arc")
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
---
rust/kernel/devres.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rust/kernel/devres.rs b/rust/kernel/devres.rs
index 10a6a1789854..2392c281459e 100644
--- a/rust/kernel/devres.rs
+++ b/rust/kernel/devres.rs
@@ -103,7 +103,7 @@ struct Inner<T: Send> {
///
/// # Invariants
///
-/// [`Self::inner`] is guaranteed to be initialized and is always accessed read-only.
+/// `Self::inner` is guaranteed to be initialized and is always accessed read-only.
#[pin_data(PinnedDrop)]
pub struct Devres<T: Send> {
dev: ARef<Device>,
base-commit: dcb6fa37fd7bc9c3d2b066329b0d27dedf8becaa
--
2.51.0
The future move of pin-init to `syn` uncovers the following broken
intra-doc link:
error: unresolved link to `crate::pin_init`
--> rust/kernel/sync/condvar.rs:39:40
|
39 | /// instances is with the [`pin_init`](crate::pin_init!) and [`new_condvar`] macros.
| ^^^^^^^^^^^^^^^^ no item named `pin_init` in module `kernel`
|
= note: `-D rustdoc::broken-intra-doc-links` implied by `-D warnings`
= help: to override `-D warnings` add `#[allow(rustdoc::broken_intra_doc_links)]`
Currently, when rendered, the link points to a literal `crate::pin_init!`
URL.
Thus fix it.
Cc: stable(a)vger.kernel.org
Fixes: 129e97be8e28 ("rust: pin-init: fix documentation links")
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
---
rust/kernel/sync/condvar.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs
index c6ec64295c9f..aa5b9a7a726d 100644
--- a/rust/kernel/sync/condvar.rs
+++ b/rust/kernel/sync/condvar.rs
@@ -36,7 +36,7 @@ macro_rules! new_condvar {
/// spuriously.
///
/// Instances of [`CondVar`] need a lock class and to be pinned. The recommended way to create such
-/// instances is with the [`pin_init`](crate::pin_init!) and [`new_condvar`] macros.
+/// instances is with the [`pin_init`](pin_init::pin_init!) and [`new_condvar`] macros.
///
/// # Examples
///
base-commit: dcb6fa37fd7bc9c3d2b066329b0d27dedf8becaa
--
2.51.0
This is the start of the stable review cycle for the 6.6.116 release.
There are 32 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sun, 02 Nov 2025 14:00:34 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.6.116-rc…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.6.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 6.6.116-rc1
William Breathitt Gray <wbg(a)kernel.org>
gpio: idio-16: Define fixed direction of the GPIO lines
Ioana Ciornei <ioana.ciornei(a)nxp.com>
gpio: regmap: add the .fixed_direction_output configuration parameter
Mathieu Dubois-Briand <mathieu.dubois-briand(a)bootlin.com>
gpio: regmap: Allow to allocate regmap-irq device
Vincent Mailhol <mailhol.vincent(a)wanadoo.fr>
bits: introduce fixed-type GENMASK_U*()
Vincent Mailhol <mailhol.vincent(a)wanadoo.fr>
bits: add comments and newlines to #if, #else and #endif directives
Mathias Nyman <mathias.nyman(a)linux.intel.com>
xhci: dbc: fix bogus 1024 byte prefix if ttyDBC read races with stall event
Mathias Nyman <mathias.nyman(a)linux.intel.com>
xhci: dbc: Avoid event polling busyloop if pending rx transfers are inactive.
Mathias Nyman <mathias.nyman(a)linux.intel.com>
xhci: dbc: Improve performance by removing delay in transfer event polling.
Uday M Bhat <uday.m.bhat(a)intel.com>
xhci: dbc: Allow users to modify DbC poll interval via sysfs
Mathias Nyman <mathias.nyman(a)linux.intel.com>
xhci: dbc: poll at different rate depending on data transfer activity
Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
serial: sc16is7xx: remove useless enable of enhanced features
Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
serial: sc16is7xx: refactor EFR lock
Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
serial: sc16is7xx: reorder code to remove prototype declarations
Hugo Villeneuve <hvilleneuve(a)dimonoff.com>
serial: sc16is7xx: remove unused to_sc16is7xx_port macro
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
selftests: mptcp: join: mark 'delete re-add signal' as skipped if not supported
Geliang Tang <tanggeliang(a)kylinos.cn>
selftests: mptcp: disable add_addr retrans in endpoint_tests
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
mptcp: pm: in-kernel: C-flag: handle late ADD_ADDR
Menglong Dong <menglong8.dong(a)gmail.com>
arch: Add the macro COMPILE_OFFSETS to all the asm-offsets.c
Filipe Manana <fdmanana(a)suse.com>
btrfs: use smp_mb__after_atomic() when forcing COW in create_pending_snapshot()
Filipe Manana <fdmanana(a)suse.com>
btrfs: use level argument in log tree walk callback replay_one_buffer()
Filipe Manana <fdmanana(a)suse.com>
btrfs: always drop log root tree reference in btrfs_replay_log()
Thorsten Blum <thorsten.blum(a)linux.dev>
btrfs: scrub: replace max_t()/min_t() with clamp() in scrub_throttle_dev_io()
Naohiro Aota <naohiro.aota(a)wdc.com>
btrfs: zoned: refine extent allocator hint selection
Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
btrfs: zoned: return error from btrfs_zone_finish_endio()
Avadhut Naik <avadhut.naik(a)amd.com>
EDAC/mc_sysfs: Increase legacy channel support to 16
David Kaplan <david.kaplan(a)amd.com>
x86/bugs: Fix reporting of LFENCE retpoline
David Kaplan <david.kaplan(a)amd.com>
x86/bugs: Report correct retbleed mitigation status
Josh Poimboeuf <jpoimboe(a)kernel.org>
perf: Skip user unwind if the task is a kernel thread
Josh Poimboeuf <jpoimboe(a)kernel.org>
perf: Have get_perf_callchain() return NULL if crosstask and user are set
Steven Rostedt <rostedt(a)goodmis.org>
perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of current->mm == NULL
Richard Guy Briggs <rgb(a)redhat.com>
audit: record fanotify event regardless of presence of rules
Xiang Mei <xmei5(a)asu.edu>
net/sched: sch_qfq: Fix null-deref in agg_dequeue
-------------
Diffstat:
.../ABI/testing/sysfs-bus-pci-drivers-xhci_hcd | 10 ++
Makefile | 4 +-
arch/alpha/kernel/asm-offsets.c | 1 +
arch/arc/kernel/asm-offsets.c | 1 +
arch/arm/kernel/asm-offsets.c | 2 +
arch/arm64/kernel/asm-offsets.c | 1 +
arch/csky/kernel/asm-offsets.c | 1 +
arch/hexagon/kernel/asm-offsets.c | 1 +
arch/loongarch/kernel/asm-offsets.c | 2 +
arch/m68k/kernel/asm-offsets.c | 1 +
arch/microblaze/kernel/asm-offsets.c | 1 +
arch/mips/kernel/asm-offsets.c | 2 +
arch/nios2/kernel/asm-offsets.c | 1 +
arch/openrisc/kernel/asm-offsets.c | 1 +
arch/parisc/kernel/asm-offsets.c | 1 +
arch/powerpc/kernel/asm-offsets.c | 1 +
arch/riscv/kernel/asm-offsets.c | 1 +
arch/s390/kernel/asm-offsets.c | 1 +
arch/sh/kernel/asm-offsets.c | 1 +
arch/sparc/kernel/asm-offsets.c | 1 +
arch/um/kernel/asm-offsets.c | 2 +
arch/x86/kernel/cpu/bugs.c | 9 +-
arch/xtensa/kernel/asm-offsets.c | 1 +
drivers/edac/edac_mc_sysfs.c | 24 +++
drivers/gpio/gpio-idio-16.c | 5 +
drivers/gpio/gpio-regmap.c | 53 +++++-
drivers/tty/serial/sc16is7xx.c | 185 ++++++++++-----------
drivers/usb/host/xhci-dbgcap.c | 70 +++++++-
drivers/usb/host/xhci-dbgcap.h | 7 +-
fs/btrfs/disk-io.c | 2 +-
fs/btrfs/extent-tree.c | 6 +-
fs/btrfs/inode.c | 7 +-
fs/btrfs/scrub.c | 3 +-
fs/btrfs/transaction.c | 2 +-
fs/btrfs/tree-log.c | 9 +-
fs/btrfs/zoned.c | 8 +-
fs/btrfs/zoned.h | 9 +-
include/linux/audit.h | 2 +-
include/linux/bitops.h | 1 -
include/linux/bits.h | 38 ++++-
include/linux/gpio/regmap.h | 16 ++
include/net/pkt_sched.h | 25 ++-
kernel/events/callchain.c | 16 +-
kernel/events/core.c | 7 +-
net/mptcp/pm_netlink.c | 6 +
net/sched/sch_api.c | 10 --
net/sched/sch_hfsc.c | 16 --
net/sched/sch_qfq.c | 2 +-
tools/testing/selftests/net/mptcp/mptcp_join.sh | 3 +-
49 files changed, 405 insertions(+), 174 deletions(-)
KASAN reports a global-out-of-bounds access when running these nfit
tests: clear.sh, pmem-errors, pfn-meta-errors.sh, btt-errors.sh,
daxdev-errors.sh, and inject-error.sh.
[] BUG: KASAN: global-out-of-bounds in nfit_test_ctl+0x769f/0x7840 [nfit_test]
[] Read of size 4 at addr ffffffffc03ea01c by task ndctl/1215
[] The buggy address belongs to the variable:
[] handle+0x1c/0x1df4 [nfit_test]
The nfit_test mock platform defines a static table of 7 NFIT DIMM
handles, but nfit_test.0 builds 8 mock DIMMs total (5 DCR + 3 PM).
When the final DIMM (id == 7) is selected, this code:
spa->devices[0].nfit_device_handle = handle[nvdimm->id];
indexes past the end of the 7-entry table, triggering KASAN.
Fix this by adding an eighth entry to the handle[] table and a
defensive bounds check so the test fails cleanly instead of
dereferencing out-of-bounds memory.
To generate a unique handle, the new entry sets the 'imc' field rather
than the 'chan' field. This matches the pattern of earlier entries
and avoids introducing a non-zero 'chan', which is never used in the
table. Computing the new handle confirms there is no collision.
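For reference, a quick userspace sketch (not part of the patch;
mirroring the field packing of NFIT_DIMM_HANDLE() from
tools/testing/nvdimm/test/nfit_test.h) that computes the node == 1
handles and checks the new one for collisions:

  #include <stdio.h>

  /* same packing as tools/testing/nvdimm/test/nfit_test.h */
  #define NFIT_DIMM_HANDLE(node, socket, imc, chan, dimm)         \
          ((((node) & 0xfff) << 16) | (((socket) & 0xf) << 12) |  \
           (((imc) & 0xf) << 8) | (((chan) & 0xf) << 4) |         \
           ((dimm) & 0xf))

  int main(void)
  {
          unsigned int h5 = NFIT_DIMM_HANDLE(1, 0, 0, 0, 0); /* 0x10000 */
          unsigned int h6 = NFIT_DIMM_HANDLE(1, 0, 0, 0, 1); /* 0x10001 */
          unsigned int h7 = NFIT_DIMM_HANDLE(1, 0, 1, 0, 1); /* 0x10101 */

          /*
           * Entries [0]-[4] all have node == 0 and fit in the low
           * 16 bits, so they cannot equal a node == 1 handle.
           */
          printf("new handle %#x unique: %d\n",
                 h7, h7 != h5 && h7 != h6);
          return 0;
  }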
Notes from spelunking for a Fixes tag:
Commit 209851649dc4 ("acpi: nfit: Add support for hot-add") increased
the mock DIMMs to eight yet kept the handle[] array at seven.
Commit 10246dc84dfc ("acpi nfit: nfit_test supports translate SPA")
began using the last mock DIMM, triggering the KASAN.
Commit af31b04b67f4 ("tools/testing/nvdimm: Fix the array size for
dimm devices.") addressed a related KASAN warning but not the actual
handle array length.
Fixes: 209851649dc4 ("acpi: nfit: Add support for hot-add")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Alison Schofield <alison.schofield(a)intel.com>
---
tools/testing/nvdimm/test/nfit.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index cfd4378e2129..cdbf9e8ee80a 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -129,6 +129,7 @@ static u32 handle[] = {
[4] = NFIT_DIMM_HANDLE(0, 1, 0, 0, 0),
[5] = NFIT_DIMM_HANDLE(1, 0, 0, 0, 0),
[6] = NFIT_DIMM_HANDLE(1, 0, 0, 0, 1),
+ [7] = NFIT_DIMM_HANDLE(1, 0, 1, 0, 1),
};
static unsigned long dimm_fail_cmd_flags[ARRAY_SIZE(handle)];
@@ -688,6 +689,13 @@ static int nfit_test_search_spa(struct nvdimm_bus *bus,
nd_mapping = &nd_region->mapping[nd_region->ndr_mappings - 1];
nvdimm = nd_mapping->nvdimm;
+ if (WARN_ON_ONCE(nvdimm->id >= ARRAY_SIZE(handle))) {
+ dev_err(&bus->dev,
+ "invalid nvdimm->id %u >= handle array size %zu\n",
+ nvdimm->id, ARRAY_SIZE(handle));
+ return -EINVAL;
+ }
+
spa->devices[0].nfit_device_handle = handle[nvdimm->id];
spa->num_nvdimms = 1;
spa->devices[0].dpa = dpa;
base-commit: 211ddde0823f1442e4ad052a2f30f050145ccada
--
2.37.3
From: Quentin Schulz <quentin.schulz(a)cherry.de>
In commit 296602b8e5f7 ("arm64: dts: rockchip: Move RK3399 OPPs to dtsi
files for SoC variants"), everything shared between variants of RK3399
was put into rk3399-base.dtsi and the rest in variant-specific DTSI,
such as rk3399-t, rk3399-op1, rk3399, etc.
Therefore, the variant-specific DTSI should include rk3399-base.dtsi and
not another variant's DTSI.
rk3399-op1 wrongly includes the rk3399 (a variant) DTSI instead of
the rk3399-base DTSI, so let's fix this oversight by including the
intended DTSI.
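For clarity, the intended include hierarchy (a simplified sketch,
only showing the variants named above):

  rk3399-base.dtsi          <- everything shared between variants
  |-- rk3399-t.dtsi
  |-- rk3399-op1.dtsi       <- wrongly included rk3399.dtsi so far
  `-- rk3399.dtsi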
Fortunately, this had no impact on the resulting DTB since all nodes
were named the same and all node properties were overridden in
rk3399-op1.dtsi. This was checked by doing a checksum of rk3399-op1 DTBs
before and after this commit.
No intended change in behavior.
Fixes: 296602b8e5f7 ("arm64: dts: rockchip: Move RK3399 OPPs to dtsi files for SoC variants")
Cc: stable(a)vger.kernel.org
Signed-off-by: Quentin Schulz <quentin.schulz(a)cherry.de>
---
arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi
index c4f4f1ff6117b..9da6fd82e46b2 100644
--- a/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi
+++ b/arch/arm64/boot/dts/rockchip/rk3399-op1.dtsi
@@ -3,7 +3,7 @@
* Copyright (c) 2016-2017 Fuzhou Rockchip Electronics Co., Ltd
*/
-#include "rk3399.dtsi"
+#include "rk3399-base.dtsi"
/ {
cluster0_opp: opp-table-0 {
---
base-commit: e53642b87a4f4b03a8d7e5f8507fc3cd0c595ea6
change-id: 20251029-rk3399-op1-include-b311b4e16909
Best regards,
--
Quentin Schulz <quentin.schulz(a)cherry.de>
The patch titled
Subject: mm/secretmem: fix use-after-free race in fault handler
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-secretmem-fix-use-after-free-race-in-fault-handler.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Lance Yang <lance.yang(a)linux.dev>
Subject: mm/secretmem: fix use-after-free race in fault handler
Date: Fri, 31 Oct 2025 20:09:55 +0800
When a page fault occurs in a secret memory file created with
`memfd_secret(2)`, the kernel will allocate a new folio for it, mark the
underlying page as not-present in the direct map, and add it to the file
mapping.
If two tasks cause a fault in the same page concurrently, both could end
up allocating a folio and removing the page from the direct map, but only
one would succeed in adding the folio to the file mapping. The task that
failed undoes the effects of its attempt by (a) freeing the folio again
and (b) putting the page back into the direct map. However, by doing
these two operations in this order, the page becomes available to the
allocator again before it is placed back in the direct mapping.
If another task attempts to allocate the page between (a) and (b), and the
kernel tries to access it via the direct map, it would result in a
supervisor not-present page fault.
Fix the ordering to restore the direct map before the folio is freed.
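To illustrate, a simplified timeline of the racy window (the task
split is hypothetical; any other allocation can race here):

  faulting task (lost the race)       any other task
  ----------------------------------  ------------------------------
  filemap_add_folio() -> -EEXIST
  (a) folio_put(folio)
      /* page goes back to the buddy
         allocator while its direct
         map entry is not-present */
                                      alloc_page()
                                      /* kernel touches the page via
                                         the direct map -> supervisor
                                         not-present fault */
  (b) set_direct_map_default_noflush()
      /* restores the entry, too late */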
Link: https://lkml.kernel.org/r/20251031120955.92116-1-lance.yang@linux.dev
Fixes: 1507f51255c9 ("mm: introduce memfd_secret system call to create "secret" memory areas")
Signed-off-by: Lance Yang <lance.yang(a)linux.dev>
Reported-by: Google Big Sleep <big-sleep-vuln-reports(a)google.com>
Closes: https://lore.kernel.org/linux-mm/CAEXGt5QeDpiHTu3K9tvjUTPqo+d-=wuCNYPa+6sWK…
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Mike Rapoport (Microsoft) <rppt(a)kernel.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/secretmem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/mm/secretmem.c~mm-secretmem-fix-use-after-free-race-in-fault-handler
+++ a/mm/secretmem.c
@@ -82,13 +82,13 @@ retry:
__folio_mark_uptodate(folio);
err = filemap_add_folio(mapping, folio, offset, gfp);
if (unlikely(err)) {
- folio_put(folio);
/*
* If a split of large page was required, it
* already happened when we marked the page invalid
* which guarantees that this call won't fail
*/
set_direct_map_default_noflush(folio_page(folio, 0));
+ folio_put(folio);
if (err == -EEXIST)
goto retry;
_
Patches currently in -mm which might be from lance.yang(a)linux.dev are
mm-secretmem-fix-use-after-free-race-in-fault-handler.patch
mm-khugepaged-guard-is_zero_pfn-calls-with-pte_present.patch
From: Doug Berger <opendmb(a)gmail.com>
[ Upstream commit 382748c05e58a9f1935f5a653c352422375566ea ]
Commit 16b269436b72 ("sched/deadline: Modify cpudl::free_cpus
to reflect rd->online") introduced the cpudl_set/clear_freecpu
functions to allow the cpudl::free_cpus mask to be manipulated
by the deadline scheduler class rq_on/offline callbacks so the
mask would also reflect this state.
Commit 9659e1eeee28 ("sched/deadline: Remove cpu_active_mask
from cpudl_find()") removed the check of the cpu_active_mask to
save some processing on the premise that the cpudl::free_cpus
mask already reflected the runqueue online state.
Unfortunately, there are cases where it is possible for the
cpudl_clear function to set the free_cpus bit for a CPU when the
deadline runqueue is offline. When this occurs while a CPU is
connected to the default root domain, the flag may retain the bad
state after the CPU has been unplugged. Later, a different CPU
that is transitioning through the default root domain may push a
deadline task to the powered-down CPU when cpudl_find sees its
free_cpus bit set. If this happens, the task will not have the
opportunity to run.
One example is outlined here:
https://lore.kernel.org/lkml/20250110233010.2339521-1-opendmb@gmail.com
Another occurs when the last deadline task is migrated from a
CPU that has an offlined runqueue. The dequeue_task member of
the deadline scheduler class will eventually call cpudl_clear
and set the free_cpus bit for the CPU.
This commit modifies the cpudl_clear function to be aware of the
online state of the deadline runqueue so that the free_cpus mask
can be updated appropriately.
It is no longer necessary to manage the mask outside of the
cpudl_set/clear functions, so the cpudl_set/clear_freecpu
functions are removed. In addition, since the free_cpus mask is
now only updated under the cpudl lock, the code was changed to
use the non-atomic __cpumask functions.
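For context, the two cpumask helper flavors differ only in the bitop
they use (a sketch based on include/linux/cpumask.h); the non-atomic
variant is sufficient once every free_cpus update happens under
cp->lock:

  /* atomic read-modify-write, safe without external serialization */
  static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
  {
          set_bit(cpumask_check(cpu), cpumask_bits(dstp));
  }

  /* plain load/store, the caller must serialize updates */
  static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
  {
          __set_bit(cpumask_check(cpu), cpumask_bits(dstp));
  }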
Signed-off-by: Doug Berger <opendmb(a)gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Florian Fainelli <florian.fainelli(a)broadcom.com>
---
kernel/sched/cpudeadline.c | 34 +++++++++-------------------------
kernel/sched/cpudeadline.h | 4 +---
kernel/sched/deadline.c | 8 ++++----
3 files changed, 14 insertions(+), 32 deletions(-)
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8cb06c8c7eb1..fd28ba4e1a28 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
* cpudl_clear - remove a CPU from the cpudl max-heap
* @cp: the cpudl max-heap context
* @cpu: the target CPU
+ * @online: the online state of the deadline runqueue
*
* Notes: assumes cpu_rq(cpu)->lock is locked
*
* Returns: (void)
*/
-void cpudl_clear(struct cpudl *cp, int cpu)
+void cpudl_clear(struct cpudl *cp, int cpu, bool online)
{
int old_idx, new_cpu;
unsigned long flags;
@@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu)
if (old_idx == IDX_INVALID) {
/*
* Nothing to remove if old_idx was invalid.
- * This could happen if a rq_offline_dl is
+ * This could happen if rq_online_dl or rq_offline_dl is
* called for a CPU without -dl tasks running.
*/
} else {
@@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu)
cp->elements[new_cpu].idx = old_idx;
cp->elements[cpu].idx = IDX_INVALID;
cpudl_heapify(cp, old_idx);
-
- cpumask_set_cpu(cpu, cp->free_cpus);
}
+ if (likely(online))
+ __cpumask_set_cpu(cpu, cp->free_cpus);
+ else
+ __cpumask_clear_cpu(cpu, cp->free_cpus);
+
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
@@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
cp->elements[new_idx].cpu = cpu;
cp->elements[cpu].idx = new_idx;
cpudl_heapify_up(cp, new_idx);
- cpumask_clear_cpu(cpu, cp->free_cpus);
+ __cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
cp->elements[old_idx].dl = dl;
cpudl_heapify(cp, old_idx);
@@ -237,26 +241,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
raw_spin_unlock_irqrestore(&cp->lock, flags);
}
-/*
- * cpudl_set_freecpu - Set the cpudl.free_cpus
- * @cp: the cpudl max-heap context
- * @cpu: rd attached CPU
- */
-void cpudl_set_freecpu(struct cpudl *cp, int cpu)
-{
- cpumask_set_cpu(cpu, cp->free_cpus);
-}
-
-/*
- * cpudl_clear_freecpu - Clear the cpudl.free_cpus
- * @cp: the cpudl max-heap context
- * @cpu: rd attached CPU
- */
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
-{
- cpumask_clear_cpu(cpu, cp->free_cpus);
-}
-
/*
* cpudl_init - initialize the cpudl structure
* @cp: the cpudl max-heap context
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 0adeda93b5fb..ecff718d94ae 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -18,9 +18,7 @@ struct cpudl {
#ifdef CONFIG_SMP
int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
-void cpudl_clear(struct cpudl *cp, int cpu);
+void cpudl_clear(struct cpudl *cp, int cpu, bool online);
int cpudl_init(struct cpudl *cp);
-void cpudl_set_freecpu(struct cpudl *cp, int cpu);
-void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
#endif /* CONFIG_SMP */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6548bd90c5c3..85e4ef476686 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1414,7 +1414,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (!dl_rq->dl_nr_running) {
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
- cpudl_clear(&rq->rd->cpudl, rq->cpu);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online);
} else {
struct rb_node *leftmost = dl_rq->root.rb_leftmost;
struct sched_dl_entity *entry;
@@ -2349,9 +2349,10 @@ static void rq_online_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_set_overload(rq);
- cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
if (rq->dl.dl_nr_running > 0)
cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
+ else
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, true);
}
/* Assumes rq->lock is held */
@@ -2360,8 +2361,7 @@ static void rq_offline_dl(struct rq *rq)
if (rq->dl.overloaded)
dl_clear_overload(rq);
- cpudl_clear(&rq->rd->cpudl, rq->cpu);
- cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
+ cpudl_clear(&rq->rd->cpudl, rq->cpu, false);
}
void __init init_sched_dl_class(void)
--
2.34.1
From: Kairui Song <kasong(a)tencent.com>
Since commit 1b7e90020eb77 ("mm, swap: use percpu cluster as allocation
fast path"), swap allocation is protected by a local lock, which means
we can't do any sleeping calls during allocation.
However, the discard routine does not take this into account. When
the swap allocator failed to find any usable cluster, it would look
at the pending discard clusters and try to issue some blocking
discards. It may not necessarily sleep, but the cond_resched at the
bio layer indicates this is wrong when combined with a local lock.
And the GFP flag used for the discard bio is also wrong (it is not
atomic).
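For context, a minimal sketch of the constraint (assuming non-RT
semantics, where local_lock() disables preemption): nothing that may
schedule can run between the lock/unlock pair, yet the synchronous
discard path ends up in blkdev_issue_discard()/submit_bio_wait(),
which may sleep:

  local_lock(&percpu_swap_cluster.lock);   /* preemption disabled */
  /* ... cluster allocation fast/slow path ... */
  /* issuing a blocking discard here may schedule -> invalid */
  local_unlock(&percpu_swap_cluster.lock);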
It's arguable whether this synchronous discard is helpful at all. In
most cases, the async discard is good enough. And the swap allocator
organizes the clusters very differently since the recent change, so
it is very rare to see discard clusters piling up.
So far, no issues have been observed or reported with typical SSD
setups under months of high pressure. This issue was found during my
code review. But by hacking the kernel a bit (adding an mdelay(500)
in the async discard path), the issue becomes observable on debug
builds, with WARNINGs triggered by the wrong GFP flag and by the
cond_resched in the bio layer.
So now let's apply a hotfix for this issue: remove the synchronous
discard from the swap allocation path. And when an order-0 allocation
fails with the cluster lists drained on all swap devices, try to do a
discard following the swap device priority list. If any discard
released some clusters, try the allocation again. This way, we can
still avoid OOM due to swap failure if the hardware is very slow and
memory pressure is extremely high.
This may cause more fragmentation issues if the discarding hardware is
really slow. Ideally, we want to discard pending clusters before
continuing to iterate the fragment cluster lists. This can be
implemented in a cleaner way if we clean up the device list iteration
part first.
Cc: stable(a)vger.kernel.org
Fixes: 1b7e90020eb77 ("mm, swap: use percpu cluster as allocation fast path")
Acked-by: Nhat Pham <nphamcs(a)gmail.com>
Signed-off-by: Kairui Song <kasong(a)tencent.com>
---
mm/swapfile.c | 40 +++++++++++++++++++++++++++++++++-------
1 file changed, 33 insertions(+), 7 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index cb2392ed8e0e..33e0bd905c55 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1101,13 +1101,6 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
goto done;
}
- /*
- * We don't have free cluster but have some clusters in discarding,
- * do discard now and reclaim them.
- */
- if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
- goto new_cluster;
-
if (order)
goto done;
@@ -1394,6 +1387,33 @@ static bool swap_alloc_slow(swp_entry_t *entry,
return false;
}
+/*
+ * Discard pending clusters in a synchronized way when under high pressure.
+ * Return: true if any cluster is discarded.
+ */
+static bool swap_sync_discard(void)
+{
+ bool ret = false;
+ int nid = numa_node_id();
+ struct swap_info_struct *si, *next;
+
+ spin_lock(&swap_avail_lock);
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[nid], avail_lists[nid]) {
+ spin_unlock(&swap_avail_lock);
+ if (get_swap_device_info(si)) {
+ if (si->flags & SWP_PAGE_DISCARD)
+ ret = swap_do_scheduled_discard(si);
+ put_swap_device(si);
+ }
+ if (ret)
+ return true;
+ spin_lock(&swap_avail_lock);
+ }
+ spin_unlock(&swap_avail_lock);
+
+ return false;
+}
+
/**
* folio_alloc_swap - allocate swap space for a folio
* @folio: folio we want to move to swap
@@ -1432,11 +1452,17 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
}
}
+again:
local_lock(&percpu_swap_cluster.lock);
if (!swap_alloc_fast(&entry, order))
swap_alloc_slow(&entry, order);
local_unlock(&percpu_swap_cluster.lock);
+ if (unlikely(!order && !entry.val)) {
+ if (swap_sync_discard())
+ goto again;
+ }
+
/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
if (mem_cgroup_try_charge_swap(folio, entry))
goto out_free;
--
2.51.0