The patch below does not apply to the 6.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.17.y
git checkout FETCH_HEAD
git cherry-pick -x 2e454fb8056df6da4bba7d89a57bf60e217463c0
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101516-stowaway-bogged-2173@gregkh' --subject-prefix 'PATCH 6.17.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2e454fb8056df6da4bba7d89a57bf60e217463c0 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang(a)intel.com>
Date: Fri, 29 Aug 2025 15:29:06 -0700
Subject: [PATCH] cxl, acpi/hmat: Update CXL access coordinates directly
instead of through HMAT
The current implementation of CXL memory hotplug notifier gets called
before the HMAT memory hotplug notifier. The CXL driver calculates the
access coordinates (bandwidth and latency values) for the CXL end to
end path (i.e. CPU to endpoint). When the CXL region is onlined, the CXL
memory hotplug notifier writes the access coordinates to the HMAT target
structs. Then the HMAT memory hotplug notifier is called and it creates
the access coordinates for the node sysfs attributes.
During testing on an Intel platform, it was found that although the
newly calculated coordinates were pushed to sysfs, the sysfs attributes for
the access coordinates showed up with the wrong initiator. The system has
4 nodes (0, 1, 2, 3) where node 0 and 1 are CPU nodes and node 2 and 3 are
CXL nodes. The expectation is that node 2 would show up as a target to node
0:
/sys/devices/system/node/node2/access0/initiators/node0
However it was observed that node 2 showed up as a target under node 1:
/sys/devices/system/node/node2/access0/initiators/node1
The original intent of the 'ext_updated' flag in HMAT handling code was to
stop HMAT memory hotplug callback from clobbering the access coordinates
after CXL has injected its calculated coordinates and replaced the generic
target access coordinates provided by the HMAT table in the HMAT target
structs. However the flag is hacky at best and blocks the updates from
other CXL regions that are onlined in the same node later on. Remove the
'ext_updated' flag usage and just update the access coordinates for the
nodes directly without touching HMAT target data.
The hotplug memory callback ordering is changed. Instead of changing CXL,
move HMAT back so there's room for the levels rather than have CXL share
the same level as SLAB_CALLBACK_PRI. The change will resulting in the CXL
callback to be executed after the HMAT callback.
With the change, the CXL hotplug memory notifier runs after the HMAT
callback. The HMAT callback will create the node sysfs attributes for
access coordinates. The CXL callback will write the access coordinates to
the now created node sysfs attributes directly and will not pollute the
HMAT target values.
A nodemask is introduced to keep track if a node has been updated and
prevents further updates.
Fixes: 067353a46d8c ("cxl/region: Add memory hotplug notifier for cxl region")
Cc: stable(a)vger.kernel.org
Tested-by: Marc Herbert <marc.herbert(a)linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron(a)huawei.com>
Link: https://patch.msgid.link/20250829222907.1290912-4-dave.jiang@intel.com
Signed-off-by: Dave Jiang <dave.jiang(a)intel.com>
diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index 4958301f5417..5d32490dc4ab 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -74,7 +74,6 @@ struct memory_target {
struct node_cache_attrs cache_attrs;
u8 gen_port_device_handle[ACPI_SRAT_DEVICE_HANDLE_SIZE];
bool registered;
- bool ext_updated; /* externally updated */
};
struct memory_initiator {
@@ -391,7 +390,6 @@ int hmat_update_target_coordinates(int nid, struct access_coordinate *coord,
coord->read_bandwidth, access);
hmat_update_target_access(target, ACPI_HMAT_WRITE_BANDWIDTH,
coord->write_bandwidth, access);
- target->ext_updated = true;
return 0;
}
@@ -773,10 +771,6 @@ static void hmat_update_target_attrs(struct memory_target *target,
u32 best = 0;
int i;
- /* Don't update if an external agent has changed the data. */
- if (target->ext_updated)
- return;
-
/* Don't update for generic port if there's no device handle */
if ((access == NODE_ACCESS_CLASS_GENPORT_SINK_LOCAL ||
access == NODE_ACCESS_CLASS_GENPORT_SINK_CPU) &&
diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
index c0af645425f4..c891fd618cfd 100644
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -1081,8 +1081,3 @@ int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr,
{
return hmat_update_target_coordinates(nid, &cxlr->coord[access], access);
}
-
-bool cxl_need_node_perf_attrs_update(int nid)
-{
- return !acpi_node_backed_by_real_pxm(nid);
-}
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 2669f251d677..a253d308f3c9 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -139,7 +139,6 @@ long cxl_pci_get_latency(struct pci_dev *pdev);
int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c);
int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr,
enum access_coordinate_class access);
-bool cxl_need_node_perf_attrs_update(int nid);
int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
struct access_coordinate *c);
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 71cc42d05248..0ed95cbc5d5b 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -30,6 +30,12 @@
* 3. Decoder targets
*/
+/*
+ * nodemask that sets per node when the access_coordinates for the node has
+ * been updated by the CXL memory hotplug notifier.
+ */
+static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
+
static struct cxl_region *to_cxl_region(struct device *dev);
#define __ACCESS_ATTR_RO(_level, _name) { \
@@ -2442,14 +2448,8 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid)
for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) {
if (cxlr->coord[i].read_bandwidth) {
- rc = 0;
- if (cxl_need_node_perf_attrs_update(nid))
- node_set_perf_attrs(nid, &cxlr->coord[i], i);
- else
- rc = cxl_update_hmat_access_coordinates(nid, cxlr, i);
-
- if (rc == 0)
- cset++;
+ node_update_perf_attrs(nid, &cxlr->coord[i], i);
+ cset++;
}
}
@@ -2487,6 +2487,10 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb,
if (nid != region_nid)
return NOTIFY_DONE;
+ /* No action needed if node bit already set */
+ if (node_test_and_set(nid, nodemask_region_seen))
+ return NOTIFY_DONE;
+
if (!cxl_region_update_coordinates(cxlr, nid))
return NOTIFY_DONE;
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 1305102688d0..0b755d1ef1ec 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -120,8 +120,8 @@ struct mem_section;
*/
#define DEFAULT_CALLBACK_PRI 0
#define SLAB_CALLBACK_PRI 1
-#define HMAT_CALLBACK_PRI 2
#define CXL_CALLBACK_PRI 5
+#define HMAT_CALLBACK_PRI 6
#define MM_COMPUTE_BATCH_PRI 10
#define CPUSET_CALLBACK_PRI 10
#define MEMTIER_HOTPLUG_PRI 100
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 6eb350a2233100a283f882c023e5ad426d0ed63b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101556-district-timid-1eda@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6eb350a2233100a283f882c023e5ad426d0ed63b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Wed, 13 Aug 2025 17:02:30 +0200
Subject: [PATCH] rseq: Protect event mask against membarrier IPI
rseq_need_restart() reads and clears task::rseq_event_mask with preemption
disabled to guard against the scheduler.
But membarrier() uses an IPI and sets the PREEMPT bit in the event mask
from the IPI, which leaves that RMW operation unprotected.
Use guard(irq) if CONFIG_MEMBARRIER is enabled to fix that.
Fixes: 2a36ab717e8f ("rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Boqun Feng <boqun.feng(a)gmail.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: stable(a)vger.kernel.org
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index bc8af3eb5598..1fbeb61babeb 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -7,6 +7,12 @@
#include <linux/preempt.h>
#include <linux/sched.h>
+#ifdef CONFIG_MEMBARRIER
+# define RSEQ_EVENT_GUARD irq
+#else
+# define RSEQ_EVENT_GUARD preempt
+#endif
+
/*
* Map the event mask on the user-space ABI enum rseq_cs_flags
* for direct mask checks.
@@ -41,9 +47,8 @@ static inline void rseq_handle_notify_resume(struct ksignal *ksig,
static inline void rseq_signal_deliver(struct ksignal *ksig,
struct pt_regs *regs)
{
- preempt_disable();
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask);
- preempt_enable();
+ scoped_guard(RSEQ_EVENT_GUARD)
+ __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask);
rseq_handle_notify_resume(ksig, regs);
}
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b7a1ec327e81..2452b7366b00 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
/*
* Load and clear event mask atomically with respect to
- * scheduler preemption.
+ * scheduler preemption and membarrier IPIs.
*/
- preempt_disable();
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- preempt_enable();
+ scoped_guard(RSEQ_EVENT_GUARD) {
+ event_mask = t->rseq_event_mask;
+ t->rseq_event_mask = 0;
+ }
return !!event_mask;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 6eb350a2233100a283f882c023e5ad426d0ed63b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101556-riches-vivacious-ee8f@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6eb350a2233100a283f882c023e5ad426d0ed63b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx(a)linutronix.de>
Date: Wed, 13 Aug 2025 17:02:30 +0200
Subject: [PATCH] rseq: Protect event mask against membarrier IPI
rseq_need_restart() reads and clears task::rseq_event_mask with preemption
disabled to guard against the scheduler.
But membarrier() uses an IPI and sets the PREEMPT bit in the event mask
from the IPI, which leaves that RMW operation unprotected.
Use guard(irq) if CONFIG_MEMBARRIER is enabled to fix that.
Fixes: 2a36ab717e8f ("rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ")
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Boqun Feng <boqun.feng(a)gmail.com>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: stable(a)vger.kernel.org
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index bc8af3eb5598..1fbeb61babeb 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -7,6 +7,12 @@
#include <linux/preempt.h>
#include <linux/sched.h>
+#ifdef CONFIG_MEMBARRIER
+# define RSEQ_EVENT_GUARD irq
+#else
+# define RSEQ_EVENT_GUARD preempt
+#endif
+
/*
* Map the event mask on the user-space ABI enum rseq_cs_flags
* for direct mask checks.
@@ -41,9 +47,8 @@ static inline void rseq_handle_notify_resume(struct ksignal *ksig,
static inline void rseq_signal_deliver(struct ksignal *ksig,
struct pt_regs *regs)
{
- preempt_disable();
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask);
- preempt_enable();
+ scoped_guard(RSEQ_EVENT_GUARD)
+ __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask);
rseq_handle_notify_resume(ksig, regs);
}
diff --git a/kernel/rseq.c b/kernel/rseq.c
index b7a1ec327e81..2452b7366b00 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
/*
* Load and clear event mask atomically with respect to
- * scheduler preemption.
+ * scheduler preemption and membarrier IPIs.
*/
- preempt_disable();
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- preempt_enable();
+ scoped_guard(RSEQ_EVENT_GUARD) {
+ event_mask = t->rseq_event_mask;
+ t->rseq_event_mask = 0;
+ }
return !!event_mask;
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 5973a62efa34c80c9a4e5eac1fca6f6209b902af
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101551-unsealed-whisking-8514@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5973a62efa34c80c9a4e5eac1fca6f6209b902af Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov(a)fb.com>
Date: Fri, 19 Sep 2025 14:27:51 -0700
Subject: [PATCH] arm64: map [_text, _stext) virtual address range
non-executable+read-only
Since the referenced fixes commit, the kernel's .text section is only
mapped starting from _stext; the region [_text, _stext) is omitted. As a
result, other vmalloc/vmap allocations may use the virtual addresses
nominally in the range [_text, _stext). This address reuse confuses
multiple things:
1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
mapping the entire range [_text, _end) to
[__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
[_text, _stext) from /proc/vmcore therefore gives the incorrect
result.
2. Tools doing symbolization (either by reading /proc/kallsyms or based
on the vmlinux ELF file) will incorrectly identify vmalloc/vmap
allocations in [_text, _stext) as kernel symbols.
In practice, both of these issues affect the drgn debugger.
Specifically, there were cases where the vmap IRQ stacks for some CPUs
were allocated in [_text, _stext). As a result, drgn could not get the
stack trace for a crash in an IRQ handler because the core dump
contained invalid data for the IRQ stack address. The stack addresses
were also symbolized as being in the _text symbol.
Fix this by bringing back the mapping of [_text, _stext), but now make
it non-executable and read-only. This prevents other allocations from
using it while still achieving the original goal of not mapping
unpredictable data as executable. Other than the changed protection,
this is effectively a revert of the fixes commit.
Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping")
Cc: stable(a)vger.kernel.org
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c
index e6d35eff1486..e8ddbde31a83 100644
--- a/arch/arm64/kernel/pi/map_kernel.c
+++ b/arch/arm64/kernel/pi/map_kernel.c
@@ -78,6 +78,12 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
twopass |= enable_scs;
prot = twopass ? data_prot : text_prot;
+ /*
+ * [_stext, _text) isn't executed after boot and contains some
+ * non-executable, unpredictable data, so map it non-executable.
+ */
+ map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot,
+ false, root_level);
map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot,
!twopass, root_level);
map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata,
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 77c7926a4df6..23c05dc7a8f2 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -214,7 +214,7 @@ static void __init request_standard_resources(void)
unsigned long i = 0;
size_t res_size;
- kernel_code.start = __pa_symbol(_stext);
+ kernel_code.start = __pa_symbol(_text);
kernel_code.end = __pa_symbol(__init_begin - 1);
kernel_data.start = __pa_symbol(_sdata);
kernel_data.end = __pa_symbol(_end - 1);
@@ -280,7 +280,7 @@ u64 cpu_logical_map(unsigned int cpu)
void __init __no_sanitize_address setup_arch(char **cmdline_p)
{
- setup_initial_init_mm(_stext, _etext, _edata, _end);
+ setup_initial_init_mm(_text, _etext, _edata, _end);
*cmdline_p = boot_command_line;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 70c2ca813c18..524d34a0e921 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -279,7 +279,7 @@ void __init arm64_memblock_init(void)
* Register the kernel text, kernel data, initrd, and initial
* pagetables with memblock.
*/
- memblock_reserve(__pa_symbol(_stext), _end - _stext);
+ memblock_reserve(__pa_symbol(_text), _end - _text);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
/* the generic initrd code expects virtual addresses */
initrd_start = __phys_to_virt(phys_initrd_start);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 0ba1a15e7e74..10c258099581 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -965,8 +965,8 @@ void __init mark_linear_text_alias_ro(void)
/*
* Remove the write permissions from the linear alias of .text/.rodata
*/
- update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
- (unsigned long)__init_begin - (unsigned long)_stext,
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
+ (unsigned long)__init_begin - (unsigned long)_text,
PAGE_KERNEL_RO);
}
@@ -1037,7 +1037,7 @@ static inline bool force_pte_mapping(void)
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
- phys_addr_t kernel_start = __pa_symbol(_stext);
+ phys_addr_t kernel_start = __pa_symbol(_text);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
phys_addr_t start, end;
phys_addr_t early_kfence_pool;
@@ -1086,7 +1086,7 @@ static void __init map_mem(pgd_t *pgdp)
}
/*
- * Map the linear alias of the [_stext, __init_begin) interval
+ * Map the linear alias of the [_text, __init_begin) interval
* as non-executable now, and remove the write permission in
* mark_linear_text_alias_ro() below (which will be called after
* alternative patching has completed). This makes the contents
@@ -1113,6 +1113,10 @@ void mark_rodata_ro(void)
WRITE_ONCE(rodata_is_rw, false);
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
section_size, PAGE_KERNEL_RO);
+ /* mark the range between _text and _stext as read only. */
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)_text,
+ (unsigned long)_stext - (unsigned long)_text,
+ PAGE_KERNEL_RO);
}
static void __init declare_vma(struct vm_struct *vma,
@@ -1183,7 +1187,7 @@ static void __init declare_kernel_vmas(void)
{
static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
- declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
+ declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 5973a62efa34c80c9a4e5eac1fca6f6209b902af
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101549-slurp-darkened-955a@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5973a62efa34c80c9a4e5eac1fca6f6209b902af Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov(a)fb.com>
Date: Fri, 19 Sep 2025 14:27:51 -0700
Subject: [PATCH] arm64: map [_text, _stext) virtual address range
non-executable+read-only
Since the referenced fixes commit, the kernel's .text section is only
mapped starting from _stext; the region [_text, _stext) is omitted. As a
result, other vmalloc/vmap allocations may use the virtual addresses
nominally in the range [_text, _stext). This address reuse confuses
multiple things:
1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
mapping the entire range [_text, _end) to
[__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
[_text, _stext) from /proc/vmcore therefore gives the incorrect
result.
2. Tools doing symbolization (either by reading /proc/kallsyms or based
on the vmlinux ELF file) will incorrectly identify vmalloc/vmap
allocations in [_text, _stext) as kernel symbols.
In practice, both of these issues affect the drgn debugger.
Specifically, there were cases where the vmap IRQ stacks for some CPUs
were allocated in [_text, _stext). As a result, drgn could not get the
stack trace for a crash in an IRQ handler because the core dump
contained invalid data for the IRQ stack address. The stack addresses
were also symbolized as being in the _text symbol.
Fix this by bringing back the mapping of [_text, _stext), but now make
it non-executable and read-only. This prevents other allocations from
using it while still achieving the original goal of not mapping
unpredictable data as executable. Other than the changed protection,
this is effectively a revert of the fixes commit.
Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping")
Cc: stable(a)vger.kernel.org
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c
index e6d35eff1486..e8ddbde31a83 100644
--- a/arch/arm64/kernel/pi/map_kernel.c
+++ b/arch/arm64/kernel/pi/map_kernel.c
@@ -78,6 +78,12 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
twopass |= enable_scs;
prot = twopass ? data_prot : text_prot;
+ /*
+ * [_stext, _text) isn't executed after boot and contains some
+ * non-executable, unpredictable data, so map it non-executable.
+ */
+ map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot,
+ false, root_level);
map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot,
!twopass, root_level);
map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata,
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 77c7926a4df6..23c05dc7a8f2 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -214,7 +214,7 @@ static void __init request_standard_resources(void)
unsigned long i = 0;
size_t res_size;
- kernel_code.start = __pa_symbol(_stext);
+ kernel_code.start = __pa_symbol(_text);
kernel_code.end = __pa_symbol(__init_begin - 1);
kernel_data.start = __pa_symbol(_sdata);
kernel_data.end = __pa_symbol(_end - 1);
@@ -280,7 +280,7 @@ u64 cpu_logical_map(unsigned int cpu)
void __init __no_sanitize_address setup_arch(char **cmdline_p)
{
- setup_initial_init_mm(_stext, _etext, _edata, _end);
+ setup_initial_init_mm(_text, _etext, _edata, _end);
*cmdline_p = boot_command_line;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 70c2ca813c18..524d34a0e921 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -279,7 +279,7 @@ void __init arm64_memblock_init(void)
* Register the kernel text, kernel data, initrd, and initial
* pagetables with memblock.
*/
- memblock_reserve(__pa_symbol(_stext), _end - _stext);
+ memblock_reserve(__pa_symbol(_text), _end - _text);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
/* the generic initrd code expects virtual addresses */
initrd_start = __phys_to_virt(phys_initrd_start);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 0ba1a15e7e74..10c258099581 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -965,8 +965,8 @@ void __init mark_linear_text_alias_ro(void)
/*
* Remove the write permissions from the linear alias of .text/.rodata
*/
- update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
- (unsigned long)__init_begin - (unsigned long)_stext,
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
+ (unsigned long)__init_begin - (unsigned long)_text,
PAGE_KERNEL_RO);
}
@@ -1037,7 +1037,7 @@ static inline bool force_pte_mapping(void)
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
- phys_addr_t kernel_start = __pa_symbol(_stext);
+ phys_addr_t kernel_start = __pa_symbol(_text);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
phys_addr_t start, end;
phys_addr_t early_kfence_pool;
@@ -1086,7 +1086,7 @@ static void __init map_mem(pgd_t *pgdp)
}
/*
- * Map the linear alias of the [_stext, __init_begin) interval
+ * Map the linear alias of the [_text, __init_begin) interval
* as non-executable now, and remove the write permission in
* mark_linear_text_alias_ro() below (which will be called after
* alternative patching has completed). This makes the contents
@@ -1113,6 +1113,10 @@ void mark_rodata_ro(void)
WRITE_ONCE(rodata_is_rw, false);
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
section_size, PAGE_KERNEL_RO);
+ /* mark the range between _text and _stext as read only. */
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)_text,
+ (unsigned long)_stext - (unsigned long)_text,
+ PAGE_KERNEL_RO);
}
static void __init declare_vma(struct vm_struct *vma,
@@ -1183,7 +1187,7 @@ static void __init declare_kernel_vmas(void)
{
static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
- declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
+ declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 5973a62efa34c80c9a4e5eac1fca6f6209b902af
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101548-stiffly-eagle-d9a5@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5973a62efa34c80c9a4e5eac1fca6f6209b902af Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov(a)fb.com>
Date: Fri, 19 Sep 2025 14:27:51 -0700
Subject: [PATCH] arm64: map [_text, _stext) virtual address range
non-executable+read-only
Since the referenced fixes commit, the kernel's .text section is only
mapped starting from _stext; the region [_text, _stext) is omitted. As a
result, other vmalloc/vmap allocations may use the virtual addresses
nominally in the range [_text, _stext). This address reuse confuses
multiple things:
1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore
mapping the entire range [_text, _end) to
[__pa_symbol(_text), __pa_symbol(_end)). Reading an address in
[_text, _stext) from /proc/vmcore therefore gives the incorrect
result.
2. Tools doing symbolization (either by reading /proc/kallsyms or based
on the vmlinux ELF file) will incorrectly identify vmalloc/vmap
allocations in [_text, _stext) as kernel symbols.
In practice, both of these issues affect the drgn debugger.
Specifically, there were cases where the vmap IRQ stacks for some CPUs
were allocated in [_text, _stext). As a result, drgn could not get the
stack trace for a crash in an IRQ handler because the core dump
contained invalid data for the IRQ stack address. The stack addresses
were also symbolized as being in the _text symbol.
Fix this by bringing back the mapping of [_text, _stext), but now make
it non-executable and read-only. This prevents other allocations from
using it while still achieving the original goal of not mapping
unpredictable data as executable. Other than the changed protection,
this is effectively a revert of the fixes commit.
Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping")
Cc: stable(a)vger.kernel.org
Signed-off-by: Omar Sandoval <osandov(a)fb.com>
Signed-off-by: Will Deacon <will(a)kernel.org>
diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c
index e6d35eff1486..e8ddbde31a83 100644
--- a/arch/arm64/kernel/pi/map_kernel.c
+++ b/arch/arm64/kernel/pi/map_kernel.c
@@ -78,6 +78,12 @@ static void __init map_kernel(u64 kaslr_offset, u64 va_offset, int root_level)
twopass |= enable_scs;
prot = twopass ? data_prot : text_prot;
+ /*
+ * [_stext, _text) isn't executed after boot and contains some
+ * non-executable, unpredictable data, so map it non-executable.
+ */
+ map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot,
+ false, root_level);
map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot,
!twopass, root_level);
map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata,
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 77c7926a4df6..23c05dc7a8f2 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -214,7 +214,7 @@ static void __init request_standard_resources(void)
unsigned long i = 0;
size_t res_size;
- kernel_code.start = __pa_symbol(_stext);
+ kernel_code.start = __pa_symbol(_text);
kernel_code.end = __pa_symbol(__init_begin - 1);
kernel_data.start = __pa_symbol(_sdata);
kernel_data.end = __pa_symbol(_end - 1);
@@ -280,7 +280,7 @@ u64 cpu_logical_map(unsigned int cpu)
void __init __no_sanitize_address setup_arch(char **cmdline_p)
{
- setup_initial_init_mm(_stext, _etext, _edata, _end);
+ setup_initial_init_mm(_text, _etext, _edata, _end);
*cmdline_p = boot_command_line;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 70c2ca813c18..524d34a0e921 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -279,7 +279,7 @@ void __init arm64_memblock_init(void)
* Register the kernel text, kernel data, initrd, and initial
* pagetables with memblock.
*/
- memblock_reserve(__pa_symbol(_stext), _end - _stext);
+ memblock_reserve(__pa_symbol(_text), _end - _text);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
/* the generic initrd code expects virtual addresses */
initrd_start = __phys_to_virt(phys_initrd_start);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 0ba1a15e7e74..10c258099581 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -965,8 +965,8 @@ void __init mark_linear_text_alias_ro(void)
/*
* Remove the write permissions from the linear alias of .text/.rodata
*/
- update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext),
- (unsigned long)__init_begin - (unsigned long)_stext,
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text),
+ (unsigned long)__init_begin - (unsigned long)_text,
PAGE_KERNEL_RO);
}
@@ -1037,7 +1037,7 @@ static inline bool force_pte_mapping(void)
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
- phys_addr_t kernel_start = __pa_symbol(_stext);
+ phys_addr_t kernel_start = __pa_symbol(_text);
phys_addr_t kernel_end = __pa_symbol(__init_begin);
phys_addr_t start, end;
phys_addr_t early_kfence_pool;
@@ -1086,7 +1086,7 @@ static void __init map_mem(pgd_t *pgdp)
}
/*
- * Map the linear alias of the [_stext, __init_begin) interval
+ * Map the linear alias of the [_text, __init_begin) interval
* as non-executable now, and remove the write permission in
* mark_linear_text_alias_ro() below (which will be called after
* alternative patching has completed). This makes the contents
@@ -1113,6 +1113,10 @@ void mark_rodata_ro(void)
WRITE_ONCE(rodata_is_rw, false);
update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
section_size, PAGE_KERNEL_RO);
+ /* mark the range between _text and _stext as read only. */
+ update_mapping_prot(__pa_symbol(_text), (unsigned long)_text,
+ (unsigned long)_stext - (unsigned long)_text,
+ PAGE_KERNEL_RO);
}
static void __init declare_vma(struct vm_struct *vma,
@@ -1183,7 +1187,7 @@ static void __init declare_kernel_vmas(void)
{
static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
- declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
+ declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 8e7e265d558e0257d6dacc78ec64aff4ba75f61e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101502-elf-bootie-19bf@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8e7e265d558e0257d6dacc78ec64aff4ba75f61e Mon Sep 17 00:00:00 2001
From: Charalampos Mitrodimas <charmitro(a)posteo.net>
Date: Sat, 16 Aug 2025 14:14:37 +0000
Subject: [PATCH] debugfs: fix mount options not being applied
Mount options (uid, gid, mode) are silently ignored when debugfs is
mounted. This is a regression introduced during the conversion to the
new mount API.
When the mount API conversion was done, the parsed options were never
applied to the superblock when it was reused. As a result, the mount
options were ignored when debugfs was mounted.
Fix this by following the same pattern as the tracefs fix in commit
e4d32142d1de ("tracing: Fix tracefs mount options"). Call
debugfs_reconfigure() in debugfs_get_tree() to apply the mount options
to the superblock after it has been created or reused.
As an example, with the bug the "mode" mount option is ignored:
$ mount -o mode=0666 -t debugfs debugfs /tmp/debugfs_test
$ mount | grep debugfs_test
debugfs on /tmp/debugfs_test type debugfs (rw,relatime)
$ ls -ld /tmp/debugfs_test
drwx------ 25 root root 0 Aug 4 14:16 /tmp/debugfs_test
With the fix applied, it works as expected:
$ mount -o mode=0666 -t debugfs debugfs /tmp/debugfs_test
$ mount | grep debugfs_test
debugfs on /tmp/debugfs_test type debugfs (rw,relatime,mode=666)
$ ls -ld /tmp/debugfs_test
drw-rw-rw- 37 root root 0 Aug 2 17:28 /tmp/debugfs_test
Fixes: a20971c18752 ("vfs: Convert debugfs to use the new mount API")
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220406
Cc: stable(a)vger.kernel.org
Reviewed-by: Eric Sandeen <sandeen(a)redhat.com>
Signed-off-by: Charalampos Mitrodimas <charmitro(a)posteo.net>
Link: https://lore.kernel.org/20250816-debugfs-mount-opts-v3-1-d271dad57b5b@poste…
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a0357b0cf362..c12d649df6a5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -183,6 +183,9 @@ static int debugfs_reconfigure(struct fs_context *fc)
struct debugfs_fs_info *sb_opts = sb->s_fs_info;
struct debugfs_fs_info *new_opts = fc->s_fs_info;
+ if (!new_opts)
+ return 0;
+
sync_filesystem(sb);
/* structure copy of new mount options to sb */
@@ -282,10 +285,16 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
static int debugfs_get_tree(struct fs_context *fc)
{
+ int err;
+
if (!(debugfs_allow & DEBUGFS_ALLOW_API))
return -EPERM;
- return get_tree_single(fc, debugfs_fill_super);
+ err = get_tree_single(fc, debugfs_fill_super);
+ if (err)
+ return err;
+
+ return debugfs_reconfigure(fc);
}
static void debugfs_free_fc(struct fs_context *fc)
The patch below does not apply to the 6.17-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.17.y
git checkout FETCH_HEAD
git cherry-pick -x 8e7e265d558e0257d6dacc78ec64aff4ba75f61e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101559-usher-corroding-5b5a@gregkh' --subject-prefix 'PATCH 6.17.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8e7e265d558e0257d6dacc78ec64aff4ba75f61e Mon Sep 17 00:00:00 2001
From: Charalampos Mitrodimas <charmitro(a)posteo.net>
Date: Sat, 16 Aug 2025 14:14:37 +0000
Subject: [PATCH] debugfs: fix mount options not being applied
Mount options (uid, gid, mode) are silently ignored when debugfs is
mounted. This is a regression introduced during the conversion to the
new mount API.
When the mount API conversion was done, the parsed options were never
applied to the superblock when it was reused. As a result, the mount
options were ignored when debugfs was mounted.
Fix this by following the same pattern as the tracefs fix in commit
e4d32142d1de ("tracing: Fix tracefs mount options"). Call
debugfs_reconfigure() in debugfs_get_tree() to apply the mount options
to the superblock after it has been created or reused.
As an example, with the bug the "mode" mount option is ignored:
$ mount -o mode=0666 -t debugfs debugfs /tmp/debugfs_test
$ mount | grep debugfs_test
debugfs on /tmp/debugfs_test type debugfs (rw,relatime)
$ ls -ld /tmp/debugfs_test
drwx------ 25 root root 0 Aug 4 14:16 /tmp/debugfs_test
With the fix applied, it works as expected:
$ mount -o mode=0666 -t debugfs debugfs /tmp/debugfs_test
$ mount | grep debugfs_test
debugfs on /tmp/debugfs_test type debugfs (rw,relatime,mode=666)
$ ls -ld /tmp/debugfs_test
drw-rw-rw- 37 root root 0 Aug 2 17:28 /tmp/debugfs_test
Fixes: a20971c18752 ("vfs: Convert debugfs to use the new mount API")
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220406
Cc: stable(a)vger.kernel.org
Reviewed-by: Eric Sandeen <sandeen(a)redhat.com>
Signed-off-by: Charalampos Mitrodimas <charmitro(a)posteo.net>
Link: https://lore.kernel.org/20250816-debugfs-mount-opts-v3-1-d271dad57b5b@poste…
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index a0357b0cf362..c12d649df6a5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -183,6 +183,9 @@ static int debugfs_reconfigure(struct fs_context *fc)
struct debugfs_fs_info *sb_opts = sb->s_fs_info;
struct debugfs_fs_info *new_opts = fc->s_fs_info;
+ if (!new_opts)
+ return 0;
+
sync_filesystem(sb);
/* structure copy of new mount options to sb */
@@ -282,10 +285,16 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
static int debugfs_get_tree(struct fs_context *fc)
{
+ int err;
+
if (!(debugfs_allow & DEBUGFS_ALLOW_API))
return -EPERM;
- return get_tree_single(fc, debugfs_fill_super);
+ err = get_tree_single(fc, debugfs_fill_super);
+ if (err)
+ return err;
+
+ return debugfs_reconfigure(fc);
}
static void debugfs_free_fc(struct fs_context *fc)
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 72d271a7baa7062cb27e774ac37c5459c6d20e22
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025101537-visor-thank-7800@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 72d271a7baa7062cb27e774ac37c5459c6d20e22 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar(a)cyphar.com>
Date: Thu, 7 Aug 2025 03:55:23 +1000
Subject: [PATCH] fscontext: do not consume log entries when returning
-EMSGSIZE
Userspace generally expects APIs that return -EMSGSIZE to allow for them
to adjust their buffer size and retry the operation. However, the
fscontext log would previously clear the message even in the -EMSGSIZE
case.
Given that it is very cheap for us to check whether the buffer is too
small before we remove the message from the ring buffer, let's just do
that instead. While we're at it, refactor some fscontext_read() into a
separate helper to make the ring buffer logic a bit easier to read.
Fixes: 007ec26cdc9f ("vfs: Implement logging through fs_context")
Cc: David Howells <dhowells(a)redhat.com>
Cc: stable(a)vger.kernel.org # v5.2+
Signed-off-by: Aleksa Sarai <cyphar(a)cyphar.com>
Link: https://lore.kernel.org/20250807-fscontext-log-cleanups-v3-1-8d91d6242dc3@c…
Signed-off-by: Christian Brauner <brauner(a)kernel.org>
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 1aaf4cb2afb2..f645c99204eb 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -18,50 +18,56 @@
#include "internal.h"
#include "mount.h"
+static inline const char *fetch_message_locked(struct fc_log *log, size_t len,
+ bool *need_free)
+{
+ const char *p;
+ int index;
+
+ if (unlikely(log->head == log->tail))
+ return ERR_PTR(-ENODATA);
+
+ index = log->tail & (ARRAY_SIZE(log->buffer) - 1);
+ p = log->buffer[index];
+ if (unlikely(strlen(p) > len))
+ return ERR_PTR(-EMSGSIZE);
+
+ log->buffer[index] = NULL;
+ *need_free = log->need_free & (1 << index);
+ log->need_free &= ~(1 << index);
+ log->tail++;
+
+ return p;
+}
+
/*
* Allow the user to read back any error, warning or informational messages.
+ * Only one message is returned for each read(2) call.
*/
static ssize_t fscontext_read(struct file *file,
char __user *_buf, size_t len, loff_t *pos)
{
struct fs_context *fc = file->private_data;
- struct fc_log *log = fc->log.log;
- unsigned int logsize = ARRAY_SIZE(log->buffer);
- ssize_t ret;
- char *p;
+ ssize_t err;
+ const char *p __free(kfree) = NULL, *message;
bool need_free;
- int index, n;
+ int n;
- ret = mutex_lock_interruptible(&fc->uapi_mutex);
- if (ret < 0)
- return ret;
-
- if (log->head == log->tail) {
- mutex_unlock(&fc->uapi_mutex);
- return -ENODATA;
- }
-
- index = log->tail & (logsize - 1);
- p = log->buffer[index];
- need_free = log->need_free & (1 << index);
- log->buffer[index] = NULL;
- log->need_free &= ~(1 << index);
- log->tail++;
+ err = mutex_lock_interruptible(&fc->uapi_mutex);
+ if (err < 0)
+ return err;
+ message = fetch_message_locked(fc->log.log, len, &need_free);
mutex_unlock(&fc->uapi_mutex);
+ if (IS_ERR(message))
+ return PTR_ERR(message);
- ret = -EMSGSIZE;
- n = strlen(p);
- if (n > len)
- goto err_free;
- ret = -EFAULT;
- if (copy_to_user(_buf, p, n) != 0)
- goto err_free;
- ret = n;
-
-err_free:
if (need_free)
- kfree(p);
- return ret;
+ p = message;
+
+ n = strlen(message);
+ if (copy_to_user(_buf, message, n))
+ return -EFAULT;
+ return n;
}
static int fscontext_release(struct inode *inode, struct file *file)