When get_block is called with a buffer_head allocated on the stack, such
as in do_mpage_readpage(), a buffer_head use-after-free can corrupt the
stack in the following race:
<CPU 0>                               <CPU 1>
mpage_read_folio
  <<bh on stack>>
  do_mpage_readpage
    exfat_get_block
      bh_read
        __bh_read
          get_bh(bh)
          submit_bh
          wait_on_buffer
          ...
                                      end_buffer_read_sync
                                        __end_buffer_read_notouch
                                          unlock_buffer
          <<keep going>>
  ...
<<bh is not valid out of mpage_read_folio>>
  .
  .
another_function
  <<variable A on stack>>
                                      put_bh(bh)
                                        atomic_dec(bh->b_count)
                                      *** stack corruption here ***
This patch makes exfat_get_block() return -EAGAIN when the folio has no
buffers and bh_read() would need to be called. The caller can then fall
back to a function such as block_read_full_folio(), which creates
buffer_heads in the folio and calls get_block again. This way, bh_read()
is never called on an on-stack buffer_head.
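A simplified sketch of the resulting fallback (illustrative only, not
part of the patch; in the real kernel, do_mpage_readpage() itself falls
back to block_read_full_folio() via its "confused" path when get_block
fails):

static int exfat_read_folio_sketch(struct folio *folio)
{
	/* Fast path: mpage read, which probes blocks with an on-stack bh. */
	int err = mpage_read_folio(folio, exfat_get_block);

	if (err == -EAGAIN) {
		/*
		 * block_read_full_folio() attaches buffer_heads to the
		 * folio, so the second call into exfat_get_block() lets
		 * bh_read() operate on a folio-backed buffer_head.
		 */
		err = block_read_full_folio(folio, exfat_get_block);
	}
	return err;
}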
Fixes: 11a347fb6cef ("exfat: change to get file size from DataLength")
Cc: stable(a)vger.kernel.org
Tested-by: Yeongjin Gil <youngjin.gil(a)samsung.com>
Signed-off-by: Sungjong Seo <sj1557.seo(a)samsung.com>
---
fs/exfat/inode.c | 37 ++++++++++++++++++++++++++++++++-----
1 file changed, 32 insertions(+), 5 deletions(-)
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 96952d4acb50..b8ea910586e4 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -344,7 +344,8 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
* The block has been partially written,
* zero the unwritten part and map the block.
*/
- loff_t size, off, pos;
+ loff_t size, pos;
+ void *addr;
max_blocks = 1;
@@ -355,17 +356,43 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
if (!bh_result->b_folio)
goto done;
+ /*
+ * No buffer_head is allocated.
+ * (1) bmap: It's enough to fill bh_result without I/O.
+ * (2) read: The unwritten part should be filled with zeros.
+ *           If a folio does not have any buffers, return
+ *           -EAGAIN to fall back to per-bh IO like
+ *           block_read_full_folio().
+ */
+ if (!folio_buffers(bh_result->b_folio)) {
+ err = -EAGAIN;
+ goto done;
+ }
+
pos = EXFAT_BLK_TO_B(iblock, sb);
size = ei->valid_size - pos;
- off = pos & (PAGE_SIZE - 1);
+ addr = folio_address(bh_result->b_folio) +
+ offset_in_folio(bh_result->b_folio, pos);
+
+ BUG_ON(size > sb->s_blocksize);
+
+ /* Check if bh->b_data points to proper addr in folio */
+ if (bh_result->b_data != addr) {
+ exfat_fs_error_ratelimit(sb,
+ "b_data(%p) != folio_addr(%p)",
+ bh_result->b_data, addr);
+ err = -EINVAL;
+ goto done;
+ }
- folio_set_bh(bh_result, bh_result->b_folio, off);
+ /* Read a block */
err = bh_read(bh_result, 0);
if (err < 0)
goto unlock_ret;
- folio_zero_segment(bh_result->b_folio, off + size,
- off + sb->s_blocksize);
+ /* Zero unwritten part of a block */
+ memset(bh_result->b_data + size, 0,
+ bh_result->b_size - size);
} else {
/*
* The range has not been written, clear the mapped flag
--
2.25.1
Even when X86_FEATURE_PKU and X86_FEATURE_OSPKE are available,
XFEATURE_PKRU can be missing.
In such a case, pkeys support has to be disabled to avoid a hang.
WARNING: CPU: 0 PID: 1 at arch/x86/kernel/fpu/xstate.c:1003 get_xsave_addr_user+0x28/0x40
(...)
Call Trace:
<TASK>
? get_xsave_addr_user+0x28/0x40
? __warn.cold+0x8e/0xea
? get_xsave_addr_user+0x28/0x40
? report_bug+0xff/0x140
? handle_bug+0x3b/0x70
? exc_invalid_op+0x17/0x70
? asm_exc_invalid_op+0x1a/0x20
? get_xsave_addr_user+0x28/0x40
copy_fpstate_to_sigframe+0x1be/0x380
? __put_user_8+0x11/0x20
get_sigframe+0xf1/0x280
x64_setup_rt_frame+0x67/0x2c0
arch_do_signal_or_restart+0x1b3/0x240
syscall_exit_to_user_mode+0xb0/0x130
do_syscall_64+0xab/0x1a0
entry_SYSCALL_64_after_hwframe+0x77/0x7f
This fix is known to be needed on Apple Virtualization.
Tested with macOS 13.5.2 running on MacBook Pro 2020 with
Intel(R) Core(TM) i7-1068NG7 CPU @ 2.30GHz.
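For reference, whether the hypervisor enumerates the PKRU state
component can be checked from userspace (a small sketch, not part of
the patch): CPUID leaf 0xD, sub-leaf 0 reports the supported XSAVE
state components in EAX, with bit 9 being XFEATURE_PKRU.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=0xD, ECX=0): supported XSAVE state components. */
	__cpuid_count(0xD, 0, eax, ebx, ecx, edx);
	printf("XFEATURE_PKRU (bit 9): %s\n",
	       (eax & (1u << 9)) ? "present" : "missing");
	return 0;
}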
Fixes: 70044df250d0 ("x86/pkeys: Update PKRU to enable all pkeys before XSAVE")
Link: https://lore.kernel.org/regressions/CAG8fp8QvH71Wi_y7b7tgFp7knK38rfrF7rRHh-…
Link: https://github.com/lima-vm/lima/issues/3334
Signed-off-by: Akihiro Suda <akihiro.suda.cz(a)hco.ntt.co.jp>
---
arch/x86/kernel/cpu/common.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e9464fe411ac..4c2c268af214 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -517,7 +517,8 @@ static bool pku_disabled;
static __always_inline void setup_pku(struct cpuinfo_x86 *c)
{
if (c == &boot_cpu_data) {
- if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU))
+ if (pku_disabled || !cpu_feature_enabled(X86_FEATURE_PKU) ||
+ !cpu_has_xfeatures(XFEATURE_PKRU, NULL))
return;
/*
* Setting CR4.PKE will cause the X86_FEATURE_OSPKE cpuid
--
2.45.2
On 3/20/25 6:07 AM, James Thomas wrote:
> Hello all,
>
> I encountered an issue with the CPU affinity of tasks launched by systemd in a
> slice, after updating from systemd 254 to systemd >= 256, on the LTS 5.15.x
> branch (tested on v5.15.179).
>
> Despite the slice file stipulating AllowedCPUS=2 (and confirming this was set in
> /sys/fs/cgroup/test.slice/cpuset.cpus) tasks launched in the slice would have
> the CPU affinity of the system.slice (i.e all by default) rather than 2.
>
> To reproduce:
>
> * Check kernel version and systemd version (I used a debian testing image for
> testing)
>
> ```
> # uname -r
> 5.15.179
> # systemctl --version
> systemd 257 (257.4-3)
> ...
> ```
>
> * Create a test.slice with AllowedCPUs=2
>
> ```
> # cat <<EOF > /usr/lib/systemd/system/test.slice
> [Unit]
> Description=Test slice
> Before=slices.target
> [Slice]
> AllowedCPUs=2
> [Install]
> WantedBy=slices.target
> EOF
> # systemctl daemon-reload && systemctl start test.slice
> ```
>
> * Confirm cpuset
>
> ```
> # cat /sys/fs/cgroup/test.slice/cpuset.cpus
> 2
> ```
>
> * Launch task in slice
>
> ```
> # systemd-run --slice test.slice yes
> Running as unit: run-r9187b97c6958498aad5bba213289ac56.service; invocation ID:
> f470f74047ac43b7a60861d03a7ef6f9
> # cat /sys/fs/cgroup/test.slice/run-r9187b97c6958498aad5bba213289ac56.service/cgroup.procs
> 317
> ```
>
> * Check affinity
>
> ```
> # taskset -pc 317
> pid 317's current affinity list: 0-7
> ```
>
> This issue is fixed by applying upstream commits:
>
> 18f9a4d47527772515ad6cbdac796422566e6440
> cgroup/cpuset: Skip spread flags update on v2
> and
> 42a11bf5c5436e91b040aeb04063be1710bb9f9c
> cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly
>
> With these applied:
>
> ```
> # systemd-run --slice test.slice yes
> Running as unit: run-r442c444559ff49f48c6c2b8325b3b500.service; invocation ID:
> 5211167267154e9292cb6b854585cb91
> # cat /sys/fs/cgroup/test.slice/run-r442c444559ff49f48c6c2b8325b3b500.service/cgroup.procs
> 291
> # taskset -pc 291
> pid 291's current affinity list: 2
> ```
>
> Perhaps these are a good candidate for backport onto the 5.15 LTS branch?
>
> Thanks
> James
>
You should also send this email to stable(a)vger.kernel.org for
consideration for inclusion in the 5.15 LTS branch.
Cheers,
Longman
Once a key's reference count has been reduced to 0, the garbage collector
thread may destroy it at any time and so key_put() is not allowed to touch
the key after that point. The most key_put() is normally allowed to do is
to touch key_gc_work as that's a static global variable.
However, in an effort to speed up the reclamation of quota, this is now
done in key_put() once the key's usage is reduced to 0 - but now the code
is looking at the key after the deadline, which is forbidden.
Fix this by using a flag to indicate that a key can be gc'd now rather than
looking at the key's refcount in the garbage collector.
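Schematically, the ordering the fix relies on looks like this (a
condensed sketch of the pattern, not the patch itself; final_put() and
gc_may_destroy() are illustrative names):

static void final_put(struct key *key)
{
	/* ... last touches of the key, e.g. the quota update ... */
	smp_mb();	/* order key->user stores before FINAL_PUT is set */
	set_bit(KEY_FLAG_FINAL_PUT, &key->flags);
	schedule_work(&key_gc_work);	/* only touches the static global */
}

static bool gc_may_destroy(struct key *key)
{
	if (!test_bit(KEY_FLAG_FINAL_PUT, &key->flags))
		return false;
	smp_mb();	/* pairs with the barrier in final_put() */
	return true;	/* the final put is done; freeing is now safe */
}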
Fixes: 9578e327b2b4 ("keys: update key quotas in key_put()")
Reported-by: syzbot+6105ffc1ded71d194d6d(a)syzkaller.appspotmail.com
Signed-off-by: David Howells <dhowells(a)redhat.com>
Tested-by: syzbot+6105ffc1ded71d194d6d(a)syzkaller.appspotmail.com
cc: Jarkko Sakkinen <jarkko(a)kernel.org>
cc: Oleg Nesterov <oleg(a)redhat.com>
cc: Kees Cook <kees(a)kernel.org>
cc: Hillf Danton <hdanton(a)sina.com>
cc: keyrings(a)vger.kernel.org
Cc: stable(a)vger.kernel.org # v6.10+
---
include/linux/key.h | 1 +
security/keys/gc.c | 4 +++-
security/keys/key.c | 2 ++
3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/include/linux/key.h b/include/linux/key.h
index 074dca3222b9..ba05de8579ec 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -236,6 +236,7 @@ struct key {
#define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */
#define KEY_FLAG_KEEP 8 /* set if key should not be removed */
#define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */
+#define KEY_FLAG_FINAL_PUT 10 /* set if final put has happened on key */
/* the key type and key description string
* - the desc is used to match a key against search criteria
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 7d687b0962b1..f27223ea4578 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -218,8 +218,10 @@ static void key_garbage_collector(struct work_struct *work)
key = rb_entry(cursor, struct key, serial_node);
cursor = rb_next(cursor);
- if (refcount_read(&key->usage) == 0)
+ if (test_bit(KEY_FLAG_FINAL_PUT, &key->flags)) {
+ smp_mb(); /* Clobber key->user after FINAL_PUT seen. */
goto found_unreferenced_key;
+ }
if (unlikely(gc_state & KEY_GC_REAPING_DEAD_1)) {
if (key->type == key_gc_dead_keytype) {
diff --git a/security/keys/key.c b/security/keys/key.c
index 3d7d185019d3..7198cd2ac3a3 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -658,6 +658,8 @@ void key_put(struct key *key)
key->user->qnbytes -= key->quotalen;
spin_unlock_irqrestore(&key->user->lock, flags);
}
+ smp_mb(); /* key->user before FINAL_PUT set. */
+ set_bit(KEY_FLAG_FINAL_PUT, &key->flags);
schedule_work(&key_gc_work);
}
}
From: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
commit 4a1d3acd6ea86075e77fcc1188c3fc372833ba73 upstream.
The nft_counter uses two s64 counters for statistics. Those two are
protected by a seqcount to ensure that the 64bit variable is always
properly seen during updates even on 32bit architectures where the store
is performed by two writes. A side effect is that the two counters (bytes
and packets) are written and read together in the same window.
This can be replaced with u64_stats_t. write_seqcount_begin()/end() is
replaced with u64_stats_update_begin()/end() and behaves the same way
as with seqcount_t on 32bit architectures. Additionally there is a
preempt_disable on PREEMPT_RT to ensure that a reader does not preempt a
writer.
On 64bit architectures the macros are removed and the reads happen
without any retries. This also means that the reader can observe one
counter (bytes) from before an update and the other counter (packets)
from after it, but that is okay since there is no requirement to have
both counters from the same update window.
Convert the statistics to u64_stats_t. There is one optimisation:
nft_counter_do_init() and nft_counter_clone() allocate a new per-CPU
counter and assign a value to it. During this assignment preemption is
disabled, which is not needed because the counter is not yet exposed to
the system, so there cannot be another writer or reader. Therefore
disabling preemption is omitted and raw_cpu_ptr() is used to obtain a
pointer to a counter for the assignment.
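For readers unfamiliar with the API, here is a minimal sketch of the
u64_stats pattern the counters are converted to (the demo_* names are
illustrative; the helpers are the real <linux/u64_stats_sync.h>
interface):

struct demo_counter {
	u64_stats_t	bytes;
	u64_stats_t	packets;
};

static DEFINE_PER_CPU(struct u64_stats_sync, demo_sync);

/* Writer: callers must pin the CPU, e.g. via local_bh_disable(). */
static void demo_update(struct demo_counter *c, unsigned int len)
{
	struct u64_stats_sync *sync = this_cpu_ptr(&demo_sync);

	u64_stats_update_begin(sync);	/* seq bump on 32bit, no-op on 64bit */
	u64_stats_add(&c->bytes, len);
	u64_stats_inc(&c->packets);
	u64_stats_update_end(sync);
}

/* Reader: the retry loop only ever spins on 32bit architectures. */
static u64 demo_read_bytes(struct demo_counter *c, int cpu)
{
	struct u64_stats_sync *sync = per_cpu_ptr(&demo_sync, cpu);
	unsigned int seq;
	u64 bytes;

	do {
		seq = u64_stats_fetch_begin(sync);
		bytes = u64_stats_read(&c->bytes);
	} while (u64_stats_fetch_retry(sync, seq));

	return bytes;
}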
Cc: Eric Dumazet <edumazet(a)google.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Signed-off-by: Pablo Neira Ayuso <pablo(a)netfilter.org>
Signed-off-by: Felix Moessbauer <felix.moessbauer(a)siemens.com>
---
I propose this backport as it is a performance improvement. Note that
it is also a bugfix on RT kernels.
net/netfilter/nft_counter.c | 90 +++++++++++++++++++------------------
1 file changed, 46 insertions(+), 44 deletions(-)
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index 781d3a26f5df..8d19bd001277 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -8,7 +8,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/seqlock.h>
+#include <linux/u64_stats_sync.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
@@ -17,6 +17,11 @@
#include <net/netfilter/nf_tables_offload.h>
struct nft_counter {
+ u64_stats_t bytes;
+ u64_stats_t packets;
+};
+
+struct nft_counter_tot {
s64 bytes;
s64 packets;
};
@@ -25,25 +30,24 @@ struct nft_counter_percpu_priv {
struct nft_counter __percpu *counter;
};
-static DEFINE_PER_CPU(seqcount_t, nft_counter_seq);
+static DEFINE_PER_CPU(struct u64_stats_sync, nft_counter_sync);
static inline void nft_counter_do_eval(struct nft_counter_percpu_priv *priv,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
- seqcount_t *myseq;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- myseq = this_cpu_ptr(&nft_counter_seq);
-
- write_seqcount_begin(myseq);
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
- this_cpu->bytes += pkt->skb->len;
- this_cpu->packets++;
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->bytes, pkt->skb->len);
+ u64_stats_inc(&this_cpu->packets);
+ u64_stats_update_end(nft_sync);
- write_seqcount_end(myseq);
local_bh_enable();
}
@@ -66,17 +70,16 @@ static int nft_counter_do_init(const struct nlattr * const tb[],
if (cpu_stats == NULL)
return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
+ this_cpu = raw_cpu_ptr(cpu_stats);
if (tb[NFTA_COUNTER_PACKETS]) {
- this_cpu->packets =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
+ u64_stats_set(&this_cpu->packets,
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])));
}
if (tb[NFTA_COUNTER_BYTES]) {
- this_cpu->bytes =
- be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
+ u64_stats_set(&this_cpu->bytes,
+ be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])));
}
- preempt_enable();
+
priv->counter = cpu_stats;
return 0;
}
@@ -104,40 +107,41 @@ static void nft_counter_obj_destroy(const struct nft_ctx *ctx,
}
static void nft_counter_reset(struct nft_counter_percpu_priv *priv,
- struct nft_counter *total)
+ struct nft_counter_tot *total)
{
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
- seqcount_t *myseq;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- myseq = this_cpu_ptr(&nft_counter_seq);
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
+
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->packets, -total->packets);
+ u64_stats_add(&this_cpu->bytes, -total->bytes);
+ u64_stats_update_end(nft_sync);
- write_seqcount_begin(myseq);
- this_cpu->packets -= total->packets;
- this_cpu->bytes -= total->bytes;
- write_seqcount_end(myseq);
local_bh_enable();
}
static void nft_counter_fetch(struct nft_counter_percpu_priv *priv,
- struct nft_counter *total)
+ struct nft_counter_tot *total)
{
struct nft_counter *this_cpu;
- const seqcount_t *myseq;
u64 bytes, packets;
unsigned int seq;
int cpu;
memset(total, 0, sizeof(*total));
for_each_possible_cpu(cpu) {
- myseq = per_cpu_ptr(&nft_counter_seq, cpu);
+ struct u64_stats_sync *nft_sync = per_cpu_ptr(&nft_counter_sync, cpu);
+
this_cpu = per_cpu_ptr(priv->counter, cpu);
do {
- seq = read_seqcount_begin(myseq);
- bytes = this_cpu->bytes;
- packets = this_cpu->packets;
- } while (read_seqcount_retry(myseq, seq));
+ seq = u64_stats_fetch_begin(nft_sync);
+ bytes = u64_stats_read(&this_cpu->bytes);
+ packets = u64_stats_read(&this_cpu->packets);
+ } while (u64_stats_fetch_retry(nft_sync, seq));
total->bytes += bytes;
total->packets += packets;
@@ -148,7 +152,7 @@ static int nft_counter_do_dump(struct sk_buff *skb,
struct nft_counter_percpu_priv *priv,
bool reset)
{
- struct nft_counter total;
+ struct nft_counter_tot total;
nft_counter_fetch(priv, &total);
@@ -236,7 +240,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, g
struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
struct nft_counter __percpu *cpu_stats;
struct nft_counter *this_cpu;
- struct nft_counter total;
+ struct nft_counter_tot total;
nft_counter_fetch(priv, &total);
@@ -244,11 +248,9 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, g
if (cpu_stats == NULL)
return -ENOMEM;
- preempt_disable();
- this_cpu = this_cpu_ptr(cpu_stats);
- this_cpu->packets = total.packets;
- this_cpu->bytes = total.bytes;
- preempt_enable();
+ this_cpu = raw_cpu_ptr(cpu_stats);
+ u64_stats_set(&this_cpu->packets, total.packets);
+ u64_stats_set(&this_cpu->bytes, total.bytes);
priv_clone->counter = cpu_stats;
return 0;
@@ -266,17 +268,17 @@ static void nft_counter_offload_stats(struct nft_expr *expr,
const struct flow_stats *stats)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct u64_stats_sync *nft_sync;
struct nft_counter *this_cpu;
- seqcount_t *myseq;
local_bh_disable();
this_cpu = this_cpu_ptr(priv->counter);
- myseq = this_cpu_ptr(&nft_counter_seq);
+ nft_sync = this_cpu_ptr(&nft_counter_sync);
- write_seqcount_begin(myseq);
- this_cpu->packets += stats->pkts;
- this_cpu->bytes += stats->bytes;
- write_seqcount_end(myseq);
+ u64_stats_update_begin(nft_sync);
+ u64_stats_add(&this_cpu->packets, stats->pkts);
+ u64_stats_add(&this_cpu->bytes, stats->bytes);
+ u64_stats_update_end(nft_sync);
local_bh_enable();
}
@@ -285,7 +287,7 @@ void nft_counter_init_seqcount(void)
int cpu;
for_each_possible_cpu(cpu)
- seqcount_init(per_cpu_ptr(&nft_counter_seq, cpu));
+ u64_stats_init(per_cpu_ptr(&nft_counter_sync, cpu));
}
struct nft_expr_type nft_counter_type;
--
2.49.0