Hello all,
I encountered an issue with the CPU affinity of tasks launched by
systemd in a
slice, after updating from systemd 254 to by systemd >= 256, on the LTS
5.15.x
branch (tested on v5.15.179).
Despite the slice file stipulating AllowedCPUS=2 (and confirming this
was set in
/sys/fs/cgroup/test.slice/cpuset.cpus) tasks launched in the slice would
have
the CPU affinity of the system.slice (i.e all by default) rather than 2.
To reproduce:
* Check kernel version and systemd version (I used a debian testing
image for
testing)
```
# uname -r
5.15.179
# systemctl --version
systemd 257 (257.4-3)
...
```
* Create a test.slice with AllowedCPUS=2
```
# cat <<EOF > /usr/lib/systemd/system/test.slice
[Unit]
Description=Test slice
Before=slices.target
[Slice]
AllowedCPUs=2
[Install]
WantedBy=slices.target
EOF
# systemctl daemon-reload && systemctl start test.slice
```
* Confirm cpuset
```
# cat /sys/fs/cgroup/test.slice/cpuset.cpus
2
```
* Launch task in slice
```
# systemd-run --slice test.slice yes
Running as unit: run-r9187b97c6958498aad5bba213289ac56.service;
invocation ID:
f470f74047ac43b7a60861d03a7ef6f9
# cat
/sys/fs/cgroup/test.slice/run-r9187b97c6958498aad5bba213289ac56.service/cgroup.procs
317
```
# Check affinity
```
# taskset -pc 317
pid 317's current affinity list: 0-7
```
This issue is fixed by applying upstream commits:
18f9a4d47527772515ad6cbdac796422566e6440
cgroup/cpuset: Skip spread flags update on v2
and
42a11bf5c5436e91b040aeb04063be1710bb9f9c
cgroup/cpuset: Make cpuset_fork() handle CLONE_INTO_CGROUP properly
With these applied:
```
# systemd-run --slice test.slice yes
Running as unit: run-r442c444559ff49f48c6c2b8325b3b500.service;
invocation ID:
5211167267154e9292cb6b854585cb91
# cat
/sys/fs/cgroup/test.slice/run-r442c444559ff49f48c6c2b8325b3b500.service/cgroup.procs
291
# taskset -pc 291
pid 291's current affinity list: 2
```
Perhaps these are a good candidate for backport onto the 5.15 LTS
branch?
Thanks
James
When get_block is called with a buffer_head allocated on the stack, such
as do_mpage_readpage, stack corruption due to buffer_head UAF may occur in
the following race condition situation.
<CPU 0> <CPU 1>
mpage_read_folio
<<bh on stack>>
do_mpage_readpage
exfat_get_block
bh_read
__bh_read
get_bh(bh)
submit_bh
wait_on_buffer
...
end_buffer_read_sync
__end_buffer_read_notouch
unlock_buffer
<<keep going>>
...
...
...
...
<<bh is not valid out of mpage_read_folio>>
.
.
another_function
<<variable A on stack>>
put_bh(bh)
atomic_dec(bh->b_count)
* stack corruption here *
This patch returns -EAGAIN if a folio does not have buffers when bh_read
needs to be called. By doing this, the caller can fallback to functions
like block_read_full_folio(), create a buffer_head in the folio, and then
call get_block again.
Let's do not call bh_read() with on-stack buffer_head.
Fixes: 11a347fb6cef ("exfat: change to get file size from DataLength")
Cc: stable(a)vger.kernel.org
Tested-by: Yeongjin Gil <youngjin.gil(a)samsung.com>
Signed-off-by: Sungjong Seo <sj1557.seo(a)samsung.com>
---
v2:
- clear_buffer_mapped if there is any errors.
- remove unnecessary BUG_ON()
---
fs/exfat/inode.c | 39 +++++++++++++++++++++++++++++++++------
1 file changed, 33 insertions(+), 6 deletions(-)
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 96952d4acb50..f3fdba9f4d21 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -344,7 +344,8 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
* The block has been partially written,
* zero the unwritten part and map the block.
*/
- loff_t size, off, pos;
+ loff_t size, pos;
+ void *addr;
max_blocks = 1;
@@ -355,17 +356,41 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
if (!bh_result->b_folio)
goto done;
+ /*
+ * No buffer_head is allocated.
+ * (1) bmap: It's enough to fill bh_result without I/O.
+ * (2) read: The unwritten part should be filled with 0
+ * If a folio does not have any buffers,
+ * let's returns -EAGAIN to fallback to
+ * per-bh IO like block_read_full_folio().
+ */
+ if (!folio_buffers(bh_result->b_folio)) {
+ err = -EAGAIN;
+ goto done;
+ }
+
pos = EXFAT_BLK_TO_B(iblock, sb);
size = ei->valid_size - pos;
- off = pos & (PAGE_SIZE - 1);
+ addr = folio_address(bh_result->b_folio) +
+ offset_in_folio(bh_result->b_folio, pos);
+
+ /* Check if bh->b_data points to proper addr in folio */
+ if (bh_result->b_data != addr) {
+ exfat_fs_error_ratelimit(sb,
+ "b_data(%p) != folio_addr(%p)",
+ bh_result->b_data, addr);
+ err = -EINVAL;
+ goto done;
+ }
- folio_set_bh(bh_result, bh_result->b_folio, off);
+ /* Read a block */
err = bh_read(bh_result, 0);
if (err < 0)
- goto unlock_ret;
+ goto done;
- folio_zero_segment(bh_result->b_folio, off + size,
- off + sb->s_blocksize);
+ /* Zero unwritten part of a block */
+ memset(bh_result->b_data + size, 0,
+ bh_result->b_size - size);
} else {
/*
* The range has not been written, clear the mapped flag
@@ -376,6 +401,8 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
}
done:
bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb);
+ if (err < 0)
+ clear_buffer_mapped(bh_result);
unlock_ret:
mutex_unlock(&sbi->s_lock);
return err;
--
2.25.1
Hi again,
also for 6.1.132-rc1 the review hasn't started yet, but as it was
already available on [1], our CI has also tried to built it for ia64
in the morning. Unfortunately that failed, too - I assume due to the
following **missing** upstream commit:
https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/comm…
[1]: https://web.git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.…
Build failure (see [2]):
```
[...]
In file included from ./include/linux/string.h:5,
from ./include/linux/uuid.h:12,
from ./fs/xfs/xfs_linux.h:10,
from ./fs/xfs/xfs.h:22,
from fs/xfs/libxfs/xfs_alloc.c:6:
fs/xfs/libxfs/xfs_alloc.c: In function '__xfs_free_extent_later':
fs/xfs/libxfs/xfs_alloc.c:2551:51: error: 'mp' undeclared (first use in this function); did you mean 'tp'?
2551 | if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
| ^~
./include/linux/compiler.h:78:45: note: in definition of macro 'unlikely'
78 | # define unlikely(x) __builtin_expect(!!(x), 0)
| ^
fs/xfs/libxfs/xfs_alloc.c:2551:13: note: in expansion of macro 'XFS_IS_CORRUPT'
2551 | if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
| ^~~~~~~~~~~~~~
fs/xfs/libxfs/xfs_alloc.c:2551:51: note: each undeclared identifier is reported only once for each function it appears in
2551 | if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
| ^~
./include/linux/compiler.h:78:45: note: in definition of macro 'unlikely'
78 | # define unlikely(x) __builtin_expect(!!(x), 0)
| ^
fs/xfs/libxfs/xfs_alloc.c:2551:13: note: in expansion of macro 'XFS_IS_CORRUPT'
2551 | if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
| ^~~~~~~~~~~~~~
./fs/xfs/xfs_linux.h:225:63: warning: left-hand operand of comma expression has no effect [-Wunused-value]
225 | __this_address), \
| ^
fs/xfs/libxfs/xfs_alloc.c:2551:13: note: in expansion of macro 'XFS_IS_CORRUPT'
2551 | if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
| ^~~~~~~~~~~~~~
make[5]: *** [scripts/Makefile.build:250: fs/xfs/libxfs/xfs_alloc.o] Error 1
[...]
```
[2]: https://github.com/linux-ia64/linux-stable-rc/actions/runs/13914712427/job/…
[3] (7dfee17b13e5024c5c0ab1911859ded4182de3e5 upstream) introduced
the XFS_IS_CORRUPT macro call now in `fs/xfs/libxfs/xfs_alloc.c:2551`,
but the struct "mp" is only there when DEBUG is defined in 6.1.132-rc1.
The above upstream commit (f6b3846) moves "mp" out of that guard and
hence should fix that specific build regression IIUC. Again not
build-tested yet, though.
[3]: https://web.git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.…
Cheers,
Frank
From: Ran Xiaokai <ran.xiaokai(a)zte.com.cn>
Lockdep reports this deadlock log:
osnoise: could not start sampling thread
============================================
WARNING: possible recursive locking detected
--------------------------------------------
CPU0
----
lock(cpu_hotplug_lock);
lock(cpu_hotplug_lock);
Call Trace:
<TASK>
print_deadlock_bug+0x282/0x3c0
__lock_acquire+0x1610/0x29a0
lock_acquire+0xcb/0x2d0
cpus_read_lock+0x49/0x120
stop_per_cpu_kthreads+0x7/0x60
start_kthread+0x103/0x120
osnoise_hotplug_workfn+0x5e/0x90
process_one_work+0x44f/0xb30
worker_thread+0x33e/0x5e0
kthread+0x206/0x3b0
ret_from_fork+0x31/0x50
ret_from_fork_asm+0x11/0x20
</TASK>
This is the deadlock scenario:
osnoise_hotplug_workfn()
guard(cpus_read_lock)(); // first lock call
start_kthread(cpu)
if (IS_ERR(kthread)) {
stop_per_cpu_kthreads(); {
cpus_read_lock(); // second lock call. Cause the AA deadlock
}
}
It is not necessary to call stop_per_cpu_kthreads() which stops osnoise
kthread for every other CPUs in the system if a failure occurs during
hotplug of a certain CPU.
For start_per_cpu_kthreads(), if the start_kthread() call fails,
this function calls stop_per_cpu_kthreads() to handle the error.
Therefore, similarly, there is no need to call stop_per_cpu_kthreads()
again within start_kthread().
So just remove stop_per_cpu_kthreads() from start_kthread to solve this issue.
Fixes: c8895e271f79 ("trace/osnoise: Support hotplug operations")
Signed-off-by: Ran Xiaokai <ran.xiaokai(a)zte.com.cn>
---
kernel/trace/trace_osnoise.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 647516a73fa2..e732c9e37e14 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -2006,7 +2006,6 @@ static int start_kthread(unsigned int cpu)
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
- stop_per_cpu_kthreads();
return -ENOMEM;
}
--
2.17.1
When get_block is called with a buffer_head allocated on the stack, such
as do_mpage_readpage, stack corruption due to buffer_head UAF may occur in
the following race condition situation.
<CPU 0> <CPU 1>
mpage_read_folio
<<bh on stack>>
do_mpage_readpage
exfat_get_block
bh_read
__bh_read
get_bh(bh)
submit_bh
wait_on_buffer
...
end_buffer_read_sync
__end_buffer_read_notouch
unlock_buffer
<<keep going>>
...
...
...
...
<<bh is not valid out of mpage_read_folio>>
.
.
another_function
<<variable A on stack>>
put_bh(bh)
atomic_dec(bh->b_count)
* stack corruption here *
This patch returns -EAGAIN if a folio does not have buffers when bh_read
needs to be called. By doing this, the caller can fallback to functions
like block_read_full_folio(), create a buffer_head in the folio, and then
call get_block again.
Let's do not call bh_read() with on-stack buffer_head.
Fixes: 11a347fb6cef ("exfat: change to get file size from DataLength")
Cc: stable(a)vger.kernel.org
Tested-by: Yeongjin Gil <youngjin.gil(a)samsung.com>
Signed-off-by: Sungjong Seo <sj1557.seo(a)samsung.com>
---
fs/exfat/inode.c | 37 ++++++++++++++++++++++++++++++++-----
1 file changed, 32 insertions(+), 5 deletions(-)
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 96952d4acb50..b8ea910586e4 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -344,7 +344,8 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
* The block has been partially written,
* zero the unwritten part and map the block.
*/
- loff_t size, off, pos;
+ loff_t size, pos;
+ void *addr;
max_blocks = 1;
@@ -355,17 +356,43 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
if (!bh_result->b_folio)
goto done;
+ /*
+ * No buffer_head is allocated.
+ * (1) bmap: It's enough to fill bh_result without I/O.
+ * (2) read: The unwritten part should be filled with 0
+ * If a folio does not have any buffers,
+ * let's returns -EAGAIN to fallback to
+ * per-bh IO like block_read_full_folio().
+ */
+ if (!folio_buffers(bh_result->b_folio)) {
+ err = -EAGAIN;
+ goto done;
+ }
+
pos = EXFAT_BLK_TO_B(iblock, sb);
size = ei->valid_size - pos;
- off = pos & (PAGE_SIZE - 1);
+ addr = folio_address(bh_result->b_folio) +
+ offset_in_folio(bh_result->b_folio, pos);
+
+ BUG_ON(size > sb->s_blocksize);
+
+ /* Check if bh->b_data points to proper addr in folio */
+ if (bh_result->b_data != addr) {
+ exfat_fs_error_ratelimit(sb,
+ "b_data(%p) != folio_addr(%p)",
+ bh_result->b_data, addr);
+ err = -EINVAL;
+ goto done;
+ }
- folio_set_bh(bh_result, bh_result->b_folio, off);
+ /* Read a block */
err = bh_read(bh_result, 0);
if (err < 0)
goto unlock_ret;
- folio_zero_segment(bh_result->b_folio, off + size,
- off + sb->s_blocksize);
+ /* Zero unwritten part of a block */
+ memset(bh_result->b_data + size, 0,
+ bh_result->b_size - size);
} else {
/*
* The range has not been written, clear the mapped flag
--
2.25.1