From: Waiman Long <longman(a)redhat.com>
[ Upstream commit 85b2b9c16d053364e2004883140538e73b333cdb ]
A circular lock dependency splat has been seen involving down_trylock():
======================================================
WARNING: possible circular locking dependency detected
6.12.0-41.el10.s390x+debug
------------------------------------------------------
dd/32479 is trying to acquire lock:
0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
but task is already holding lock:
000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
the existing dependency chain (in reverse order) is:
-> #4 (&zone->lock){-.-.}-{2:2}:
-> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
-> #2 (&rq->__lock){-.-.}-{2:2}:
-> #1 (&p->pi_lock){-.-.}-{2:2}:
-> #0 ((console_sem).lock){-.-.}-{2:2}:
The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
while holding the console_sem raw_spinlock. This dependency can be broken
by using wake_q to do the wakeup instead of calling try_to_wake_up()
under the console_sem lock. This will also make the semaphore's
raw_spinlock become a terminal lock without taking any further locks
underneath it.
The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
the debug_objects_fill_pool() helper function in the debugobjects code.
-> #4 (&zone->lock){-.-.}-{2:2}:
__lock_acquire+0xe86/0x1cc0
lock_acquire.part.0+0x258/0x630
lock_acquire+0xb8/0xe0
_raw_spin_lock_irqsave+0xb4/0x120
rmqueue_bulk+0xac/0x8f0
__rmqueue_pcplist+0x580/0x830
rmqueue_pcplist+0xfc/0x470
rmqueue.isra.0+0xdec/0x11b0
get_page_from_freelist+0x2ee/0xeb0
__alloc_pages_noprof+0x2c2/0x520
alloc_pages_mpol_noprof+0x1fc/0x4d0
alloc_pages_noprof+0x8c/0xe0
allocate_slab+0x320/0x460
___slab_alloc+0xa58/0x12b0
__slab_alloc.isra.0+0x42/0x60
kmem_cache_alloc_noprof+0x304/0x350
fill_pool+0xf6/0x450
debug_object_activate+0xfe/0x360
enqueue_hrtimer+0x34/0x190
__run_hrtimer+0x3c8/0x4c0
__hrtimer_run_queues+0x1b2/0x260
hrtimer_interrupt+0x316/0x760
do_IRQ+0x9a/0xe0
do_irq_async+0xf6/0x160
Normally a raw_spinlock to spinlock dependency is not legitimate
and will be warned if CONFIG_PROVE_RAW_LOCK_NESTING is enabled,
but debug_objects_fill_pool() is an exception as it explicitly
allows this dependency for non-PREEMPT_RT kernel without causing
PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
legitimate and not a bug.
Anyway, semaphore is the only locking primitive left that is still
using try_to_wake_up() to do wakeup inside critical section, all the
other locking primitives had been migrated to use wake_q to do wakeup
outside of the critical section. It is also possible that there are
other circular locking dependencies involving printk/console_sem or
other existing/new semaphores lurking somewhere which may show up in
the future. Let just do the migration now to wake_q to avoid headache
like this.
Reported-by: yzbot+ed801a886dfdbfe7136d(a)syzkaller.appspotmail.com
Signed-off-by: Waiman Long <longman(a)redhat.com>
Signed-off-by: Boqun Feng <boqun.feng(a)gmail.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Link: https://lore.kernel.org/r/20250307232717.1759087-3-boqun.feng@gmail.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/locking/semaphore.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index d9dd94defc0a9..19389fdbfdfb1 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
@@ -37,7 +38,7 @@ static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
/**
* down - acquire the semaphore
@@ -178,13 +179,16 @@ EXPORT_SYMBOL(down_timeout);
void up(struct semaphore *sem)
{
unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
- __up(sem);
+ __up(sem, &wake_q);
raw_spin_unlock_irqrestore(&sem->lock, flags);
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
}
EXPORT_SYMBOL(up);
@@ -252,11 +256,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+ struct wake_q_head *wake_q)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
--
2.39.5
[ Upstream commit 5ac9b4e935dfc6af41eee2ddc21deb5c36507a9f ]
>From memfd_secret(2) manpage:
The memory areas backing the file created with memfd_secret(2) are
visible only to the processes that have access to the file descriptor.
The memory region is removed from the kernel page tables and only the
page tables of the processes holding the file descriptor map the
corresponding physical memory. (Thus, the pages in the region can't be
accessed by the kernel itself, so that, for example, pointers to the
region can't be passed to system calls.)
We need to handle this special case gracefully in build ID fetching
code. Return -EFAULT whenever secretmem file is passed to build_id_parse()
family of APIs. Original report and repro can be found in [0].
[0] https://lore.kernel.org/bpf/ZwyG8Uro%2FSyTXAni@ly-workstation/
Fixes: de3ec364c3c3 ("lib/buildid: add single folio-based file reader abstraction")
Reported-by: Yi Lai <yi1.lai(a)intel.com>
Suggested-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Signed-off-by: Andrii Nakryiko <andrii(a)kernel.org>
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Acked-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Link: https://lore.kernel.org/bpf/20241017175431.6183-A-hca@linux.ibm.com
Link: https://lore.kernel.org/bpf/20241017174713.2157873-1-andrii@kernel.org
[ Chen Linxuan: backport same logic without folio-based changes ]
Cc: stable(a)vger.kernel.org
Fixes: 88a16a130933 ("perf: Add build id data in mmap2 event")
Signed-off-by: Chen Linxuan <chenlinxuan(a)deepin.org>
---
v1 -> v2: use vma_is_secretmem() instead of directly checking
vma->vm_file->f_op == &secretmem_fops
---
lib/buildid.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/lib/buildid.c b/lib/buildid.c
index 9fc46366597e..34315d09b544 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -5,6 +5,7 @@
#include <linux/elf.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
+#include <linux/secretmem.h>
#define BUILD_ID 3
@@ -157,6 +158,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
if (!vma->vm_file)
return -EINVAL;
+ /* reject secretmem */
+ if (vma_is_secretmem(vma))
+ return -EFAULT;
+
page = find_get_page(vma->vm_file->f_mapping, 0);
if (!page)
return -EFAULT; /* page not mapped */
--
2.48.1
From: Waiman Long <longman(a)redhat.com>
[ Upstream commit 85b2b9c16d053364e2004883140538e73b333cdb ]
A circular lock dependency splat has been seen involving down_trylock():
======================================================
WARNING: possible circular locking dependency detected
6.12.0-41.el10.s390x+debug
------------------------------------------------------
dd/32479 is trying to acquire lock:
0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
but task is already holding lock:
000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
the existing dependency chain (in reverse order) is:
-> #4 (&zone->lock){-.-.}-{2:2}:
-> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
-> #2 (&rq->__lock){-.-.}-{2:2}:
-> #1 (&p->pi_lock){-.-.}-{2:2}:
-> #0 ((console_sem).lock){-.-.}-{2:2}:
The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
while holding the console_sem raw_spinlock. This dependency can be broken
by using wake_q to do the wakeup instead of calling try_to_wake_up()
under the console_sem lock. This will also make the semaphore's
raw_spinlock become a terminal lock without taking any further locks
underneath it.
The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
the debug_objects_fill_pool() helper function in the debugobjects code.
-> #4 (&zone->lock){-.-.}-{2:2}:
__lock_acquire+0xe86/0x1cc0
lock_acquire.part.0+0x258/0x630
lock_acquire+0xb8/0xe0
_raw_spin_lock_irqsave+0xb4/0x120
rmqueue_bulk+0xac/0x8f0
__rmqueue_pcplist+0x580/0x830
rmqueue_pcplist+0xfc/0x470
rmqueue.isra.0+0xdec/0x11b0
get_page_from_freelist+0x2ee/0xeb0
__alloc_pages_noprof+0x2c2/0x520
alloc_pages_mpol_noprof+0x1fc/0x4d0
alloc_pages_noprof+0x8c/0xe0
allocate_slab+0x320/0x460
___slab_alloc+0xa58/0x12b0
__slab_alloc.isra.0+0x42/0x60
kmem_cache_alloc_noprof+0x304/0x350
fill_pool+0xf6/0x450
debug_object_activate+0xfe/0x360
enqueue_hrtimer+0x34/0x190
__run_hrtimer+0x3c8/0x4c0
__hrtimer_run_queues+0x1b2/0x260
hrtimer_interrupt+0x316/0x760
do_IRQ+0x9a/0xe0
do_irq_async+0xf6/0x160
Normally a raw_spinlock to spinlock dependency is not legitimate
and will be warned if CONFIG_PROVE_RAW_LOCK_NESTING is enabled,
but debug_objects_fill_pool() is an exception as it explicitly
allows this dependency for non-PREEMPT_RT kernel without causing
PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
legitimate and not a bug.
Anyway, semaphore is the only locking primitive left that is still
using try_to_wake_up() to do wakeup inside critical section, all the
other locking primitives had been migrated to use wake_q to do wakeup
outside of the critical section. It is also possible that there are
other circular locking dependencies involving printk/console_sem or
other existing/new semaphores lurking somewhere which may show up in
the future. Let just do the migration now to wake_q to avoid headache
like this.
Reported-by: yzbot+ed801a886dfdbfe7136d(a)syzkaller.appspotmail.com
Signed-off-by: Waiman Long <longman(a)redhat.com>
Signed-off-by: Boqun Feng <boqun.feng(a)gmail.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Link: https://lore.kernel.org/r/20250307232717.1759087-3-boqun.feng@gmail.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/locking/semaphore.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9aa855a96c4ae..aadde65402913 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
@@ -37,7 +38,7 @@ static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
/**
* down - acquire the semaphore
@@ -178,13 +179,16 @@ EXPORT_SYMBOL(down_timeout);
void up(struct semaphore *sem)
{
unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
- __up(sem);
+ __up(sem, &wake_q);
raw_spin_unlock_irqrestore(&sem->lock, flags);
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
}
EXPORT_SYMBOL(up);
@@ -252,11 +256,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+ struct wake_q_head *wake_q)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
--
2.39.5
From: Eric Dumazet <edumazet(a)google.com>
tcp_abort() has the same issue than the one fixed in the prior patch
in tcp_write_err().
commit 5ce4645c23cf5f048eb8e9ce49e514bababdee85 upstream.
To apply commit bac76cf89816bff06c4ec2f3df97dc34e150a1c4,
this patch must be applied first.
In order to get consistent results from tcp_poll(), we must call
sk_error_report() after tcp_done().
We can use tcp_done_with_error() to centralize this logic.
Fixes: c1e64e298b8c ("net: diag: Support destroying TCP sockets.")
Signed-off-by: Eric Dumazet <edumazet(a)google.com>
Acked-by: Neal Cardwell <ncardwell(a)google.com>
Link: https://lore.kernel.org/r/20240528125253.1966136-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
[youngmin: Resolved minor conflict in net/ipv4/tcp.c]
Signed-off-by: Youngmin Nam <youngmin.nam(a)samsung.com>
---
net/ipv4/tcp.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7d591a0cf0c7..1ad3a20eb9b7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4755,13 +4755,9 @@ int tcp_abort(struct sock *sk, int err)
bh_lock_sock(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
- WRITE_ONCE(sk->sk_err, err);
- /* This barrier is coupled with smp_rmb() in tcp_poll() */
- smp_wmb();
- sk_error_report(sk);
if (tcp_need_reset(sk->sk_state))
tcp_send_active_reset(sk, GFP_ATOMIC);
- tcp_done(sk);
+ tcp_done_with_error(sk, err);
}
bh_unlock_sock(sk);
--
2.39.2
[ Upstream commit 5ac9b4e935dfc6af41eee2ddc21deb5c36507a9f ]
>From memfd_secret(2) manpage:
The memory areas backing the file created with memfd_secret(2) are
visible only to the processes that have access to the file descriptor.
The memory region is removed from the kernel page tables and only the
page tables of the processes holding the file descriptor map the
corresponding physical memory. (Thus, the pages in the region can't be
accessed by the kernel itself, so that, for example, pointers to the
region can't be passed to system calls.)
We need to handle this special case gracefully in build ID fetching
code. Return -EFAULT whenever secretmem file is passed to build_id_parse()
family of APIs. Original report and repro can be found in [0].
[0] https://lore.kernel.org/bpf/ZwyG8Uro%2FSyTXAni@ly-workstation/
Fixes: de3ec364c3c3 ("lib/buildid: add single folio-based file reader abstraction")
Reported-by: Yi Lai <yi1.lai(a)intel.com>
Suggested-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Signed-off-by: Andrii Nakryiko <andrii(a)kernel.org>
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Acked-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Link: https://lore.kernel.org/bpf/20241017175431.6183-A-hca@linux.ibm.com
Link: https://lore.kernel.org/bpf/20241017174713.2157873-1-andrii@kernel.org
[ Chen Linxuan: backport same logic without folio-based changes ]
Cc: stable(a)vger.kernel.org
Fixes: 88a16a130933 ("perf: Add build id data in mmap2 event")
Signed-off-by: Chen Linxuan <chenlinxuan(a)deepin.org>
---
v1 -> v2: use vma_is_secretmem() instead of directly checking
vma->vm_file->f_op == &secretmem_fops
v2 -> v3: keep original comment
---
lib/buildid.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/lib/buildid.c b/lib/buildid.c
index 9fc46366597e..8d839ff5548e 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -5,6 +5,7 @@
#include <linux/elf.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
+#include <linux/secretmem.h>
#define BUILD_ID 3
@@ -157,6 +158,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
if (!vma->vm_file)
return -EINVAL;
+ /* reject secretmem folios created with memfd_secret() */
+ if (vma_is_secretmem(vma))
+ return -EFAULT;
+
page = find_get_page(vma->vm_file->f_mapping, 0);
if (!page)
return -EFAULT; /* page not mapped */
--
2.48.1
[ Upstream commit 5ac9b4e935dfc6af41eee2ddc21deb5c36507a9f ]
>From memfd_secret(2) manpage:
The memory areas backing the file created with memfd_secret(2) are
visible only to the processes that have access to the file descriptor.
The memory region is removed from the kernel page tables and only the
page tables of the processes holding the file descriptor map the
corresponding physical memory. (Thus, the pages in the region can't be
accessed by the kernel itself, so that, for example, pointers to the
region can't be passed to system calls.)
We need to handle this special case gracefully in build ID fetching
code. Return -EFAULT whenever secretmem file is passed to build_id_parse()
family of APIs. Original report and repro can be found in [0].
[0] https://lore.kernel.org/bpf/ZwyG8Uro%2FSyTXAni@ly-workstation/
Fixes: de3ec364c3c3 ("lib/buildid: add single folio-based file reader abstraction")
Reported-by: Yi Lai <yi1.lai(a)intel.com>
Suggested-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Signed-off-by: Andrii Nakryiko <andrii(a)kernel.org>
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Acked-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Link: https://lore.kernel.org/bpf/20241017175431.6183-A-hca@linux.ibm.com
Link: https://lore.kernel.org/bpf/20241017174713.2157873-1-andrii@kernel.org
[ Chen Linxuan: backport same logic without folio-based changes ]
Cc: stable(a)vger.kernel.org
Fixes: 88a16a130933 ("perf: Add build id data in mmap2 event")
Signed-off-by: Chen Linxuan <chenlinxuan(a)deepin.org>
---
v1 -> v2: use vma_is_secretmem() instead of directly checking
vma->vm_file->f_op == &secretmem_fops
---
lib/buildid.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/lib/buildid.c b/lib/buildid.c
index 9fc46366597e..34315d09b544 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -5,6 +5,7 @@
#include <linux/elf.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
+#include <linux/secretmem.h>
#define BUILD_ID 3
@@ -157,6 +158,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
if (!vma->vm_file)
return -EINVAL;
+ /* reject secretmem */
+ if (vma_is_secretmem(vma))
+ return -EFAULT;
+
page = find_get_page(vma->vm_file->f_mapping, 0);
if (!page)
return -EFAULT; /* page not mapped */
--
2.48.1
From: Waiman Long <longman(a)redhat.com>
[ Upstream commit 85b2b9c16d053364e2004883140538e73b333cdb ]
A circular lock dependency splat has been seen involving down_trylock():
======================================================
WARNING: possible circular locking dependency detected
6.12.0-41.el10.s390x+debug
------------------------------------------------------
dd/32479 is trying to acquire lock:
0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
but task is already holding lock:
000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
the existing dependency chain (in reverse order) is:
-> #4 (&zone->lock){-.-.}-{2:2}:
-> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
-> #2 (&rq->__lock){-.-.}-{2:2}:
-> #1 (&p->pi_lock){-.-.}-{2:2}:
-> #0 ((console_sem).lock){-.-.}-{2:2}:
The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
while holding the console_sem raw_spinlock. This dependency can be broken
by using wake_q to do the wakeup instead of calling try_to_wake_up()
under the console_sem lock. This will also make the semaphore's
raw_spinlock become a terminal lock without taking any further locks
underneath it.
The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
the debug_objects_fill_pool() helper function in the debugobjects code.
-> #4 (&zone->lock){-.-.}-{2:2}:
__lock_acquire+0xe86/0x1cc0
lock_acquire.part.0+0x258/0x630
lock_acquire+0xb8/0xe0
_raw_spin_lock_irqsave+0xb4/0x120
rmqueue_bulk+0xac/0x8f0
__rmqueue_pcplist+0x580/0x830
rmqueue_pcplist+0xfc/0x470
rmqueue.isra.0+0xdec/0x11b0
get_page_from_freelist+0x2ee/0xeb0
__alloc_pages_noprof+0x2c2/0x520
alloc_pages_mpol_noprof+0x1fc/0x4d0
alloc_pages_noprof+0x8c/0xe0
allocate_slab+0x320/0x460
___slab_alloc+0xa58/0x12b0
__slab_alloc.isra.0+0x42/0x60
kmem_cache_alloc_noprof+0x304/0x350
fill_pool+0xf6/0x450
debug_object_activate+0xfe/0x360
enqueue_hrtimer+0x34/0x190
__run_hrtimer+0x3c8/0x4c0
__hrtimer_run_queues+0x1b2/0x260
hrtimer_interrupt+0x316/0x760
do_IRQ+0x9a/0xe0
do_irq_async+0xf6/0x160
Normally a raw_spinlock to spinlock dependency is not legitimate
and will be warned if CONFIG_PROVE_RAW_LOCK_NESTING is enabled,
but debug_objects_fill_pool() is an exception as it explicitly
allows this dependency for non-PREEMPT_RT kernel without causing
PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
legitimate and not a bug.
Anyway, semaphore is the only locking primitive left that is still
using try_to_wake_up() to do wakeup inside critical section, all the
other locking primitives had been migrated to use wake_q to do wakeup
outside of the critical section. It is also possible that there are
other circular locking dependencies involving printk/console_sem or
other existing/new semaphores lurking somewhere which may show up in
the future. Let just do the migration now to wake_q to avoid headache
like this.
Reported-by: yzbot+ed801a886dfdbfe7136d(a)syzkaller.appspotmail.com
Signed-off-by: Waiman Long <longman(a)redhat.com>
Signed-off-by: Boqun Feng <boqun.feng(a)gmail.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Link: https://lore.kernel.org/r/20250307232717.1759087-3-boqun.feng@gmail.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/locking/semaphore.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9ee381e4d2a4d..a26c915430ba0 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
@@ -37,7 +38,7 @@ static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
/**
* down - acquire the semaphore
@@ -182,13 +183,16 @@ EXPORT_SYMBOL(down_timeout);
void up(struct semaphore *sem)
{
unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
- __up(sem);
+ __up(sem, &wake_q);
raw_spin_unlock_irqrestore(&sem->lock, flags);
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
}
EXPORT_SYMBOL(up);
@@ -256,11 +260,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+ struct wake_q_head *wake_q)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
--
2.39.5
[ Upstream commit 5ac9b4e935dfc6af41eee2ddc21deb5c36507a9f ]
>From memfd_secret(2) manpage:
The memory areas backing the file created with memfd_secret(2) are
visible only to the processes that have access to the file descriptor.
The memory region is removed from the kernel page tables and only the
page tables of the processes holding the file descriptor map the
corresponding physical memory. (Thus, the pages in the region can't be
accessed by the kernel itself, so that, for example, pointers to the
region can't be passed to system calls.)
We need to handle this special case gracefully in build ID fetching
code. Return -EFAULT whenever secretmem file is passed to build_id_parse()
family of APIs. Original report and repro can be found in [0].
[0] https://lore.kernel.org/bpf/ZwyG8Uro%2FSyTXAni@ly-workstation/
Fixes: de3ec364c3c3 ("lib/buildid: add single folio-based file reader abstraction")
Reported-by: Yi Lai <yi1.lai(a)intel.com>
Suggested-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Signed-off-by: Andrii Nakryiko <andrii(a)kernel.org>
Signed-off-by: Daniel Borkmann <daniel(a)iogearbox.net>
Acked-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Link: https://lore.kernel.org/bpf/20241017175431.6183-A-hca@linux.ibm.com
Link: https://lore.kernel.org/bpf/20241017174713.2157873-1-andrii@kernel.org
[ Chen Linxuan: backport same logic without folio-based changes ]
Cc: stable(a)vger.kernel.org
Fixes: 88a16a130933 ("perf: Add build id data in mmap2 event")
Signed-off-by: Chen Linxuan <chenlinxuan(a)deepin.org>
---
v1 -> v2: use vma_is_secretmem() instead of directly checking
vma->vm_file->f_op == &secretmem_fops
v2 -> v3: keep original comment
---
lib/buildid.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/lib/buildid.c b/lib/buildid.c
index 9fc46366597e..8d839ff5548e 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -5,6 +5,7 @@
#include <linux/elf.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
+#include <linux/secretmem.h>
#define BUILD_ID 3
@@ -157,6 +158,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
if (!vma->vm_file)
return -EINVAL;
+ /* reject secretmem folios created with memfd_secret() */
+ if (vma_is_secretmem(vma))
+ return -EFAULT;
+
page = find_get_page(vma->vm_file->f_mapping, 0);
if (!page)
return -EFAULT; /* page not mapped */
--
2.48.1
From: Waiman Long <longman(a)redhat.com>
[ Upstream commit 85b2b9c16d053364e2004883140538e73b333cdb ]
A circular lock dependency splat has been seen involving down_trylock():
======================================================
WARNING: possible circular locking dependency detected
6.12.0-41.el10.s390x+debug
------------------------------------------------------
dd/32479 is trying to acquire lock:
0015a20accd0d4f8 ((console_sem).lock){-.-.}-{2:2}, at: down_trylock+0x26/0x90
but task is already holding lock:
000000017e461698 (&zone->lock){-.-.}-{2:2}, at: rmqueue_bulk+0xac/0x8f0
the existing dependency chain (in reverse order) is:
-> #4 (&zone->lock){-.-.}-{2:2}:
-> #3 (hrtimer_bases.lock){-.-.}-{2:2}:
-> #2 (&rq->__lock){-.-.}-{2:2}:
-> #1 (&p->pi_lock){-.-.}-{2:2}:
-> #0 ((console_sem).lock){-.-.}-{2:2}:
The console_sem -> pi_lock dependency is due to calling try_to_wake_up()
while holding the console_sem raw_spinlock. This dependency can be broken
by using wake_q to do the wakeup instead of calling try_to_wake_up()
under the console_sem lock. This will also make the semaphore's
raw_spinlock become a terminal lock without taking any further locks
underneath it.
The hrtimer_bases.lock is a raw_spinlock while zone->lock is a
spinlock. The hrtimer_bases.lock -> zone->lock dependency happens via
the debug_objects_fill_pool() helper function in the debugobjects code.
-> #4 (&zone->lock){-.-.}-{2:2}:
__lock_acquire+0xe86/0x1cc0
lock_acquire.part.0+0x258/0x630
lock_acquire+0xb8/0xe0
_raw_spin_lock_irqsave+0xb4/0x120
rmqueue_bulk+0xac/0x8f0
__rmqueue_pcplist+0x580/0x830
rmqueue_pcplist+0xfc/0x470
rmqueue.isra.0+0xdec/0x11b0
get_page_from_freelist+0x2ee/0xeb0
__alloc_pages_noprof+0x2c2/0x520
alloc_pages_mpol_noprof+0x1fc/0x4d0
alloc_pages_noprof+0x8c/0xe0
allocate_slab+0x320/0x460
___slab_alloc+0xa58/0x12b0
__slab_alloc.isra.0+0x42/0x60
kmem_cache_alloc_noprof+0x304/0x350
fill_pool+0xf6/0x450
debug_object_activate+0xfe/0x360
enqueue_hrtimer+0x34/0x190
__run_hrtimer+0x3c8/0x4c0
__hrtimer_run_queues+0x1b2/0x260
hrtimer_interrupt+0x316/0x760
do_IRQ+0x9a/0xe0
do_irq_async+0xf6/0x160
Normally a raw_spinlock to spinlock dependency is not legitimate
and will be warned if CONFIG_PROVE_RAW_LOCK_NESTING is enabled,
but debug_objects_fill_pool() is an exception as it explicitly
allows this dependency for non-PREEMPT_RT kernel without causing
PROVE_RAW_LOCK_NESTING lockdep splat. As a result, this dependency is
legitimate and not a bug.
Anyway, semaphore is the only locking primitive left that is still
using try_to_wake_up() to do wakeup inside critical section, all the
other locking primitives had been migrated to use wake_q to do wakeup
outside of the critical section. It is also possible that there are
other circular locking dependencies involving printk/console_sem or
other existing/new semaphores lurking somewhere which may show up in
the future. Let just do the migration now to wake_q to avoid headache
like this.
Reported-by: yzbot+ed801a886dfdbfe7136d(a)syzkaller.appspotmail.com
Signed-off-by: Waiman Long <longman(a)redhat.com>
Signed-off-by: Boqun Feng <boqun.feng(a)gmail.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Link: https://lore.kernel.org/r/20250307232717.1759087-3-boqun.feng@gmail.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/locking/semaphore.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 34bfae72f2952..de9117c0e671e 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -29,6 +29,7 @@
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
@@ -38,7 +39,7 @@ static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
static noinline int __down_killable(struct semaphore *sem);
static noinline int __down_timeout(struct semaphore *sem, long timeout);
-static noinline void __up(struct semaphore *sem);
+static noinline void __up(struct semaphore *sem, struct wake_q_head *wake_q);
/**
* down - acquire the semaphore
@@ -183,13 +184,16 @@ EXPORT_SYMBOL(down_timeout);
void __sched up(struct semaphore *sem)
{
unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
- __up(sem);
+ __up(sem, &wake_q);
raw_spin_unlock_irqrestore(&sem->lock, flags);
+ if (!wake_q_empty(&wake_q))
+ wake_up_q(&wake_q);
}
EXPORT_SYMBOL(up);
@@ -269,11 +273,12 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
}
-static noinline void __sched __up(struct semaphore *sem)
+static noinline void __sched __up(struct semaphore *sem,
+ struct wake_q_head *wake_q)
{
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
waiter->up = true;
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
}
--
2.39.5