Avoid that lockdep reports the following:
===================================================== WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected 5.0.0-rc4+ #56 Not tainted ----------------------------------------------------- syz-executor5/9727 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: 00000000a4278d31 (&ctx->fault_pending_wqh){+.+.}, at: spin_lock include/linux/spinlock.h:329 [inline] 00000000a4278d31 (&ctx->fault_pending_wqh){+.+.}, at: userfaultfd_ctx_read fs/userfaultfd.c:1040 [inline] 00000000a4278d31 (&ctx->fault_pending_wqh){+.+.}, at: userfaultfd_read+0x540/0x1940 fs/userfaultfd.c:1198
and this task is already holding: 000000000e5b4350 (&ctx->fd_wqh){....}, at: spin_lock_irq include/linux/spinlock.h:354 [inline] 000000000e5b4350 (&ctx->fd_wqh){....}, at: userfaultfd_ctx_read fs/userfaultfd.c:1036 [inline] 000000000e5b4350 (&ctx->fd_wqh){....}, at: userfaultfd_read+0x27a/0x1940 fs/userfaultfd.c:1198 which would create a new lock dependency: (&ctx->fd_wqh){....} -> (&ctx->fault_pending_wqh){+.+.}
but this new dependency connects a SOFTIRQ-irq-safe lock: (&(&ctx->ctx_lock)->rlock){..-.}
... which became SOFTIRQ-irq-safe at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline] _raw_spin_lock_irq+0x60/0x80 kernel/locking/spinlock.c:160 spin_lock_irq include/linux/spinlock.h:354 [inline] free_ioctx_users+0x2d/0x4a0 fs/aio.c:610 percpu_ref_put_many include/linux/percpu-refcount.h:285 [inline] percpu_ref_put include/linux/percpu-refcount.h:301 [inline] percpu_ref_call_confirm_rcu lib/percpu-refcount.c:123 [inline] percpu_ref_switch_to_atomic_rcu+0x3e7/0x520 lib/percpu-refcount.c:158 __rcu_reclaim kernel/rcu/rcu.h:240 [inline] rcu_do_batch kernel/rcu/tree.c:2452 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2773 [inline] rcu_process_callbacks+0x928/0x1390 kernel/rcu/tree.c:2754 __do_softirq+0x266/0x95a kernel/softirq.c:292 run_ksoftirqd kernel/softirq.c:654 [inline] run_ksoftirqd+0x8e/0x110 kernel/softirq.c:646 smpboot_thread_fn+0x6ab/0xa10 kernel/smpboot.c:164 kthread+0x357/0x430 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352
to a SOFTIRQ-irq-unsafe lock: (&ctx->fault_pending_wqh){+.+.}
... which became SOFTIRQ-irq-unsafe at: ... lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] userfaultfd_release+0x497/0x6d0 fs/userfaultfd.c:916 __fput+0x2df/0x8d0 fs/file_table.c:278 ____fput+0x16/0x20 fs/file_table.c:309 task_work_run+0x14a/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline] syscall_return_slowpath arch/x86/entry/common.c:268 [inline] do_syscall_64+0x52d/0x610 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of: &(&ctx->ctx_lock)->rlock --> &ctx->fd_wqh --> &ctx->fault_pending_wqh
Possible interrupt unsafe locking scenario:
CPU0 CPU1 ---- ---- lock(&ctx->fault_pending_wqh); local_irq_disable(); lock(&(&ctx->ctx_lock)->rlock); lock(&ctx->fd_wqh); <Interrupt> lock(&(&ctx->ctx_lock)->rlock);
*** DEADLOCK ***
1 lock held by syz-executor5/9727: #0: 000000000e5b4350 (&ctx->fd_wqh){....}, at: spin_lock_irq include/linux/spinlock.h:354 [inline] #0: 000000000e5b4350 (&ctx->fd_wqh){....}, at: userfaultfd_ctx_read fs/userfaultfd.c:1036 [inline] #0: 000000000e5b4350 (&ctx->fd_wqh){....}, at: userfaultfd_read+0x27a/0x1940 fs/userfaultfd.c:1198
the dependencies between SOFTIRQ-irq-safe lock and the holding lock: -> (&(&ctx->ctx_lock)->rlock){..-.} { IN-SOFTIRQ-W at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline] _raw_spin_lock_irq+0x60/0x80 kernel/locking/spinlock.c:160 spin_lock_irq include/linux/spinlock.h:354 [inline] free_ioctx_users+0x2d/0x4a0 fs/aio.c:610 percpu_ref_put_many include/linux/percpu-refcount.h:285 [inline] percpu_ref_put include/linux/percpu-refcount.h:301 [inline] percpu_ref_call_confirm_rcu lib/percpu-refcount.c:123 [inline] percpu_ref_switch_to_atomic_rcu+0x3e7/0x520 lib/percpu-refcount.c:158 __rcu_reclaim kernel/rcu/rcu.h:240 [inline] rcu_do_batch kernel/rcu/tree.c:2452 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2773 [inline] rcu_process_callbacks+0x928/0x1390 kernel/rcu/tree.c:2754 __do_softirq+0x266/0x95a kernel/softirq.c:292 run_ksoftirqd kernel/softirq.c:654 [inline] run_ksoftirqd+0x8e/0x110 kernel/softirq.c:646 smpboot_thread_fn+0x6ab/0xa10 kernel/smpboot.c:164 kthread+0x357/0x430 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 INITIAL USE at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:128 [inline] _raw_spin_lock_irq+0x60/0x80 kernel/locking/spinlock.c:160 spin_lock_irq include/linux/spinlock.h:354 [inline] free_ioctx_users+0x2d/0x4a0 fs/aio.c:610 percpu_ref_put_many include/linux/percpu-refcount.h:285 [inline] percpu_ref_put include/linux/percpu-refcount.h:301 [inline] percpu_ref_call_confirm_rcu lib/percpu-refcount.c:123 [inline] percpu_ref_switch_to_atomic_rcu+0x3e7/0x520 lib/percpu-refcount.c:158 __rcu_reclaim kernel/rcu/rcu.h:240 [inline] rcu_do_batch kernel/rcu/tree.c:2452 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2773 [inline] rcu_process_callbacks+0x928/0x1390 kernel/rcu/tree.c:2754 __do_softirq+0x266/0x95a kernel/softirq.c:292 run_ksoftirqd kernel/softirq.c:654 [inline] run_ksoftirqd+0x8e/0x110 kernel/softirq.c:646 smpboot_thread_fn+0x6ab/0xa10 kernel/smpboot.c:164 kthread+0x357/0x430 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 } ... key at: [<ffffffff8a5760a0>] __key.51972+0x0/0x40 ... acquired at: __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] aio_poll fs/aio.c:1772 [inline] __io_submit_one fs/aio.c:1875 [inline] io_submit_one+0xedf/0x1cf0 fs/aio.c:1908 __do_sys_io_submit fs/aio.c:1953 [inline] __se_sys_io_submit fs/aio.c:1923 [inline] __x64_sys_io_submit+0x1bd/0x580 fs/aio.c:1923 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> (&ctx->fd_wqh){....} { INITIAL USE at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:152 __wake_up_common_lock+0xc7/0x190 kernel/sched/wait.c:120 __wake_up+0xe/0x10 kernel/sched/wait.c:145 userfaultfd_release+0x4f5/0x6d0 fs/userfaultfd.c:924 __fput+0x2df/0x8d0 fs/file_table.c:278 ____fput+0x16/0x20 fs/file_table.c:309 task_work_run+0x14a/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline] syscall_return_slowpath arch/x86/entry/common.c:268 [inline] do_syscall_64+0x52d/0x610 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe } ... key at: [<ffffffff8a575e20>] __key.44854+0x0/0x40 ... acquired at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] userfaultfd_ctx_read fs/userfaultfd.c:1040 [inline] userfaultfd_read+0x540/0x1940 fs/userfaultfd.c:1198 __vfs_read+0x116/0x8c0 fs/read_write.c:416 vfs_read+0x194/0x3e0 fs/read_write.c:452 ksys_read+0xea/0x1f0 fs/read_write.c:578 __do_sys_read fs/read_write.c:588 [inline] __se_sys_read fs/read_write.c:586 [inline] __x64_sys_read+0x73/0xb0 fs/read_write.c:586 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe
the dependencies between the lock to be acquired and SOFTIRQ-irq-unsafe lock: -> (&ctx->fault_pending_wqh){+.+.} { HARDIRQ-ON-W at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] userfaultfd_release+0x497/0x6d0 fs/userfaultfd.c:916 __fput+0x2df/0x8d0 fs/file_table.c:278 ____fput+0x16/0x20 fs/file_table.c:309 task_work_run+0x14a/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline] syscall_return_slowpath arch/x86/entry/common.c:268 [inline] do_syscall_64+0x52d/0x610 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe SOFTIRQ-ON-W at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] userfaultfd_release+0x497/0x6d0 fs/userfaultfd.c:916 __fput+0x2df/0x8d0 fs/file_table.c:278 ____fput+0x16/0x20 fs/file_table.c:309 task_work_run+0x14a/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline] syscall_return_slowpath arch/x86/entry/common.c:268 [inline] do_syscall_64+0x52d/0x610 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe INITIAL USE at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] userfaultfd_release+0x497/0x6d0 fs/userfaultfd.c:916 __fput+0x2df/0x8d0 fs/file_table.c:278 ____fput+0x16/0x20 fs/file_table.c:309 task_work_run+0x14a/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:188 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:166 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline] syscall_return_slowpath arch/x86/entry/common.c:268 [inline] do_syscall_64+0x52d/0x610 arch/x86/entry/common.c:293 entry_SYSCALL_64_after_hwframe+0x49/0xbe } ... key at: [<ffffffff8a575ee0>] __key.44851+0x0/0x40 ... acquired at: lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] userfaultfd_ctx_read fs/userfaultfd.c:1040 [inline] userfaultfd_read+0x540/0x1940 fs/userfaultfd.c:1198 __vfs_read+0x116/0x8c0 fs/read_write.c:416 vfs_read+0x194/0x3e0 fs/read_write.c:452 ksys_read+0xea/0x1f0 fs/read_write.c:578 __do_sys_read fs/read_write.c:588 [inline] __se_sys_read fs/read_write.c:586 [inline] __x64_sys_read+0x73/0xb0 fs/read_write.c:586 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe
stack backtrace: CPU: 1 PID: 9727 Comm: syz-executor5 Not tainted 5.0.0-rc4+ #56 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_bad_irq_dependency kernel/locking/lockdep.c:1573 [inline] check_usage.cold+0x60f/0x940 kernel/locking/lockdep.c:1605 check_irq_usage kernel/locking/lockdep.c:1661 [inline] check_prev_add_irq kernel/locking/lockdep_states.h:8 [inline] check_prev_add kernel/locking/lockdep.c:1871 [inline] check_prevs_add kernel/locking/lockdep.c:1979 [inline] validate_chain kernel/locking/lockdep.c:2350 [inline] __lock_acquire+0x1f47/0x4700 kernel/locking/lockdep.c:3338 lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144 spin_lock include/linux/spinlock.h:329 [inline] userfaultfd_ctx_read fs/userfaultfd.c:1040 [inline] userfaultfd_read+0x540/0x1940 fs/userfaultfd.c:1198 __vfs_read+0x116/0x8c0 fs/read_write.c:416 vfs_read+0x194/0x3e0 fs/read_write.c:452 ksys_read+0xea/0x1f0 fs/read_write.c:578 __do_sys_read fs/read_write.c:588 [inline] __se_sys_read fs/read_write.c:586 [inline] __x64_sys_read+0x73/0xb0 fs/read_write.c:586 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe
Cc: Christoph Hellwig hch@infradead.org Cc: Andrea Arcangeli aarcange@redhat.com Cc: stable@vger.kernel.org Fixes: ae62c16e105a ("userfaultfd: disable irqs when taking the waitqueue lock") Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: Bart Van Assche bvanassche@acm.org --- fs/userfaultfd.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 89800fc7dc9d..4bcaaee1ee84 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -458,7 +458,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) blocking_state = return_to_userland ? TASK_INTERRUPTIBLE : TASK_KILLABLE;
- spin_lock(&ctx->fault_pending_wqh.lock); + spin_lock_irq(&ctx->fault_pending_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland * through poll/read(). @@ -470,7 +470,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * __add_wait_queue. */ set_current_state(blocking_state); - spin_unlock(&ctx->fault_pending_wqh.lock); + spin_unlock_irq(&ctx->fault_pending_wqh.lock);
if (!is_vm_hugetlb_page(vmf->vma)) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, @@ -552,13 +552,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * kernel stack can be released after the list_del_init. */ if (!list_empty_careful(&uwq.wq.entry)) { - spin_lock(&ctx->fault_pending_wqh.lock); + spin_lock_irq(&ctx->fault_pending_wqh.lock); /* * No need of list_del_init(), the uwq on the stack * will be freed shortly anyway. */ list_del(&uwq.wq.entry); - spin_unlock(&ctx->fault_pending_wqh.lock); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); }
/* @@ -583,7 +583,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, init_waitqueue_entry(&ewq->wq, current); release_new_ctx = NULL;
- spin_lock(&ctx->event_wqh.lock); + spin_lock_irq(&ctx->event_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland * through poll/read(). @@ -613,15 +613,15 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, break; }
- spin_unlock(&ctx->event_wqh.lock); + spin_unlock_irq(&ctx->event_wqh.lock);
wake_up_poll(&ctx->fd_wqh, EPOLLIN); schedule();
- spin_lock(&ctx->event_wqh.lock); + spin_lock_irq(&ctx->event_wqh.lock); } __set_current_state(TASK_RUNNING); - spin_unlock(&ctx->event_wqh.lock); + spin_unlock_irq(&ctx->event_wqh.lock);
if (release_new_ctx) { struct vm_area_struct *vma; @@ -913,10 +913,10 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * the last page faults that may have been already waiting on * the fault_*wqh. */ - spin_lock(&ctx->fault_pending_wqh.lock); + spin_lock_irq(&ctx->fault_pending_wqh.lock); __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); - spin_unlock(&ctx->fault_pending_wqh.lock); + spin_unlock_irq(&ctx->fault_pending_wqh.lock);
/* Flush pending events that may still wait on event_wqh */ wake_up_all(&ctx->event_wqh); @@ -1129,7 +1129,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(ctx, fork_nctx, msg); - spin_lock(&ctx->event_wqh.lock); + spin_lock_irq(&ctx->event_wqh.lock); if (!list_empty(&fork_event)) { /* * The fork thread didn't abort, so we can @@ -1175,7 +1175,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (ret) userfaultfd_ctx_put(fork_nctx); } - spin_unlock(&ctx->event_wqh.lock); + spin_unlock_irq(&ctx->event_wqh.lock); }
return ret; @@ -1214,14 +1214,14 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, static void __wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { - spin_lock(&ctx->fault_pending_wqh.lock); + spin_lock_irq(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, range); if (waitqueue_active(&ctx->fault_wqh)) __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); - spin_unlock(&ctx->fault_pending_wqh.lock); + spin_unlock_irq(&ctx->fault_pending_wqh.lock); }
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, @@ -1872,7 +1872,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) wait_queue_entry_t *wq; unsigned long pending = 0, total = 0;
- spin_lock(&ctx->fault_pending_wqh.lock); + spin_lock_irq(&ctx->fault_pending_wqh.lock); list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { pending++; total++; @@ -1880,7 +1880,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { total++; } - spin_unlock(&ctx->fault_pending_wqh.lock); + spin_unlock_irq(&ctx->fault_pending_wqh.lock);
/* * If more protocols will be added, there will be all shown