From: Valentin Schneider vschneid@redhat.com
commit 811d581194f7412eda97acc03d17fc77824b561f upstream.
Attempting to get a crash dump out of a debug PREEMPT_RT kernel via an NMI panic() doesn't work. The cause of that lies in the PREEMPT_RT definition of mutex_trylock():
	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES) && WARN_ON_ONCE(!in_task()))
		return 0;
This prevents an nmi_panic() from executing the main body of __crash_kexec() which does the actual kexec into the kdump kernel. The warning and return are explained by:
6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context")
[...]
The reasons for this are:
1) There is a potential deadlock in the slowpath
2) Another cpu which blocks on the rtmutex will boost the task which allegedly locked the rtmutex, but that cannot work because the hard/softirq context borrows the task context.
Furthermore, grabbing the lock isn't NMI safe, so do away with kexec_mutex and replace it with an atomic variable. This is somewhat overzealous as *some* callsites could keep using a mutex (e.g. the sysfs-facing ones like crash_shrink_memory()), but this has the benefit of involving a single unified lock and preventing any future NMI-related surprises.
Tested by triggering NMI panics via:
$ echo 1 > /proc/sys/kernel/panic_on_unrecovered_nmi
$ echo 1 > /proc/sys/kernel/unknown_nmi_panic
$ echo 1 > /proc/sys/kernel/panic
$ ipmitool power diag
Link: https://lkml.kernel.org/r/20220630223258.4144112-3-vschneid@redhat.com
Fixes: 6ce47fd961fa ("rtmutex: Warn if trylock is called from hard/softirq context")
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Baoquan He <bhe@redhat.com>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Juri Lelli <jlelli@redhat.com>
Cc: Luis Claudio R. Goncalves <lgoncalv@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: stable@vger.kernel.org # 5.10+
Signed-off-by: Wen Yang <wenyang.linux@foxmail.com>
---
 kernel/kexec.c          | 11 ++++-------
 kernel/kexec_core.c     | 20 ++++++++++----------
 kernel/kexec_file.c     |  4 ++--
 kernel/kexec_internal.h | 15 ++++++++++++++-
 4 files changed, 30 insertions(+), 20 deletions(-)
diff --git a/kernel/kexec.c b/kernel/kexec.c index 9c7aef8f4bb6..f0f0c6555454 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -112,13 +112,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
/* * Because we write directly to the reserved memory region when loading - * crash kernels we need a mutex here to prevent multiple crash kernels - * from attempting to load simultaneously, and to prevent a crash kernel - * from loading over the top of a in use crash kernel. - * - * KISS: always take the mutex. + * crash kernels we need a serialization here to prevent multiple crash + * kernels from attempting to load simultaneously. */ - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY;
if (flags & KEXEC_ON_CRASH) { @@ -184,7 +181,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
kimage_free(image); out_unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; }
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index e47870f30728..7a8104d48997 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -45,7 +45,7 @@ #include <crypto/sha.h> #include "kexec_internal.h"
-DEFINE_MUTEX(kexec_mutex); +atomic_t __kexec_lock = ATOMIC_INIT(0);
/* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -943,7 +943,7 @@ int kexec_load_disabled; */ void __noclone __crash_kexec(struct pt_regs *regs) { - /* Take the kexec_mutex here to prevent sys_kexec_load + /* Take the kexec_lock here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * @@ -951,7 +951,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ - if (mutex_trylock(&kexec_mutex)) { + if (kexec_trylock()) { if (kexec_crash_image) { struct pt_regs fixed_regs;
@@ -960,7 +960,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - mutex_unlock(&kexec_mutex); + kexec_unlock(); } } STACK_FRAME_NON_STANDARD(__crash_kexec); @@ -993,13 +993,13 @@ ssize_t crash_get_memory_size(void) { ssize_t size = 0;
- if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY;
if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res);
- mutex_unlock(&kexec_mutex); + kexec_unlock(); return size; }
@@ -1019,7 +1019,7 @@ int crash_shrink_memory(unsigned long new_size) unsigned long old_size; struct resource *ram_res;
- if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY;
if (kexec_crash_image) { @@ -1058,7 +1058,7 @@ int crash_shrink_memory(unsigned long new_size) insert_resource(&iomem_resource, ram_res);
unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; }
@@ -1130,7 +1130,7 @@ int kernel_kexec(void) { int error = 0;
- if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; @@ -1205,7 +1205,7 @@ int kernel_kexec(void) #endif
Unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return error; }
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index fff11916aba3..b9c857782ada 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -343,7 +343,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
image = NULL;
- if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY;
dest_image = &kexec_image; @@ -415,7 +415,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres();
- mutex_unlock(&kexec_mutex); + kexec_unlock(); kimage_free(image); return ret; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 39d30ccf8d87..49d4e3ab9c96 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,7 +15,20 @@ int kimage_is_destination_range(struct kimage *image,
int machine_kexec_post_load(struct kimage *image);
-extern struct mutex kexec_mutex; +/* + * Whatever is used to serialize accesses to the kexec_crash_image needs to be + * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a + * "simple" atomic variable that is acquired with a cmpxchg(). + */ +extern atomic_t __kexec_lock; +static inline bool kexec_trylock(void) +{ + return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0; +} +static inline void kexec_unlock(void) +{ + atomic_set_release(&__kexec_lock, 0); +}
#ifdef CONFIG_KEXEC_FILE #include <linux/purgatory.h>
On Thu, Mar 02, 2023 at 12:25:02AM +0800, wenyang.linux@foxmail.com wrote:
> From: Valentin Schneider vschneid@redhat.com
> commit 811d581194f7412eda97acc03d17fc77824b561f upstream.
No it is not :(
Also, why is this needed for 5.10.y? PREEMPT_RT is not in 5.10, right?
thanks,
greg k-h
On 03/03/23 16:51, Greg Kroah-Hartman wrote:
> On Thu, Mar 02, 2023 at 12:25:02AM +0800, wenyang.linux@foxmail.com wrote:
>> From: Valentin Schneider vschneid@redhat.com
>> commit 811d581194f7412eda97acc03d17fc77824b561f upstream.
> No it is not :(
> Also, why is this needed for 5.10.y? PREEMPT_RT is not in 5.10, right?
The mutex_unlock() might end up waking a waiter, and wake_up_process() really isn't NMI safe (regardless of PREEMPT_RT - the PREEMPT_RT warning just highlighted the issue).
> thanks,
> greg k-h
linux-stable-mirror@lists.linaro.org