While syscall got interrupted by NMI, it is possible to get userspace stack pointer if the esp/rsp register is not switched by syscall entry handler. In that case, show_stack_log_lvl did not judge the situation, kernel panic could occurr by the stack dump code.
PID: 1538 TASK: ffff882f68d13240 CPU: 31 COMMAND: "syslog-ng" [exception RIP: show_stack_log_lvl+265] RIP: ffffffff81017579 RSP: ffff882fbf1e5da8 RFLAGS: 00010046 RAX: 00007ffe9c495f10 RBX: 00007ffe9c495f08 RCX: 0000000000000000 RDX: ffff882fbf1e3fc0 RSI: ffff882fbf1e5ef8 RDI: 0000000000000000 RBP: ffff882fbf1e5df8 R8: ffff882fbf1dffc0 R9: ffffffff81d073d2 R10: 00000000000ab840 R11: 0000000000100000 R12: ffff882fbf1e5ef8 R13: 0000000000000000 R14: ffffffff8185286e R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 [exception RIP: system_call] RIP: ffffffff81647950 RSP: 00007ffe9c495f08 RFLAGS: 00000046 RAX: 00000000000000e8 RBX: 0000000000625400 RCX: 00007fc072cd62a3 RDX: 0000000000000009 RSI: 00007ffe9c495f10 RDI: 0000000000000004 RBP: 00007ffe9c495fd0 R8: 000000000000b0ee R9: 000000000000b0f1 R10: 0000000000000c4e R11: 0000000000000246 R12: 0000000000625458 R13: 0000000000625450 R14: 00007ffe9c495fe0 R15: 00007ffe9c496020 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 --- --- RIP: 00007fc072cd626a RSP: 00007ffe9c495f70 RFLAGS: 00000293 RAX: 0000000000000000 RBX: ffffffff816479c9 RCX: 0000000000000010 RDX: 0000000000000007 RSI: 0000000000000001 RDI: 0000000000000004 RBP: 00007ffe9c495fd0 R8: 000000000000b0ee R9: 000000000000b0f1 R10: 00007ffe9c495ec0 R11: 0000000000000246 R12: 00007ffe9c496020 R13: 00007ffe9c495ec0 R14: 0000000000625450 R15: 0000000000000001 ORIG_RAX: 00000000000000e9 CS: 0033 SS: 002b
Add to judge this scenario in show_stack_log_lvl.
Signed-off-by: Yuanliang Wang yuanliang.wyl@alibaba-inc.com --- arch/x86/kernel/dumpstack_32.c | 10 +++++++++- arch/x86/kernel/dumpstack_64.c | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f2a1770..8a1264e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -60,6 +60,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, { unsigned long *stack; int i; + unsigned long s;
if (sp == NULL) { if (task) @@ -74,7 +75,14 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) pr_cont("\n"); - pr_cont(" %08lx", *stack++); + if (stack < (unsigned long *)PAGE_OFFSET + || probe_kernel_address(stack, s)) { + printk_once(KERN_ERR "sp %p is user space addr\n", + stack); + pr_cont(" %016lx", s); + stack++; + } else + pr_cont(" %08lx", *stack++); touch_nmi_watchdog(); } pr_cont("\n"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index addb207..4446ebc 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -203,6 +203,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *irq_stack_end; unsigned long *irq_stack; unsigned long *stack; + unsigned long s; int cpu; int i;
@@ -236,7 +237,14 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) pr_cont("\n"); - pr_cont(" %016lx", *stack++); + if (stack < (unsigned long *)PAGE_OFFSET + || probe_kernel_address(stack, s)) { + printk_once(KERN_ERR "sp %p is user space addr\n", + stack); + pr_cont(" %016lx", s); + stack++; + } else + pr_cont(" %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable();
Adding Josh.
On Mon, Dec 25, 2017 at 1:03 AM, 王元良 yuanliang.wyl@alibaba-inc.com wrote:
While syscall got interrupted by NMI, it is possible to get userspace stack pointer if the esp/rsp register is not switched by syscall entry handler. In that case, show_stack_log_lvl did not judge the situation, kernel panic could occurr by the stack dump code.
PID: 1538 TASK: ffff882f68d13240 CPU: 31 COMMAND: "syslog-ng" [exception RIP: show_stack_log_lvl+265] RIP: ffffffff81017579 RSP: ffff882fbf1e5da8 RFLAGS: 00010046 RAX: 00007ffe9c495f10 RBX: 00007ffe9c495f08 RCX: 0000000000000000 RDX: ffff882fbf1e3fc0 RSI: ffff882fbf1e5ef8 RDI: 0000000000000000 RBP: ffff882fbf1e5df8 R8: ffff882fbf1dffc0 R9: ffffffff81d073d2 R10: 00000000000ab840 R11: 0000000000100000 R12: ffff882fbf1e5ef8 R13: 0000000000000000 R14: ffffffff8185286e R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 [exception RIP: system_call] RIP: ffffffff81647950 RSP: 00007ffe9c495f08 RFLAGS: 00000046 RAX: 00000000000000e8 RBX: 0000000000625400 RCX: 00007fc072cd62a3 RDX: 0000000000000009 RSI: 00007ffe9c495f10 RDI: 0000000000000004 RBP: 00007ffe9c495fd0 R8: 000000000000b0ee R9: 000000000000b0f1 R10: 0000000000000c4e R11: 0000000000000246 R12: 0000000000625458 R13: 0000000000625450 R14: 00007ffe9c495fe0 R15: 00007ffe9c496020 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
RIP: 00007fc072cd626a RSP: 00007ffe9c495f70 RFLAGS: 00000293 RAX: 0000000000000000 RBX: ffffffff816479c9 RCX: 0000000000000010 RDX: 0000000000000007 RSI: 0000000000000001 RDI: 0000000000000004 RBP: 00007ffe9c495fd0 R8: 000000000000b0ee R9: 000000000000b0f1 R10: 00007ffe9c495ec0 R11: 0000000000000246 R12: 00007ffe9c496020 R13: 00007ffe9c495ec0 R14: 0000000000625450 R15: 0000000000000001 ORIG_RAX: 00000000000000e9 CS: 0033 SS: 002b
Add to judge this scenario in show_stack_log_lvl.
Signed-off-by: Yuanliang Wang yuanliang.wyl@alibaba-inc.com
arch/x86/kernel/dumpstack_32.c | 10 +++++++++- arch/x86/kernel/dumpstack_64.c | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f2a1770..8a1264e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -60,6 +60,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, { unsigned long *stack; int i;
unsigned long s; if (sp == NULL) { if (task)
@@ -74,7 +75,14 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) pr_cont("\n");
pr_cont(" %08lx", *stack++);
if (stack < (unsigned long *)PAGE_OFFSET
|| probe_kernel_address(stack, s)) {
printk_once(KERN_ERR "sp %p is user space addr\n",
stack);
pr_cont(" %016lx", s);
stack++;
} else
pr_cont(" %08lx", *stack++);
NAK. If we already screwed up by trying to unwind through SYSCALL, "stack" is entirely user-controlled and this check is nowhere near sufficient. But the right fix is to prevent this from happening. The kernel should notice that the nmi interrupted a non-unwindable context and not try to unwind past it.
Josh, can you remind me exactly how this works in the 64-bit case with ORC disabled?
touch_nmi_watchdog(); } pr_cont("\n");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index addb207..4446ebc 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -203,6 +203,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *irq_stack_end; unsigned long *irq_stack; unsigned long *stack;
unsigned long s; int cpu; int i;
@@ -236,7 +237,14 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) pr_cont("\n");
pr_cont(" %016lx", *stack++);
if (stack < (unsigned long *)PAGE_OFFSET
|| probe_kernel_address(stack, s)) {
printk_once(KERN_ERR "sp %p is user space addr\n",
stack);
pr_cont(" %016lx", s);
stack++;
} else
pr_cont(" %016lx", *stack++);
Extra NAK, since the 64-bit case is already correct.
On Mon, Dec 25, 2017 at 6:43 AM, Andy Lutomirski luto@kernel.org wrote:
Adding Josh.
On Mon, Dec 25, 2017 at 1:03 AM, 王元良 yuanliang.wyl@alibaba-inc.com wrote:
While syscall got interrupted by NMI, it is possible to get userspace stack pointer if the esp/rsp register is not switched by syscall entry handler. In that case, show_stack_log_lvl did not judge the situation, kernel panic could occurr by the stack dump code.
PID: 1538 TASK: ffff882f68d13240 CPU: 31 COMMAND: "syslog-ng" [exception RIP: show_stack_log_lvl+265] RIP: ffffffff81017579 RSP: ffff882fbf1e5da8 RFLAGS: 00010046 RAX: 00007ffe9c495f10 RBX: 00007ffe9c495f08 RCX: 0000000000000000 RDX: ffff882fbf1e3fc0 RSI: ffff882fbf1e5ef8 RDI: 0000000000000000 RBP: ffff882fbf1e5df8 R8: ffff882fbf1dffc0 R9: ffffffff81d073d2 R10: 00000000000ab840 R11: 0000000000100000 R12: ffff882fbf1e5ef8 R13: 0000000000000000 R14: ffffffff8185286e R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 [exception RIP: system_call] RIP: ffffffff81647950 RSP: 00007ffe9c495f08 RFLAGS: 00000046 RAX: 00000000000000e8 RBX: 0000000000625400 RCX: 00007fc072cd62a3 RDX: 0000000000000009 RSI: 00007ffe9c495f10 RDI: 0000000000000004 RBP: 00007ffe9c495fd0 R8: 000000000000b0ee R9: 000000000000b0f1 R10: 0000000000000c4e R11: 0000000000000246 R12: 0000000000625458 R13: 0000000000625450 R14: 00007ffe9c495fe0 R15: 00007ffe9c496020 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
RIP: 00007fc072cd626a RSP: 00007ffe9c495f70 RFLAGS: 00000293 RAX: 0000000000000000 RBX: ffffffff816479c9 RCX: 0000000000000010 RDX: 0000000000000007 RSI: 0000000000000001 RDI: 0000000000000004 RBP: 00007ffe9c495fd0 R8: 000000000000b0ee R9: 000000000000b0f1 R10: 00007ffe9c495ec0 R11: 0000000000000246 R12: 00007ffe9c496020 R13: 00007ffe9c495ec0 R14: 0000000000625450 R15: 0000000000000001 ORIG_RAX: 00000000000000e9 CS: 0033 SS: 002b
Add to judge this scenario in show_stack_log_lvl.
Signed-off-by: Yuanliang Wang yuanliang.wyl@alibaba-inc.com
arch/x86/kernel/dumpstack_32.c | 10 +++++++++- arch/x86/kernel/dumpstack_64.c | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f2a1770..8a1264e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -60,6 +60,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, { unsigned long *stack; int i;
unsigned long s; if (sp == NULL) { if (task)
@@ -74,7 +75,14 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) pr_cont("\n");
pr_cont(" %08lx", *stack++);
if (stack < (unsigned long *)PAGE_OFFSET
|| probe_kernel_address(stack, s)) {
printk_once(KERN_ERR "sp %p is user space addr\n",
stack);
pr_cont(" %016lx", s);
stack++;
} else
pr_cont(" %08lx", *stack++);
NAK. If we already screwed up by trying to unwind through SYSCALL, "stack" is entirely user-controlled and this check is nowhere near sufficient. But the right fix is to prevent this from happening. The kernel should notice that the nmi interrupted a non-unwindable context and not try to unwind past it.
Josh, can you remind me exactly how this works in the 64-bit case with ORC disabled?
I spoke too quickly. This case isn't even possible on x86_32 because we don't enable SYSCALL.
Are you sure you're seeing a problem on a recent upstream kernel?
linux-stable-mirror@lists.linaro.org