In NMI context, we might be in the middle of context switching or in the middle of switch_mm_irqs_off(). In either case, CR3 might not match current->mm, which could cause copy_from_user_nmi() and friends to read the wrong memory.
Fix it by adding a new nmi_uaccess_okay() helper and checking it in copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
Cc: stable@vger.kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Nadav Amit nadav.amit@gmail.com Signed-off-by: Andy Lutomirski luto@kernel.org ---
Nadav, this is intended for your series. Want to add it right before the use_temporary_mm() stuff?
Changes from v1: - Improved comments - Add debugging - Fix bugs
arch/x86/events/core.c | 2 +- arch/x86/include/asm/tlbflush.h | 44 +++++++++++++++++++++++++++++++++ arch/x86/lib/usercopy.c | 5 ++++ arch/x86/mm/tlb.c | 7 ++++++ 4 files changed, 57 insertions(+), 1 deletion(-)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 5f4829f10129..dfb2f7c0d019 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2465,7 +2465,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
perf_callchain_store(entry, regs->ip);
- if (!current->mm) + if (!nmi_uaccess_okay()) return;
if (perf_callchain_user32(regs, entry)) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 29c9da6c62fc..183d36a98b04 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -175,8 +175,16 @@ struct tlb_state { * are on. This means that it may not match current->active_mm, * which will contain the previous user mm when we're in lazy TLB * mode even if we've already switched back to swapper_pg_dir. + * + * During switch_mm_irqs_off(), loaded_mm will be set to + * LOADED_MM_SWITCHING during the brief interrupts-off window + * when CR3 and loaded_mm would otherwise be inconsistent. This + * is for nmi_uaccess_okay()'s benefit. */ struct mm_struct *loaded_mm; + +#define LOADED_MM_SWITCHING ((struct mm_struct *)1) + u16 loaded_mm_asid; u16 next_asid; /* last user mm's ctx id */ @@ -246,6 +254,42 @@ struct tlb_state { }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+/* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or + * switching the loaded mm. It can also be dangerous if we + * interrupted some kernel code that was temporarily using a + * different mm. + */ +static inline bool nmi_uaccess_okay(void) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *current_mm = current->mm; + +#ifdef CONFIG_DEBUG_VM + WARN_ON_ONCE(!loaded_mm); +#endif + + /* + * The condition we want to check is + * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, + * if we're running in a VM with shadow paging, and nmi_uaccess_okay() + * is supposed to be reasonably fast. + * + * Instead, we check the almost equivalent but somewhat conservative + * condition below, and we rely on the fact that switch_mm_irqs_off() + * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. + */ + if (loaded_mm != current_mm) + return false; + +#ifdef CONFIG_DEBUG_VM + WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); +#endif + + return true; +} + /* Initialize cr4 shadow for this CPU. */ static inline void cr4_init_shadow(void) { diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index c8c6ad0d58b8..3f435d7fca5e 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -7,6 +7,8 @@ #include <linux/uaccess.h> #include <linux/export.h>
+#include <asm/tlbflush.h> + /* * We rely on the nested NMI work to allow atomic faults from the NMI path; the * nested NMI paths are careful to preserve CR2. @@ -19,6 +21,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) if (__range_not_ok(from, n, TASK_SIZE)) return n;
+ if (!nmi_uaccess_okay()) + return n; + /* * Even though this function is typically called from NMI/IRQ context * disable pagefaults so that its behaviour is consistent even when diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index e721cf2d4290..707d2b2a333f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -304,6 +304,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
+ /* Let nmi_uaccess_okay() know that we're changing CR3. */ + this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); + barrier(); + if (need_flush) { this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); @@ -334,6 +338,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (next != &init_mm) this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+ /* Make sure we write CR3 before loaded_mm. */ + barrier(); + this_cpu_write(cpu_tlbstate.loaded_mm, next); this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); }
On Wed, 2018-08-29 at 08:47 -0700, Andy Lutomirski wrote:
In NMI context, we might be in the middle of context switching or in the middle of switch_mm_irqs_off(). In either case, CR3 might not match current->mm, which could cause copy_from_user_nmi() and friends to read the wrong memory.
Fix it by adding a new nmi_uaccess_okay() helper and checking it in copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
Cc: stable@vger.kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Nadav Amit nadav.amit@gmail.com Signed-off-by: Andy Lutomirski luto@kernel.org
Reviewed-by: Rik van Riel riel@surriel.com
at 8:47 AM, Andy Lutomirski luto@kernel.org wrote:
In NMI context, we might be in the middle of context switching or in the middle of switch_mm_irqs_off(). In either case, CR3 might not match current->mm, which could cause copy_from_user_nmi() and friends to read the wrong memory.
Fix it by adding a new nmi_uaccess_okay() helper and checking it in copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
Cc: stable@vger.kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Nadav Amit nadav.amit@gmail.com Signed-off-by: Andy Lutomirski luto@kernel.org
Nadav, this is intended for your series. Want to add it right before the use_temporary_mm() stuff?
Sure. Thanks! I will apply the following small fix:
+#ifdef CONFIG_DEBUG_VM
- WARN_ON_ONCE(!loaded_mm);
+#endif
Will be changed to VM_WARN_ON_ONCE() in the two instances.
Regards, Nadav
On Wed, 29 Aug 2018, Nadav Amit wrote:
at 8:47 AM, Andy Lutomirski luto@kernel.org wrote:
In NMI context, we might be in the middle of context switching or in the middle of switch_mm_irqs_off(). In either case, CR3 might not match current->mm, which could cause copy_from_user_nmi() and friends to read the wrong memory.
Fix it by adding a new nmi_uaccess_okay() helper and checking it in copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
Cc: stable@vger.kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Nadav Amit nadav.amit@gmail.com Signed-off-by: Andy Lutomirski luto@kernel.org
Nadav, this is intended for your series. Want to add it right before the use_temporary_mm() stuff?
Sure. Thanks! I will apply the following small fix:
+#ifdef CONFIG_DEBUG_VM
- WARN_ON_ONCE(!loaded_mm);
+#endif
Will be changed to VM_WARN_ON_ONCE() in the two instances.
Unless I'm completely lost, this can just be applied to tip right away. It's not depending on anything else.
Thanks,
tglx
On Aug 30, 2018, at 6:36 AM, Thomas Gleixner tglx@linutronix.de wrote:
On Wed, 29 Aug 2018, Nadav Amit wrote: at 8:47 AM, Andy Lutomirski luto@kernel.org wrote:
In NMI context, we might be in the middle of context switching or in the middle of switch_mm_irqs_off(). In either case, CR3 might not match current->mm, which could cause copy_from_user_nmi() and friends to read the wrong memory.
Fix it by adding a new nmi_uaccess_okay() helper and checking it in copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
Cc: stable@vger.kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Nadav Amit nadav.amit@gmail.com Signed-off-by: Andy Lutomirski luto@kernel.org
Nadav, this is intended for your series. Want to add it right before the use_temporary_mm() stuff?
Sure. Thanks! I will apply the following small fix:
+#ifdef CONFIG_DEBUG_VM
- WARN_ON_ONCE(!loaded_mm);
+#endif
Will be changed to VM_WARN_ON_ONCE() in the two instances.
Unless I'm completely lost, this can just be applied to tip right away. It's not depending on anything else.
Fine with me. Do you want to do the VM_WARN_ON cleanup yourself or should I send a v3?
On Thu, 30 Aug 2018, Andy Lutomirski wrote:
On Aug 30, 2018, at 6:36 AM, Thomas Gleixner tglx@linutronix.de wrote:
On Wed, 29 Aug 2018, Nadav Amit wrote: at 8:47 AM, Andy Lutomirski luto@kernel.org wrote:
In NMI context, we might be in the middle of context switching or in the middle of switch_mm_irqs_off(). In either case, CR3 might not match current->mm, which could cause copy_from_user_nmi() and friends to read the wrong memory.
Fix it by adding a new nmi_uaccess_okay() helper and checking it in copy_from_user_nmi() and in __copy_from_user_nmi()'s callers.
Cc: stable@vger.kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Nadav Amit nadav.amit@gmail.com Signed-off-by: Andy Lutomirski luto@kernel.org
Nadav, this is intended for your series. Want to add it right before the use_temporary_mm() stuff?
Sure. Thanks! I will apply the following small fix:
+#ifdef CONFIG_DEBUG_VM
- WARN_ON_ONCE(!loaded_mm);
+#endif
Will be changed to VM_WARN_ON_ONCE() in the two instances.
Unless I'm completely lost, this can just be applied to tip right away. It's not depending on anything else.
Fine with me. Do you want to do the VM_WARN_ON cleanup yourself or should I send a v3?
I think, I'll manage
linux-stable-mirror@lists.linaro.org