From: Richard Fellner richard.fellner@student.tugraz.at
This patch introduces our implementation of KAISER (Kernel Address Isolation to have Side-channels Efficiently Removed), a kernel isolation technique to close hardware side channels on kernel address information.
More information about the patch can be found on:
https://github.com/IAIK/KAISER
From: Richard Fellner richard.fellner@student.tugraz.at From: Daniel Gruss daniel.gruss@iaik.tugraz.at X-Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Date: Thu, 4 May 2017 14:26:50 +0200 Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2 Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5
To: linux-kernel@vger.kernel.org To: kernel-hardening@lists.openwall.com Cc: clementine.maurice@iaik.tugraz.at Cc: moritz.lipp@iaik.tugraz.at Cc: Michael Schwarz michael.schwarz@iaik.tugraz.at Cc: Richard Fellner richard.fellner@student.tugraz.at Cc: Ingo Molnar mingo@kernel.org Cc: kirill.shutemov@linux.intel.com Cc: anders.fogh@gdata-adan.de
After several recent works [1,2,3] KASLR on x86_64 was basically considered dead by many researchers. We have been working on an efficient but effective fix for this problem and found that not mapping the kernel space when running in user mode is the solution to this problem [4] (the corresponding paper [5] will be presented at ESSoS17).
With this RFC patch we allow anybody to configure their kernel with the flag CONFIG_KAISER to add our defense mechanism.
If there are any questions we would love to answer them. We also appreciate any comments!
Cheers, Daniel (+ the KAISER team from Graz University of Technology)
[1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf [2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-... [3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Add... [4] https://github.com/IAIK/KAISER [5] https://gruss.cc/files/kaiser.pdf
[patch based also on https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kern...]
Signed-off-by: Richard Fellner richard.fellner@student.tugraz.at Signed-off-by: Moritz Lipp moritz.lipp@iaik.tugraz.at Signed-off-by: Daniel Gruss daniel.gruss@iaik.tugraz.at Signed-off-by: Michael Schwarz michael.schwarz@iaik.tugraz.at Acked-by: Jiri Kosina jkosina@suse.cz Signed-off-by: Hugh Dickins hughd@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org (cherry picked from commit 8a43ddfb93a0c6ae1a6e1f5c25705ec5d1843c40) Signed-off-by: Pavel Tatashin pasha.tatashin@oracle.com
Conflicts: arch/x86/entry/entry_64.S (not in this tree) arch/x86/kernel/entry_64.S (patched instead of that) arch/x86/entry/entry_64_compat.S (not in this tree) arch/x86/ia32/ia32entry.S (patched instead of that) arch/x86/include/asm/hw_irq.h arch/x86/kernel/irqinit.c kernel/fork.c --- arch/x86/ia32/ia32entry.S | 6 ++ arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/kaiser.h | 113 +++++++++++++++++++++++++ arch/x86/include/asm/pgtable.h | 4 + arch/x86/include/asm/pgtable_64.h | 21 +++++ arch/x86/include/asm/pgtable_types.h | 12 ++- arch/x86/include/asm/processor.h | 7 +- arch/x86/kernel/cpu/common.c | 4 +- arch/x86/kernel/entry_64.S | 18 +++- arch/x86/kernel/espfix_64.c | 6 ++ arch/x86/kernel/head_64.S | 16 +++- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/mm/Makefile | 1 + arch/x86/mm/kaiser.c | 160 +++++++++++++++++++++++++++++++++++ arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pgtable.c | 26 ++++++ include/asm-generic/vmlinux.lds.h | 11 ++- include/linux/percpu-defs.h | 30 +++++++ init/main.c | 6 ++ kernel/fork.c | 8 ++ security/Kconfig | 7 ++ 22 files changed, 448 insertions(+), 16 deletions(-) create mode 100644 arch/x86/include/asm/kaiser.h create mode 100644 arch/x86/mm/kaiser.c
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index b2fafcb37d4e..665e2c7887fe 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -15,6 +15,7 @@ #include <asm/irqflags.h> #include <asm/asm.h> #include <asm/smap.h> +#include <asm/kaiser.h> #include <linux/linkage.h> #include <linux/err.h>
@@ -119,6 +120,7 @@ ENTRY(ia32_sysenter_target) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ENABLE_INTERRUPTS(CLBR_NONE)
@@ -207,6 +209,7 @@ sysexit_from_sys_call: movl EFLAGS(%rsp),%r11d /* User eflags */ /*CFI_RESTORE rflags*/ TRACE_IRQS_ON + SWITCH_USER_CR3
/* * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, @@ -354,6 +357,7 @@ ENTRY(ia32_cstar_target) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp @@ -419,6 +423,7 @@ sysretl_from_sys_call: xorq %r9,%r9 xorq %r8,%r8 TRACE_IRQS_ON + SWITCH_USER_CR3 movl RSP(%rsp),%esp CFI_RESTORE rsp /* @@ -513,6 +518,7 @@ ENTRY(ia32_syscall) PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS + SWITCH_KERNEL_CR3_NO_STACK ENABLE_INTERRUPTS(CLBR_NONE)
/* Zero-extending 32-bit regs, do not remove */ diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index e9571ddabc4f..08017d37db0a 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -190,7 +190,7 @@ extern char irq_entries_start[]; #define VECTOR_RETRIGGERED (-2)
typedef int vector_irq_t[NR_VECTORS]; -DECLARE_PER_CPU(vector_irq_t, vector_irq); +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h new file mode 100644 index 000000000000..63ee8309b35b --- /dev/null +++ b/arch/x86/include/asm/kaiser.h @@ -0,0 +1,113 @@ +#ifndef _ASM_X86_KAISER_H +#define _ASM_X86_KAISER_H + +/* This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. + * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, + * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. + * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. + * + * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions + * of the user space, or the stacks. + */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_KAISER + +.macro _SWITCH_TO_KERNEL_CR3 reg +movq %cr3, \reg +andq $(~0x1000), \reg +movq \reg, %cr3 +.endm + +.macro _SWITCH_TO_USER_CR3 reg +movq %cr3, \reg +orq $(0x1000), \reg +movq \reg, %cr3 +.endm + +.macro SWITCH_KERNEL_CR3 +pushq %rax +_SWITCH_TO_KERNEL_CR3 %rax +popq %rax +.endm + +.macro SWITCH_USER_CR3 +pushq %rax +_SWITCH_TO_USER_CR3 %rax +popq %rax +.endm + +.macro SWITCH_KERNEL_CR3_NO_STACK +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_KERNEL_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +.endm + + +.macro SWITCH_USER_CR3_NO_STACK + +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_USER_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax + +.endm + +#else /* CONFIG_KAISER */ + +.macro SWITCH_KERNEL_CR3 reg +.endm +.macro SWITCH_USER_CR3 reg +.endm +.macro SWITCH_USER_CR3_NO_STACK +.endm +.macro SWITCH_KERNEL_CR3_NO_STACK +.endm + +#endif /* CONFIG_KAISER */ +#else /* __ASSEMBLY__ */ + + +#ifdef CONFIG_KAISER +// Upon kernel/user mode switch, it may happen that +// the address space has to be switched before the registers have been stored. +// To change the address space, another register is needed. +// A register therefore has to be stored/restored. +// +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +#endif /* CONFIG_KAISER */ + +/** + * shadowmem_add_mapping - map a virtual memory part to the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * + * the mapping is done on a global scope, so no bigger synchronization has to be done. + * the pages have to be manually unmapped again when they are not needed any longer. + */ +extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + + +/** + * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + +/** + * shadowmem_initialize_mapping - Initalize the shadow mapping + * + * most parts of the shadow mapping can be mapped upon boot time. + * only the thread stacks have to be mapped on runtime. + * the mapped regions are not unmapped at all. + */ +extern void kaiser_init(void); + +#endif + + + +#endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index fe57e7a98839..53d44406af45 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -829,6 +829,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_KAISER + // clone the shadow pgd part as well + memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); +#endif }
#define PTE_SHIFT ilog2(PTRS_PER_PTE) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 2ee781114d34..2131edda6620 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); }
+#ifdef CONFIG_KAISER +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); +} + +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); +} +#endif /* CONFIG_KAISER */ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { +#ifdef CONFIG_KAISER + // We know that a pgd is page aligned. + // Therefore the lower indices have to be mapped to user space. + // These pages are mapped to the shadow mapping. + if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + + pgdp->pgd = pgd.pgd & ~_PAGE_USER; +#else /* CONFIG_KAISER */ *pgdp = pgd; +#endif }
static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 74fcdf3f1534..c0a0560b19af 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -39,7 +39,11 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#ifdef CONFIG_KAISER +#define _PAGE_GLOBAL (_AT(pteval_t, 0)) +#else +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) @@ -89,7 +93,11 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#ifdef CONFIG_KAISER +#define _PAGE_PROTNONE (_AT(pteval_t, 0)) +#else +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#endif
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 23ba6765b718..4abbd7d7cfb0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -300,7 +300,7 @@ struct tss_struct {
} ____cacheline_aligned;
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); @@ -449,6 +449,11 @@ union irq_stack_union { char gs_base[40]; unsigned long stack_canary; }; + + struct { + char irq_stack_pointer[64]; + char unused[IRQ_STACK_SIZE - 64]; + }; };
DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4f1db34113e2..47194fac7eb7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -91,7 +91,7 @@ static const struct cpu_dev default_cpu = {
static const struct cpu_dev *this_cpu = &default_cpu;
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 /* * We need valid kernel segments for data and code in long mode too @@ -1247,7 +1247,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ };
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index eaf3d4df76d5..8435b806d6a5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -45,6 +45,7 @@ #include <asm/context_tracking.h> #include <asm/smap.h> #include <asm/pgtable_types.h> +#include <asm/kaiser.h> #include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ @@ -208,6 +209,7 @@ ENTRY(system_call) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* * A hypervisor implementation might want to use a label * after the swapgs, so that it can do the swapgs @@ -284,11 +286,12 @@ system_call_fastpath:
CFI_REMEMBER_STATE
- RESTORE_C_REGS_EXCEPT_RCX_R11 movq RIP(%rsp),%rcx CFI_REGISTER rip,rcx movq EFLAGS(%rsp),%r11 /*CFI_REGISTER rflags,r11*/ + RESTORE_C_REGS_EXCEPT_RCX_R11 + SWITCH_USER_CR3 movq RSP(%rsp),%rsp /* * 64bit SYSRET restores rip from rcx, @@ -477,11 +480,13 @@ syscall_return_via_sysret: CFI_REMEMBER_STATE /* r11 is already restored (see code above) */ RESTORE_C_REGS_EXCEPT_R11 + SWITCH_USER_CR3 movq RSP(%rsp),%rsp USERGS_SYSRET64 CFI_RESTORE_STATE
opportunistic_sysret_failed: + SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret CFI_ENDPROC @@ -689,6 +694,7 @@ END(irq_entries_start) testl $3, CS-RBP(%rsp) je 1f SWAPGS + SWITCH_KERNEL_CR3 1: /* * Save previous stack pointer, optionally switch to interrupt stack. @@ -765,6 +771,7 @@ retint_swapgs: /* return to user-space */ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ
+ SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret
@@ -820,6 +827,7 @@ native_irq_return_ldt: pushq_cfi %rax pushq_cfi %rdi SWAPGS + SWITCH_KERNEL_CR3 movq PER_CPU_VAR(espfix_waddr),%rdi movq %rax,(0*8)(%rdi) /* RAX */ movq (2*8)(%rsp),%rax /* RIP */ @@ -835,6 +843,7 @@ native_irq_return_ldt: andl $0xffff0000,%eax popq_cfi %rdi orq PER_CPU_VAR(espfix_stack),%rax + SWITCH_USER_CR3 SWAPGS movq %rax,%rsp popq_cfi %rax @@ -1283,6 +1292,7 @@ ENTRY(paranoid_entry) testl %edx,%edx js 1f /* negative -> in kernel */ SWAPGS + SWITCH_KERNEL_CR3 xorl %ebx,%ebx 1: ret CFI_ENDPROC @@ -1306,6 +1316,7 @@ ENTRY(paranoid_exit) testl %ebx,%ebx /* swapgs needed? */ jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ + SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK jmp paranoid_exit_restore paranoid_exit_no_swapgs: @@ -1332,6 +1343,7 @@ ENTRY(error_entry) je error_kernelspace error_swapgs: SWAPGS + SWITCH_KERNEL_CR3 error_sti: TRACE_IRQS_OFF ret @@ -1460,8 +1472,8 @@ ENTRY(nmi) * We also must not push anything to the stack before switching * stacks lest we corrupt the "NMI executing" variable. */ - SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1501,6 +1513,7 @@ ENTRY(nmi) * work, because we don't want to enable interrupts. Fortunately, * do_nmi doesn't modify pt_regs. */ + SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret
@@ -1710,6 +1723,7 @@ end_repeat_nmi: testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: + SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_EXTRA_REGS diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index f5d0730e7b08..9b0aef0dce94 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -41,6 +41,7 @@ #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/espfix.h> +#include <asm/kaiser.h>
/* * Note: we only need 6*8 = 48 bytes for the espfix stack, but round @@ -126,6 +127,11 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); +#ifdef CONFIG_KAISER + // add the esp stack pud to the shadow mapping here. + // This can be done directly, because the fixup stack has its own pud + set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); +#endif
/* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 174fa035a09a..0ee14f5bf59c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -441,6 +441,14 @@ early_idt_ripmsg: .balign PAGE_SIZE; \ GLOBAL(name)
+#ifdef CONFIG_KAISER +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -450,7 +458,7 @@ GLOBAL(name) .endr
__INITDATA -NEXT_PAGE(early_level4_pgt) +NEXT_PGD_PAGE(early_level4_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
@@ -460,10 +468,10 @@ NEXT_PAGE(early_dynamic_pgts) .data
#ifndef CONFIG_XEN -NEXT_PAGE(init_level4_pgt) - .fill 512,8,0 +NEXT_PGD_PAGE(init_level4_pgt) + .fill 2*512,8,0 #else -NEXT_PAGE(init_level4_pgt) +NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index cd10a6437264..77b07c72c3e3 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -51,7 +51,7 @@ static struct irqaction irq2 = { .flags = IRQF_NO_THREAD, };
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = { +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED, };
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 971743774248..21e00403877a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -38,7 +38,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { .x86_tss = { .sp0 = TOP_OF_INIT_STACK, #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index d893640d5c68..4c15b80bf7df 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_KAISER) += kaiser.o diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c new file mode 100644 index 000000000000..cf1bb922d467 --- /dev/null +++ b/arch/x86/mm/kaiser.c @@ -0,0 +1,160 @@ + + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/bug.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/mm.h> + +#include <linux/uaccess.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/desc.h> +#ifdef CONFIG_KAISER + +__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/** + * Get the real ppn from a address in kernel mapping. + * @param address The virtual adrress + * @return the physical address + */ +static inline unsigned long get_pa_from_mapping (unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(address); + BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); + + pud = pud_offset(pgd, address); + BUG_ON(pud_none(*pud)); + + if (pud_large(*pud)) { + return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); + } + + pmd = pmd_offset(pud, address); + BUG_ON(pmd_none(*pmd)); + + if (pmd_large(*pmd)) { + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); + } + + pte = pte_offset_kernel(pmd, address); + BUG_ON(pte_none(*pte)); + + return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); +} + +void _kaiser_copy (unsigned long start_addr, unsigned long size, + unsigned long flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long address; + unsigned long end_addr = start_addr + size; + unsigned long target_address; + + for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); + address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + + pgd = native_get_shadow_pgd(pgd_offset_k(address)); + + BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); + BUG_ON(pgd_large(*pgd)); + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) { + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); + } + BUG_ON(pud_large(*pud)); + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); + } + BUG_ON(pmd_large(*pmd)); + + pte = pte_offset_kernel(pmd, address); + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { + BUG_ON(__pa(pte_page(*pte)) != target_address); + } + } +} + +// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping +static inline void __init _kaiser_init(void) +{ + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { + set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); + } +} + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +spinlock_t shadow_table_lock; +void __init kaiser_init(void) +{ + int cpu; + spin_lock_init(&shadow_table_lock); + + spin_lock(&shadow_table_lock); + + _kaiser_init(); + + for_each_possible_cpu(cpu) { + // map the per cpu user variables + _kaiser_copy( + (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), + (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, + __PAGE_KERNEL); + } + + // map the entry/exit text section, which is responsible to switch between user- and kernel mode + _kaiser_copy( + (unsigned long) __entry_text_start, + (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, + __PAGE_KERNEL_RX); + + // the fixed map address of the idt_table + _kaiser_copy( + (unsigned long) idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); + + spin_unlock(&shadow_table_lock); +} + +// add a mapping to the shadow-mapping, and synchronize the mappings +void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +{ + spin_lock(&shadow_table_lock); + _kaiser_copy(addr, size, flags); + spin_unlock(&shadow_table_lock); +} + +extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); +void kaiser_remove_mapping(unsigned long start, unsigned long size) +{ + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); + spin_lock(&shadow_table_lock); + do { + unmap_pud_range(pgd, start, start + size); + } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); + spin_unlock(&shadow_table_lock); +} +#endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1e5a786e75ce..5553b7fbc6cb 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -802,7 +802,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) pud_clear(pud); }
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) { pud_t *pud = pud_offset(pgd, start);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0b97d2c75df3..c847acf67933 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -342,12 +342,38 @@ static inline void _pgd_free(pgd_t *pgd) #else static inline pgd_t *_pgd_alloc(void) { +#ifdef CONFIG_KAISER + // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory + // block. Therefore, we have to allocate at least 3 pages. However, the + // __get_free_pages returns us 4 pages. Hence, we store the base pointer at + // the beginning of the page of our 8kb-aligned memory block in order to + // correctly free it afterwars. + + unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); + + if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) + { + *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; + return (pgd_t *) pages; + } + else + { + *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; + return (pgd_t *) (pages + PAGE_SIZE); + } +#else return (pgd_t *)__get_free_page(PGALLOC_GFP); +#endif }
static inline void _pgd_free(pgd_t *pgd) { +#ifdef CONFIG_KAISER + unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); + free_pages(pages, get_order(4*PAGE_SIZE)); +#else free_page((unsigned long)pgd); +#endif } #endif /* CONFIG_X86_PAE */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 13671bc9a288..88c02f965f2e 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -715,7 +715,16 @@ */ #define PERCPU_INPUT(cacheline) \ VMLINUX_SYMBOL(__per_cpu_start) = .; \ - *(.data..percpu..first) \ + \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \ + *(.data..percpu..first) \ + . = ALIGN(cacheline); \ + *(.data..percpu..user_mapped) \ + *(.data..percpu..user_mapped..shared_aligned) \ + . = ALIGN(PAGE_SIZE); \ + *(.data..percpu..user_mapped..page_aligned) \ + VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \ + \ . = ALIGN(PAGE_SIZE); \ *(.data..percpu..page_aligned) \ . = ALIGN(cacheline); \ diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 57f3a1c550dc..141d0de913a9 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -35,6 +35,12 @@
#endif
+#ifdef CONFIG_KAISER +#define USER_MAPPED_SECTION "..user_mapped" +#else +#define USER_MAPPED_SECTION "" +#endif + /* * Base implementations of per-CPU variable declarations and definitions, where * the section in which the variable is to be placed is provided by the @@ -115,6 +121,12 @@ #define DEFINE_PER_CPU(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "")
+#define DECLARE_PER_CPU_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + +#define DEFINE_PER_CPU_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION) + /* * Declaration/definition used for per-CPU variables that must come first in * the set of variables. @@ -144,6 +156,14 @@ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \ ____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + +#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \ + ____cacheline_aligned_in_smp + #define DECLARE_PER_CPU_ALIGNED(type, name) \ DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \ ____cacheline_aligned @@ -162,6 +182,16 @@ #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \ DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \ __aligned(PAGE_SIZE) +/* + * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode. + */ +#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE) + +#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \ + DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \ + __aligned(PAGE_SIZE)
/* * Declaration/definition used for per-CPU variables that must be read mostly. diff --git a/init/main.c b/init/main.c index 2a89545e0a5d..de1951eaf1cc 100644 --- a/init/main.c +++ b/init/main.c @@ -87,6 +87,9 @@ #include <asm/setup.h> #include <asm/sections.h> #include <asm/cacheflush.h> +#ifdef CONFIG_KAISER +#include <asm/kaiser.h> +#endif
static int kernel_init(void *);
@@ -487,6 +490,9 @@ static void __init mm_init(void) pgtable_init(); vmalloc_init(); ioremap_huge_init(); +#ifdef CONFIG_KAISER + kaiser_init(); +#endif }
asmlinkage __visible void __init start_kernel(void) diff --git a/kernel/fork.c b/kernel/fork.c index edc1916e89ee..efe6f94d8dc3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -167,8 +167,12 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, return page ? page_address(page) : NULL; }
+extern void kaiser_remove_mapping(unsigned long start_addr, unsigned long size); static inline void free_thread_info(struct thread_info *ti) { +#ifdef CONFIG_KAISER + kaiser_remove_mapping((unsigned long)ti, THREAD_SIZE); +#endif free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else @@ -325,6 +329,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) *stackend = STACK_END_MAGIC; /* for overflow detection */ }
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; @@ -345,6 +350,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto free_ti;
tsk->stack = ti; +#ifdef CONFIG_KAISER + kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE, __PAGE_KERNEL); +#endif #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under diff --git a/security/Kconfig b/security/Kconfig index bf4ec46474b6..9da2a6029703 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -30,6 +30,13 @@ config SECURITY model will be used.
If you are unsure how to answer this question, answer N. +config KAISER + bool "Remove the kernel mapping in user mode" + depends on X86_64 + depends on !PARAVIRT + help + This enforces a strict kernel and user space isolation in order to close + hardware side channels on kernel address information.
config SECURITYFS bool "Enable the securityfs filesystem"