kvmclock defines few static variables which are shared with hypervisor during the kvmclock initialization.
When SEV is active, memory is encrypted with a guest-specific key, and if guest OS wants to share the memory region with hypervisor then it must clear the C-bit before sharing it.
The '__decrypted' can be used to define a shared variables; the variables will be put in the .data.decryption section. This section is mapped with C=0 early in the boot, we also ensure that the initialized values are updated to match with C=0 (i.e peform an in-place decryption). The .data..decrypted section is PMD aligned and sized so that we avoid the need for spliting the pages when map with C=0.
Signed-off-by: Brijesh Singh brijesh.singh@amd.com Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency") Cc: stable@vger.kernel.org Cc: Tom Lendacky thomas.lendacky@amd.com Cc: kvm@vger.kernel.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Borislav Petkov bp@suse.de Cc: "H. Peter Anvin" hpa@zytor.com Cc: linux-kernel@vger.kernel.org Cc: Paolo Bonzini pbonzini@redhat.com Cc: Sean Christopherson sean.j.christopherson@intel.com Cc: "Radim Krčmář" rkrcmar@redhat.com --- arch/x86/include/asm/mem_encrypt.h | 4 + arch/x86/kernel/head64.c | 12 ++ arch/x86/kernel/vmlinux.lds.S | 18 +++ arch/x86/mm/mem_encrypt_identity.c | 220 +++++++++++++++++++++++++++---------- 4 files changed, 197 insertions(+), 57 deletions(-)
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c064383..3f7d9d3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -52,6 +52,8 @@ void __init mem_encrypt_init(void); bool sme_active(void); bool sev_active(void);
+#define __decrypted __attribute__((__section__(".data..decrypted"))) + #else /* !CONFIG_AMD_MEM_ENCRYPT */
#define sme_me_mask 0ULL @@ -77,6 +79,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+#define __decrypted + #endif /* CONFIG_AMD_MEM_ENCRYPT */
/* diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379..6a18297 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -43,6 +43,9 @@ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
+/* To clear memory encryption mask from the decrypted section */ +extern char __start_data_decrypted[], __end_data_decrypted[]; + #ifdef CONFIG_X86_5LEVEL unsigned int __pgtable_l5_enabled __ro_after_init; unsigned int pgdir_shift __ro_after_init = 39; @@ -112,6 +115,7 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { + unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -234,6 +238,14 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Encrypt the kernel and related (if SME is active) */ sme_encrypt_kernel(bp);
+ /* Clear the memory encryption mask from the decrypted section */ + vaddr = (unsigned long)__start_data_decrypted; + vaddr_end = (unsigned long)__end_data_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + /* * Return the SME encryption mask (if SME is active) to be used as a * modifier for the initial pgdir entry programmed into CR3. diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8bde0a4..511b875 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -89,6 +89,22 @@ PHDRS { note PT_NOTE FLAGS(0); /* ___ */ }
+/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. But we make this section a pmd + * aligned to avoid spliting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define DATA_DECRYPTED_SECTION \ + . = ALIGN(PMD_SIZE); \ + __start_data_decrypted = .; \ + *(.data..decrypted); \ + __end_data_decrypted = .; \ + . = ALIGN(PMD_SIZE); \ + + SECTIONS { #ifdef CONFIG_X86_32 @@ -171,6 +187,8 @@ SECTIONS /* rarely changed data like cpu maps */ READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
+ DATA_DECRYPTED_SECTION + /* End of data section */ _edata = .; } :data diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 7ae3686..ccf6e2b 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -59,6 +59,8 @@ (_PAGE_PAT | _PAGE_PWT))
#define PTE_FLAGS_ENC (PTE_FLAGS | _PAGE_ENC) +#define PTE_FLAGS_ENC_WP ((PTE_FLAGS_ENC & ~_PAGE_CACHE_MASK) | \ + (_PAGE_PAT | _PAGE_PWT))
struct sme_populate_pgd_data { void *pgtable_area; @@ -72,10 +74,28 @@ struct sme_populate_pgd_data { unsigned long vaddr_end; };
+struct sme_workarea_data { + unsigned long kernel_start; + unsigned long kernel_end; + unsigned long kernel_len; + + unsigned long initrd_start; + unsigned long initrd_end; + unsigned long initrd_len; + + unsigned long workarea_start; + unsigned long workarea_end; + unsigned long workarea_len; + + unsigned long decrypted_base; +}; + static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; static char sme_cmdline_off[] __initdata = "off";
+extern char __start_data_decrypted[], __end_data_decrypted[]; + static void __init sme_clear_pgd(struct sme_populate_pgd_data *ppd) { unsigned long pgd_start, pgd_end, pgd_size; @@ -219,6 +239,11 @@ static void __init sme_map_range_encrypted(struct sme_populate_pgd_data *ppd) __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC); }
+static void __init sme_map_range_encrypted_wp(struct sme_populate_pgd_data *ppd) +{ + __sme_map_range(ppd, PMD_FLAGS_ENC, PTE_FLAGS_ENC_WP); +} + static void __init sme_map_range_decrypted(struct sme_populate_pgd_data *ppd) { __sme_map_range(ppd, PMD_FLAGS_DEC, PTE_FLAGS_DEC); @@ -266,19 +291,17 @@ static unsigned long __init sme_pgtable_calc(unsigned long len) return entries + tables; }
-void __init sme_encrypt_kernel(struct boot_params *bp) +static void __init build_workarea_map(struct boot_params *bp, + struct sme_workarea_data *wa, + struct sme_populate_pgd_data *ppd) { unsigned long workarea_start, workarea_end, workarea_len; unsigned long execute_start, execute_end, execute_len; unsigned long kernel_start, kernel_end, kernel_len; unsigned long initrd_start, initrd_end, initrd_len; - struct sme_populate_pgd_data ppd; unsigned long pgtable_area_len; unsigned long decrypted_base;
- if (!sme_active()) - return; - /* * Prepare for encrypting the kernel and initrd by building new * pagetables with the necessary attributes needed to encrypt the @@ -358,17 +381,17 @@ void __init sme_encrypt_kernel(struct boot_params *bp) * pagetables and when the new encrypted and decrypted kernel * mappings are populated. */ - ppd.pgtable_area = (void *)execute_end; + ppd->pgtable_area = (void *)execute_end;
/* * Make sure the current pagetable structure has entries for * addressing the workarea. */ - ppd.pgd = (pgd_t *)native_read_cr3_pa(); - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); + ppd->pgd = (pgd_t *)native_read_cr3_pa(); + ppd->paddr = workarea_start; + ppd->vaddr = workarea_start; + ppd->vaddr_end = workarea_end; + sme_map_range_decrypted(ppd);
/* Flush the TLB - no globals so cr3 is enough */ native_write_cr3(__native_read_cr3()); @@ -379,9 +402,9 @@ void __init sme_encrypt_kernel(struct boot_params *bp) * then be populated with new PUDs and PMDs as the encrypted and * decrypted kernel mappings are created. */ - ppd.pgd = ppd.pgtable_area; - memset(ppd.pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); - ppd.pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD; + ppd->pgd = ppd->pgtable_area; + memset(ppd->pgd, 0, sizeof(pgd_t) * PTRS_PER_PGD); + ppd->pgtable_area += sizeof(pgd_t) * PTRS_PER_PGD;
/* * A different PGD index/entry must be used to get different @@ -399,75 +422,158 @@ void __init sme_encrypt_kernel(struct boot_params *bp) decrypted_base <<= PGDIR_SHIFT;
/* Add encrypted kernel (identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start; - ppd.vaddr_end = kernel_end; - sme_map_range_encrypted(&ppd); + ppd->paddr = kernel_start; + ppd->vaddr = kernel_start; + ppd->vaddr_end = kernel_end; + sme_map_range_encrypted(ppd);
/* Add decrypted, write-protected kernel (non-identity) mappings */ - ppd.paddr = kernel_start; - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); + ppd->paddr = kernel_start; + ppd->vaddr = kernel_start + decrypted_base; + ppd->vaddr_end = kernel_end + decrypted_base; + sme_map_range_decrypted_wp(ppd);
if (initrd_len) { /* Add encrypted initrd (identity) mappings */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start; - ppd.vaddr_end = initrd_end; - sme_map_range_encrypted(&ppd); + ppd->paddr = initrd_start; + ppd->vaddr = initrd_start; + ppd->vaddr_end = initrd_end; + sme_map_range_encrypted(ppd); /* * Add decrypted, write-protected initrd (non-identity) mappings */ - ppd.paddr = initrd_start; - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_map_range_decrypted_wp(&ppd); + ppd->paddr = initrd_start; + ppd->vaddr = initrd_start + decrypted_base; + ppd->vaddr_end = initrd_end + decrypted_base; + sme_map_range_decrypted_wp(ppd); }
- /* Add decrypted workarea mappings to both kernel mappings */ - ppd.paddr = workarea_start; - ppd.vaddr = workarea_start; - ppd.vaddr_end = workarea_end; - sme_map_range_decrypted(&ppd); + /* + * When SEV is active, kernel is already encrypted hence mapping + * the initial workarea_start as encrypted. When SME is active, + * the kernel is not encrypted hence add a decrypted workarea + * mappings to both kernel mappings + */ + ppd->paddr = workarea_start; + ppd->vaddr = workarea_start; + ppd->vaddr_end = workarea_end; + if (sev_active()) + sme_map_range_encrypted(ppd); + else + sme_map_range_decrypted(ppd); + + ppd->paddr = workarea_start; + ppd->vaddr = workarea_start + decrypted_base; + ppd->vaddr_end = workarea_end + decrypted_base; + sme_map_range_decrypted(ppd);
- ppd.paddr = workarea_start; - ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_map_range_decrypted(&ppd); + wa->kernel_start = kernel_start; + wa->kernel_end = kernel_end; + wa->kernel_len = kernel_len;
- /* Perform the encryption */ - sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, - kernel_len, workarea_start, (unsigned long)ppd.pgd); + wa->initrd_start = initrd_start; + wa->initrd_end = initrd_end; + wa->initrd_len = initrd_len;
- if (initrd_len) - sme_encrypt_execute(initrd_start, initrd_start + decrypted_base, - initrd_len, workarea_start, - (unsigned long)ppd.pgd); + wa->workarea_start = workarea_start; + wa->workarea_end = workarea_end; + wa->workarea_len = workarea_len;
+ wa->decrypted_base = decrypted_base; +} + +static void __init remove_workarea_map(struct sme_workarea_data *wa, + struct sme_populate_pgd_data *ppd) +{ /* * At this point we are running encrypted. Remove the mappings for * the decrypted areas - all that is needed for this is to remove * the PGD entry/entries. */ - ppd.vaddr = kernel_start + decrypted_base; - ppd.vaddr_end = kernel_end + decrypted_base; - sme_clear_pgd(&ppd); - - if (initrd_len) { - ppd.vaddr = initrd_start + decrypted_base; - ppd.vaddr_end = initrd_end + decrypted_base; - sme_clear_pgd(&ppd); + ppd->vaddr = wa->kernel_start + wa->decrypted_base; + ppd->vaddr_end = wa->kernel_end + wa->decrypted_base; + sme_clear_pgd(ppd); + + if (wa->initrd_len) { + ppd->vaddr = wa->initrd_start + wa->decrypted_base; + ppd->vaddr_end = wa->initrd_end + wa->decrypted_base; + sme_clear_pgd(ppd); }
- ppd.vaddr = workarea_start + decrypted_base; - ppd.vaddr_end = workarea_end + decrypted_base; - sme_clear_pgd(&ppd); + ppd->vaddr = wa->workarea_start + wa->decrypted_base; + ppd->vaddr_end = wa->workarea_end + wa->decrypted_base; + sme_clear_pgd(ppd);
/* Flush the TLB - no globals so cr3 is enough */ native_write_cr3(__native_read_cr3()); }
+static void __init decrypt_data_decrypted_section(struct sme_workarea_data *wa, + struct sme_populate_pgd_data *ppd) +{ + unsigned long decrypted_start, decrypted_end, decrypted_len; + + /* Physical addresses of decrypted data section */ + decrypted_start = __pa_symbol(__start_data_decrypted); + decrypted_end = __pa_symbol(__end_data_decrypted); + decrypted_len = decrypted_end - decrypted_start; + + if (!decrypted_len) + return; + + /* Add decrypted mapping for the section (identity) */ + ppd->paddr = decrypted_start; + ppd->vaddr = decrypted_start; + ppd->vaddr_end = decrypted_end; + sme_map_range_decrypted(ppd); + + /* Add encrypted-wp mapping for the section (non-identity) */ + ppd->paddr = decrypted_start; + ppd->vaddr = decrypted_start + wa->decrypted_base; + ppd->vaddr_end = decrypted_end + wa->decrypted_base; + sme_map_range_encrypted_wp(ppd); + + /* Perform in-place decryption */ + sme_encrypt_execute(decrypted_start + wa->decrypted_base, + decrypted_start, + decrypted_len, wa->workarea_start, + (unsigned long)ppd->pgd); + + ppd->vaddr = decrypted_start + wa->decrypted_base; + ppd->vaddr_end = decrypted_end + wa->decrypted_base; + sme_clear_pgd(ppd); +} + +void __init sme_encrypt_kernel(struct boot_params *bp) +{ + struct sme_populate_pgd_data ppd; + struct sme_workarea_data wa; + + if (!mem_encrypt_active()) + return; + + build_workarea_map(bp, &wa, &ppd); + + /* When SEV is active, encrypt kernel and initrd */ + if (sme_active()) { + sme_encrypt_execute(wa.kernel_start, + wa.kernel_start + wa.decrypted_base, + wa.kernel_len, wa.workarea_start, + (unsigned long)ppd.pgd); + + if (wa.initrd_len) + sme_encrypt_execute(wa.initrd_start, + wa.initrd_start + wa.decrypted_base, + wa.initrd_len, wa.workarea_start, + (unsigned long)ppd.pgd); + } + + /* Decrypt the contents of .data..decrypted section */ + decrypt_data_decrypted_section(&wa, &ppd); + + remove_workarea_map(&wa, &ppd); +} + void __init sme_enable(struct boot_params *bp) { const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;