This is a note to let you know that I've just added the patch titled
x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant
to the 4.9-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is: x86-mm-kvm-teach-kvm-s-vmx-code-that-cr3-isn-t-a-constant.patch and it can be found in the queue-4.9 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree, please let stable@vger.kernel.org know about it.
From d6e41f1151feeb118eee776c09323aceb4a415d9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Sun, 28 May 2017 10:00:17 -0700
Subject: x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
From: Andy Lutomirski <luto@kernel.org>
commit d6e41f1151feeb118eee776c09323aceb4a415d9 upstream.
When PCID is enabled, CR3's PCID bits can change during context switches, so KVM won't be able to treat CR3 as a per-mm constant any more.
I structured this like the existing CR4 handling. Under ordinary circumstances (PCID disabled, or the current PCID matching the value that's already in the VMCS), we won't do an extra VMCS write, and we'll never do an extra direct CR3 read. The overhead should be minimal.
I disallowed using the new helper in non-atomic context because PCID support will cause CR3 to stop being constant in non-atomic process context.
(Frankly, it also scares me a bit that KVM ever treated CR3 as constant, but it looks like it was okay before.)
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bpetkov@suse.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm@vger.kernel.org
Cc: linux-mm@kvack.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Eduardo Valentin <eduval@amazon.com>
Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/mmu_context.h | 19 +++++++++++++++++++
 arch/x86/kvm/vmx.c | 25 +++++++++++++++++++++----
 2 files changed, 40 insertions(+), 4 deletions(-)
--- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -268,4 +268,23 @@ static inline bool arch_pte_access_permi { return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write); } + +/* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) read_cr3(). + * + * It's intended to be used for code like KVM that sneakily changes CR3 + * and needs to restore it. It needs to be used very carefully. + */ +static inline unsigned long __get_current_cr3_fast(void) +{ + unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); + + /* For now, be very restrictive about when this can be called. */ + VM_WARN_ON(in_nmi() || !in_atomic()); + + VM_BUG_ON(cr3 != read_cr3()); + return cr3; +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -48,6 +48,7 @@ #include <asm/kexec.h> #include <asm/apic.h> #include <asm/irq_remapping.h> +#include <asm/mmu_context.h>
#include "trace.h" #include "pmu.h" @@ -572,6 +573,7 @@ struct vcpu_vmx { int gs_ldt_reload_needed; int fs_reload_needed; u64 msr_host_bndcfgs; + unsigned long vmcs_host_cr3; /* May not match real cr3 */ unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { @@ -4857,10 +4859,19 @@ static void vmx_set_constant_host_state( u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; - unsigned long cr4; + unsigned long cr0, cr3, cr4;
- vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ - vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + cr0 = read_cr0(); + WARN_ON(cr0 & X86_CR0_TS); + vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ + + /* + * Save the most likely value for this task's CR3 in the VMCS. + * We can't use __get_current_cr3_fast() because we're not atomic. + */ + cr3 = read_cr3(); + vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ + vmx->host_state.vmcs_host_cr3 = cr3;
/* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); @@ -8836,7 +8847,7 @@ void vmx_arm_hv_timer(struct kvm_vcpu *v static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr4; + unsigned long debugctlmsr, cr3, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) @@ -8862,6 +8873,12 @@ static void __noclone vmx_vcpu_run(struc if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ cr3 = __get_current_cr3_fast(); + if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) { + vmcs_writel(HOST_CR3, cr3); + vmx->host_state.vmcs_host_cr3 = cr3; + } + cr4 = cr4_read_shadow(); if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { vmcs_writel(HOST_CR4, cr4);
Patches currently in stable-queue which might be from luto@kernel.org are
queue-4.9/x86-mm-refactor-flush_tlb_mm_range-to-merge-local-and-remote-cases.patch
queue-4.9/x86-mm-pass-flush_tlb_info-to-flush_tlb_others-etc.patch
queue-4.9/x86-mm-rework-lazy-tlb-to-track-the-actual-loaded-mm.patch
queue-4.9/x86-mm-kvm-teach-kvm-s-vmx-code-that-cr3-isn-t-a-constant.patch
queue-4.9/x86-mm-use-new-merged-flush-logic-in-arch_tlbbatch_flush.patch
queue-4.9/x86-kvm-vmx-simplify-segment_base.patch
queue-4.9/x86-entry-unwind-create-stack-frames-for-saved-interrupt-registers.patch
queue-4.9/x86-mm-reduce-indentation-in-flush_tlb_func.patch
queue-4.9/x86-mm-remove-the-up-asm-tlbflush.h-code-always-use-the-formerly-smp-code.patch
queue-4.9/x86-mm-reimplement-flush_tlb_page-using-flush_tlb_mm_range.patch
queue-4.9/mm-x86-mm-make-the-batched-unmap-tlb-flush-api-more-generic.patch
queue-4.9/x86-kvm-vmx-defer-tr-reload-after-vm-exit.patch
queue-4.9/x86-mm-change-the-leave_mm-condition-for-local-tlb-flushes.patch
queue-4.9/x86-mm-be-more-consistent-wrt-page_shift-vs-page_size-in-tlb-flush-code.patch