This is a partial backport of the CR0.WP KVM series[1] to Linux v5.4. It is limited to avoiding MMU root unloading when TDP is enabled, as making CR0.WP a guest-owned bit turned out to be too much of an effort to backport, and the partial backport is already quite effective on its own.
I used 'ssdd 10 50000' from rt-tests[2] as a micro-benchmark, running on a grsecurity L1 VM. The table below shows the results (runtime in seconds, lower is better):
                   TDP     shadow
  Linux v5.4.240   8.87s   56.8s
    + patches      5.84s   55.4s
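That is roughly a 34% runtime reduction for the TDP case (1 - 5.84/8.87 ≈ 0.34), while shadow paging is, as expected, essentially unchanged.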
This kernel version has no module parameter to control the TDP MMU; TDP is always used when EPT/NPT is enabled. The "TDP" column above therefore likely corresponds to what became the "legacy" MMU in newer kernels.
Please consider applying.
Thanks, Mathias
[1] https://lore.kernel.org/kvm/20230322013731.102955-1-minipli@grsecurity.net/
[2] https://git.kernel.org/pub/scm/utils/rt-tests/rt-tests.git
Mathias Krause (2):
  KVM: x86: Do not unload MMU roots when only toggling CR0.WP with TDP enabled
  KVM: x86: Make use of kvm_read_cr*_bits() when testing bits

Paolo Bonzini (1):
  KVM: x86/mmu: Avoid indirect call for get_cr3

 arch/x86/kvm/mmu.c         | 14 +++++++-------
 arch/x86/kvm/mmu.h         | 11 +++++++++++
 arch/x86/kvm/paging_tmpl.h |  2 +-
 arch/x86/kvm/vmx/vmx.c     |  4 ++--
 arch/x86/kvm/x86.c         | 14 +++++++++++++-
 5 files changed, 34 insertions(+), 11 deletions(-)
From: Paolo Bonzini <pbonzini@redhat.com>
[ Upstream commit 2fdcc1b324189b5fb20655baebd40cd82e2bdf0c ]
Most of the time, calls to get_guest_pgd result in calling kvm_read_cr3 (the exception is only nested TDP). Hardcode the default instead of using the get_cr3 function, avoiding a retpoline if retpolines are enabled.
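The resulting helper, shown here in isolation for clarity (it is the same kvm_mmu_get_guest_cr3() added by the hunks below), captures the pattern:

  static inline unsigned long kvm_mmu_get_guest_cr3(struct kvm_vcpu *vcpu,
						    struct kvm_mmu *mmu)
  {
	/* Known default? Call it directly and skip the retpoline thunk. */
	if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_cr3 == get_guest_cr3)
		return kvm_read_cr3(vcpu);

	/* Anything else (i.e. nested TDP) still takes the indirect call. */
	return mmu->get_cr3(vcpu);
  }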
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Mathias Krause <minipli@grsecurity.net>
Link: https://lore.kernel.org/r/20230322013731.102955-2-minipli@grsecurity.net
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Mathias Krause <minipli@grsecurity.net> # backport to v5.4.x
---
 arch/x86/kvm/mmu.c         | 14 +++++++-------
 arch/x86/kvm/mmu.h         | 11 +++++++++++
 arch/x86/kvm/paging_tmpl.h |  2 +-
 arch/x86/kvm/x86.c         |  2 +-
 4 files changed, 20 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 015da62e4ad7..a6efd71a0a6e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3815,7 +3815,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 		vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
 	} else
 		BUG();
-	vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
+	vcpu->arch.mmu->root_cr3 = kvm_mmu_get_guest_cr3(vcpu, vcpu->arch.mmu);
 
 	return 0;
 }
@@ -3827,7 +3827,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	gfn_t root_gfn, root_cr3;
 	int i;
 
-	root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
+	root_cr3 = kvm_mmu_get_guest_cr3(vcpu, vcpu->arch.mmu);
 	root_gfn = root_cr3 >> PAGE_SHIFT;
 
 	if (mmu_check_root(vcpu, root_gfn))
@@ -4191,7 +4191,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
 	arch.gfn = gfn;
 	arch.direct_map = vcpu->arch.mmu->direct_map;
-	arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
+	arch.cr3 = kvm_mmu_get_guest_cr3(vcpu, vcpu->arch.mmu);
 
 	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
 				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
@@ -4453,7 +4453,7 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
 
-static unsigned long get_cr3(struct kvm_vcpu *vcpu)
+unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr3(vcpu);
 }
@@ -5040,7 +5040,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
-	context->get_cr3 = get_cr3;
+	context->get_cr3 = get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 
@@ -5187,7 +5187,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 
 	kvm_init_shadow_mmu(vcpu);
 	context->set_cr3 = kvm_x86_ops->set_cr3;
-	context->get_cr3 = get_cr3;
+	context->get_cr3 = get_guest_cr3;
 	context->get_pdptr = kvm_pdptr_read;
 	context->inject_page_fault = kvm_inject_page_fault;
 }
@@ -5202,7 +5202,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 		return;
 
 	g_context->mmu_role.as_u64 = new_role.as_u64;
-	g_context->get_cr3 = get_cr3;
+	g_context->get_cr3 = get_guest_cr3;
 	g_context->get_pdptr = kvm_pdptr_read;
 	g_context->inject_page_fault = kvm_inject_page_fault;
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index ea9945a05b83..a53b223a245a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -102,6 +102,17 @@ static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
 					      kvm_get_active_pcid(vcpu));
 }
 
+unsigned long get_guest_cr3(struct kvm_vcpu *vcpu);
+
+static inline unsigned long kvm_mmu_get_guest_cr3(struct kvm_vcpu *vcpu,
+						  struct kvm_mmu *mmu)
+{
+	if (IS_ENABLED(CONFIG_RETPOLINE) && mmu->get_cr3 == get_guest_cr3)
+		return kvm_read_cr3(vcpu);
+
+	return mmu->get_cr3(vcpu);
+}
+
 /*
  * Currently, we have two sorts of write-protection, a) the first one
  * write-protects guest page to sync the guest modification, b) another one is
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1a1d2b5e7b35..b61ab1cdeab1 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -315,7 +315,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	trace_kvm_mmu_pagetable_walk(addr, access);
 retry_walk:
 	walker->level = mmu->root_level;
-	pte = mmu->get_cr3(vcpu);
+	pte = kvm_mmu_get_guest_cr3(vcpu, mmu);
 	have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
 
 #if PTTYPE == 64
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f5e9590a8f31..f073c56b9301 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10130,7 +10130,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 		return;
 
 	if (!vcpu->arch.mmu->direct_map &&
-	      work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
+	      work->arch.cr3 != kvm_mmu_get_guest_cr3(vcpu, vcpu->arch.mmu))
 		return;
 
 	vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true);
[ Upstream commit 01b31714bd90be2784f7145bf93b7f78f3d081e1 ]
There is no need to unload the MMU roots with TDP enabled when only CR0.WP has changed -- the paging structures are still valid, only the permission bitmap needs to be updated.
One heavy user of toggling CR0.WP is grsecurity's KERNEXEC feature to implement kernel W^X.
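For illustration only -- this is a sketch, not grsecurity's actual code -- the guest-side pattern that generates these exits is a brief CR0.WP toggle around a write to an otherwise read-only kernel mapping. Writes that change host-owned CR0 bits (which WP is, in this kernel) trap to KVM and end up in kvm_set_cr0(), twice per protected write, which is why a cheap CR0.WP-only path matters:

  /* Hypothetical KERNEXEC-style helpers; rdcr0()/wrcr0() are illustrative
   * wrappers around 'mov %cr0', X86_CR0_WP is bit 16 of CR0 (defined in
   * <asm/processor-flags.h> in the kernel).
   */
  static inline unsigned long rdcr0(void)
  {
	unsigned long cr0;

	asm volatile("mov %%cr0, %0" : "=r" (cr0));
	return cr0;
  }

  static inline void wrcr0(unsigned long cr0)
  {
	asm volatile("mov %0, %%cr0" : : "r" (cr0));
  }

  static void write_ro_data(unsigned long *p, unsigned long val)
  {
	unsigned long cr0 = rdcr0();

	wrcr0(cr0 & ~X86_CR0_WP);	/* drop write protection -> VM exit */
	*p = val;			/* write to a read-only mapping */
	wrcr0(cr0);			/* restore CR0.WP -> another VM exit */
  }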
The optimization brings a huge performance gain for this case as the following micro-benchmark running 'ssdd 10 50000' from rt-tests[1] on a grsecurity L1 VM shows (runtime in seconds, lower is better):
                       legacy    TDP      shadow
  kvm-x86/next@d8708b   8.43s    9.45s    70.3s
    +patch              5.39s    5.63s    70.2s
For the legacy MMU this is ~36% faster, for the TDP MMU even ~40% faster. TDP and legacy MMU now also have similar runtimes, which removes the need to disable the TDP MMU for grsecurity.
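(Derived directly from the table above: legacy 1 - 5.39/8.43 ≈ 36%, TDP 1 - 5.63/9.45 ≈ 40%.)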
Shadow MMU sees no measurable difference and is still slow, as expected.
[1] https://git.kernel.org/pub/scm/utils/rt-tests/rt-tests.git
Signed-off-by: Mathias Krause <minipli@grsecurity.net>
Link: https://lore.kernel.org/r/20230322013731.102955-3-minipli@grsecurity.net
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Mathias Krause <minipli@grsecurity.net> # backport to v5.4.x
---
 arch/x86/kvm/x86.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f073c56b9301..2903fd5523bd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -799,6 +799,18 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 
+	/*
+	 * CR0.WP is incorporated into the MMU role, but only for non-nested,
+	 * indirect shadow MMUs.  If TDP is enabled, the MMU's metadata needs
+	 * to be updated, e.g. so that emulating guest translations does the
+	 * right thing, but there's no need to unload the root as CR0.WP
+	 * doesn't affect SPTEs.
+	 */
+	if (tdp_enabled && (cr0 ^ old_cr0) == X86_CR0_WP) {
+		kvm_init_mmu(vcpu, false);
+		return 0;
+	}
+
 	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
 		kvm_clear_async_pf_completion_queue(vcpu);
 		kvm_async_pf_hash_reset(vcpu);
[ Upstream commit 74cdc836919bf34684ef66f995273f35e2189daf ]
Make use of the kvm_read_cr{0,4}_bits() helper functions when we only want to know the state of certain bits instead of the whole register.
This not only makes the intent cleaner, it also avoids a potential VMREAD in case the tested bits aren't guest owned.
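For reference, the helper's logic, paraphrased from kvm_cache_regs.h (field and callback names as found in v5.4, to the best of my knowledge), is roughly:

  static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
  {
	ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;

	/*
	 * Only if a requested bit may be guest owned (and thus stale in the
	 * software cache) does CR0 need to be refreshed from hardware, i.e.
	 * a VMREAD on VMX; otherwise the cached vcpu->arch.cr0 suffices.
	 */
	if (tmask & vcpu->arch.cr0_guest_owned_bits)
		kvm_x86_ops->decache_cr0_guest_bits(vcpu);

	return vcpu->arch.cr0 & mask;
  }

Plain kvm_read_cr0() is equivalent to a mask of ~0ul, so it can never skip the refresh when any CR0 bit is guest owned.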
Signed-off-by: Mathias Krause <minipli@grsecurity.net>
Link: https://lore.kernel.org/r/20230322013731.102955-5-minipli@grsecurity.net
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Mathias Krause <minipli@grsecurity.net> # backport to 5.4.x
---
 arch/x86/kvm/vmx/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e6dd6a7e8689..9bbbb201bab5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4970,7 +4970,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 			break;
 		case 3: /* lmsw */
 			val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
-			trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
+			trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val));
 			kvm_lmsw(vcpu, val);
 
 			return kvm_skip_emulated_instruction(vcpu);
@@ -6982,7 +6982,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 		goto exit;
 	}
 
-	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
+	if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
 		ipat = VMX_EPT_IPAT_BIT;
 		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
 			cache = MTRR_TYPE_WRBACK;
On Mon, May 08, 2023, Mathias Krause wrote:
> This is a partial backport of the CR0.WP KVM series[1] to Linux v5.4. It
> is limited to avoiding MMU root unloading when TDP is enabled, as making
> CR0.WP a guest-owned bit turned out to be too much of an effort to
> backport, and the partial backport is already quite effective on its own.
> 
> I used 'ssdd 10 50000' from rt-tests[2] as a micro-benchmark, running on
> a grsecurity L1 VM. The table below shows the results (runtime in
> seconds, lower is better):
> 
>                    TDP     shadow
>   Linux v5.4.240   8.87s   56.8s
>     + patches      5.84s   55.4s
> 
> This kernel version has no module parameter to control the TDP MMU; TDP
> is always used when EPT/NPT is enabled. The "TDP" column above therefore
> likely corresponds to what became the "legacy" MMU in newer kernels.
> 
> Please consider applying.
NAK, same problem as 5.10 and 5.15. Sorry :-(