From: Sean Christopherson <seanjc@google.com>
commit c5e2184d1544f9e56140791eff1a351bea2e63b9 upstream
Remove the update_pte() shadow paging logic, which was obsoleted by commit 4731d4c7a077 ("KVM: MMU: out of sync shadow core"), but never removed. As pointed out by Yu, KVM never write-protects leaf page tables for the purposes of shadow paging, and instead marks their associated shadow page as unsync so that the guest can write PTEs at will.
The update_pte() path, which predates the unsync logic, optimizes COW scenarios by refreshing leaf SPTEs when they are written, as opposed to zapping the SPTE, restarting the guest, and installing the new SPTE on the subsequent fault. Since KVM no longer write-protects leaf page tables, update_pte() is unreachable and can be dropped.
Reported-by: Yu Zhang <yu.c.zhang@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20210115004051.4099250-1-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[jwang: backport to 5.4 to fix a warning on AMD nested virtualization]
Signed-off-by: Jack Wang <jinpu.wang@ionos.com>
---
We hit the following warning on an AMD Opteron(tm) Processor 6386 SE with kernel 5.4.113, apparently while a nested L2 guest was running:

  WARNING: CPU: 62 PID: 29302 at arch/x86/kvm/mmu.c:2250 nonpaging_update_pte+0x5/0x10 [kvm]

There is a similar bug report at https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1884058.
I tested with kvm-unit-tests on Intel Broadwell/Skylake and AMD Opteron with no regressions, and basic VM tests also work fine on the 5.4 kernel. Commit c5e2184d1544f9e56140791eff1a351bea2e63b9 can be cherry-picked cleanly to kernels 5.10+.
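To make the diff below easier to scan, here is a condensed before/after sketch of the kvm_mmu_pte_write() emulation loop. This is paraphrased from the hunks, not the literal code; role_compatible() is shorthand for the mmu_base_role_mask comparison, not a real kernel helper:

	/* Before (5.4): try to refresh a leaf SPTE in place (the COW fast path). */
	entry = *spte;
	mmu_page_zap_pte(vcpu->kvm, sp, spte);
	if (gentry && role_compatible(sp, vcpu->arch.mmu) && rmap_can_add(vcpu))
		mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); /* -> mmu->update_pte() */

	/* After: always just zap; the next guest fault reinstalls the SPTE.
	 * Only the mmu_pde_zapped stat bookkeeping survives. */
	entry = *spte;
	mmu_page_zap_pte(vcpu->kvm, sp, spte);
	if (gentry && sp->role.level != PG_LEVEL_4K)
		++vcpu->kvm->stat.mmu_pde_zapped;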
 arch/x86/include/asm/kvm_host.h |  3 ---
 arch/x86/kvm/mmu.c              | 33 ++-------------------------------
 arch/x86/kvm/x86.c              |  1 -
 3 files changed, 2 insertions(+), 35 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c52b7073a5ab..4bc476d7fa6c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -391,8 +391,6 @@ struct kvm_mmu {
 	int (*sync_page)(struct kvm_vcpu *vcpu,
 			 struct kvm_mmu_page *sp);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
-	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			   u64 *spte, const void *pte);
 	hpa_t root_hpa;
 	gpa_t root_cr3;
 	union kvm_mmu_role mmu_role;
@@ -944,7 +942,6 @@ struct kvm_arch {
 struct kvm_vm_stat {
 	ulong mmu_shadow_zapped;
 	ulong mmu_pte_write;
-	ulong mmu_pte_updated;
 	ulong mmu_pde_zapped;
 	ulong mmu_flooded;
 	ulong mmu_recycled;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 47c27c6e3842..b9400087141d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2243,13 +2243,6 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
 {
 }
 
-static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
-				 struct kvm_mmu_page *sp, u64 *spte,
-				 const void *pte)
-{
-	WARN_ON(1);
-}
-
 #define KVM_PAGE_ARRAY_NR 16
 
 struct kvm_mmu_pages {
@@ -4356,7 +4349,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
-	context->update_pte = nonpaging_update_pte;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->direct_map = true;
@@ -4935,7 +4927,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->sync_page = paging64_sync_page;
 	context->invlpg = paging64_invlpg;
-	context->update_pte = paging64_update_pte;
 	context->shadow_root_level = level;
 	context->direct_map = false;
 }
@@ -4964,7 +4955,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
-	context->update_pte = paging32_update_pte;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->direct_map = false;
 }
@@ -5039,7 +5029,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->page_fault = tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
-	context->update_pte = nonpaging_update_pte;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
@@ -5172,7 +5161,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 	context->gva_to_gpa = ept_gva_to_gpa;
 	context->sync_page = ept_sync_page;
 	context->invlpg = ept_invlpg;
-	context->update_pte = ept_update_pte;
 	context->root_level = PT64_ROOT_4LEVEL;
 	context->direct_map = false;
 	context->mmu_role.as_u64 = new_role.as_u64;
@@ -5312,19 +5300,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu_page *sp, u64 *spte,
-				  const void *new)
-{
-	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
-		++vcpu->kvm->stat.mmu_pde_zapped;
-		return;
-	}
-
-	++vcpu->kvm->stat.mmu_pte_updated;
-	vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
-}
-
 static bool need_remote_flush(u64 old, u64 new)
 {
 	if (!is_shadow_present_pte(old))
@@ -5490,14 +5465,10 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 		local_flush = true;
 		while (npte--) {
-			u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
-
 			entry = *spte;
 			mmu_page_zap_pte(vcpu->kvm, sp, spte);
-			if (gentry &&
-			      !((sp->role.word ^ base_role)
-				& mmu_base_role_mask.word) && rmap_can_add(vcpu))
-				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+			if (gentry && sp->role.level != PG_LEVEL_4K)
+				++vcpu->kvm->stat.mmu_pde_zapped;
 			if (need_remote_flush(entry, *spte))
 				remote_flush = true;
 			++spte;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 153659e8f403..b19de7d42b46 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -208,7 +208,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "l1d_flush", VCPU_STAT(l1d_flush) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
-	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
 	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
 	{ "mmu_flooded", VM_STAT(mmu_flooded) },
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
On Fri, May 14, 2021 at 01:38:53PM +0200, Jack Wang wrote:
> From: Sean Christopherson <seanjc@google.com>
>
> commit c5e2184d1544f9e56140791eff1a351bea2e63b9 upstream
>
> [...]
Now queued up, thanks.
greg k-h
On Fri, May 14, 2021, Greg KH wrote:
> On Fri, May 14, 2021 at 01:38:53PM +0200, Jack Wang wrote:
> > commit c5e2184d1544f9e56140791eff1a351bea2e63b9 upstream
> >
> > [...]
> >
> > We hit the following warning on an AMD Opteron(tm) Processor 6386 SE with
> > kernel 5.4.113, apparently while a nested L2 guest was running:
> >
> >   WARNING: CPU: 62 PID: 29302 at arch/x86/kvm/mmu.c:2250 nonpaging_update_pte+0x5/0x10 [kvm]
> >
> > [...]
Ah, drat. The WARN fires because update_pte() comes from the current MMU context, and older kernels are missing a check to ensure the current MMU context is "compatible" with the target shadow page's context.
That bug was inadvertently fixed by commit a102a674e423 ("KVM: x86/mmu: Don't drop level/direct from MMU role calculation"), but I didn't tag that for stable because I incorrectly thought that adding a check on the "direct" role was a nop. From that changelog:
  While doing so, stop ignoring "direct". The field is effectively ignored anyways because kvm_mmu_pte_write() is only reached with an indirect mmu and the loop only walks indirect shadow pages, but double checking "direct" literally costs nothing.
Specifically, the "kvm_mmu_pte_write() is only reached with an indirect mmu" part fails to account for the case where the VM has one or more indirect MMUs _and_ a direct MMU (the current MMU context).
All that said, ripping out this code works too, and is preferable since the code is dead for its intended purpose, i.e. reaching the update_pte() code can only be a random, unwanted collision.
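Concretely, the 5.4 code path that trips the WARN, condensed from the hunk being removed (annotations added here, not in the original):

	u32 base_role = vcpu->arch.mmu->mmu_role.base.word; /* current mmu: may be the direct/TDP context */

	if (gentry &&
	      !((sp->role.word ^ base_role)
		& mmu_base_role_mask.word) &&	/* mask ignored "direct" (and "level") */
	    rmap_can_add(vcpu))
		mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
		/* sp is an indirect shadow page, but for a direct mmu
		 * ->update_pte == nonpaging_update_pte, i.e. WARN_ON(1). */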
> Now queued up, thanks.
Belated thumbs up, thanks!
On Mon, May 17, 2021 at 7:13 PM Sean Christopherson <seanjc@google.com> wrote:
> On Fri, May 14, 2021, Greg KH wrote:
> > [...]
>
> Ah, drat. The WARN fires because update_pte() comes from the current MMU
> context, and older kernels are missing a check to ensure the current MMU
> context is "compatible" with the target shadow page's context.
>
> [...]
>
> Belated thumbs up, thanks!
Hi Sean!
Thanks for checking and the detailed explanation.
Regards!
Jack Wang