From: Paolo Bonzini pbonzini@redhat.com
[ Upstream commit f18b4aebe107d092e384b1ae680b1e1de7a0196d ]
Red Hat's QE team reported a test failure on access_tracking_perf_test:
  Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages
  guest physical test memory offset: 0x3fffbffff000

  Populating memory             : 0.684014577s
  Writing to populated memory   : 0.006230175s
  Reading from populated memory : 0.004557805s
  ==== Test Assertion Failure ====
    lib/kvm_util.c:1411: false
    pid=125806 tid=125809 errno=4 - Interrupted system call
       1  0x0000000000402f7c: addr_gpa2hva at kvm_util.c:1411
       2   (inlined by) addr_gpa2hva at kvm_util.c:1405
       3  0x0000000000401f52: lookup_pfn at access_tracking_perf_test.c:98
       4   (inlined by) mark_vcpu_memory_idle at access_tracking_perf_test.c:152
       5   (inlined by) vcpu_thread_main at access_tracking_perf_test.c:232
       6  0x00007fefe9ff81ce: ?? ??:0
       7  0x00007fefe9c64d82: ?? ??:0
    No vm physical memory at 0xffbffff000
I can easily reproduce it with an Intel(R) Xeon(R) CPU E5-2630 with a 46-bit PA.
It turns out that the address translation for clearing idle page tracking returned a wrong result: addr_gva2gpa()'s last step, which is based on "pte[index[0]].pfn", did the calculation in 40-bit precision, so the high 12 bits were truncated. In the above case the GPA to be returned for GVA 0xc0000000 should be 0x3fffbffff000, but it was truncated to 0xffbffff000 and the subsequent gpa2hva lookup failed.
The width of arithmetic on bit fields wider than 32 bits is implementation defined, and differs between GCC (which uses the bit-field precision) and clang (which uses 64-bit arithmetic), so this is a potential minefield. Remove the bit fields and use manual masking instead.
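For illustration, a minimal standalone sketch of that minefield (not part of the patch; the struct layout is simplified) that reproduces the truncation with a 40-bit PFN field:

#include <stdint.h>
#include <stdio.h>

struct pte {
        uint64_t flags:12;
        uint64_t pfn:40;        /* arithmetic width on this field is implementation defined */
        uint64_t upper:12;
};

int main(void)
{
        struct pte p = { .pfn = 0x3fffbffffULL };       /* PFN of GPA 0x3fffbffff000 */
        unsigned int page_size = 4096;

        /* GCC evaluates the product in the 40-bit field's precision, so the
         * result wraps to 0xffbffff000; clang promotes to 64 bits and keeps
         * the full 0x3fffbffff000, matching the failure described above. */
        uint64_t gpa = p.pfn * page_size;

        /* Doing the arithmetic on a plain 64-bit value sidesteps the issue,
         * which is what the mask-based PTE_GET_PFN() approach achieves. */
        uint64_t fixed = (uint64_t)p.pfn * page_size;

        printf("bit field: 0x%llx\nfull width: 0x%llx\n",
               (unsigned long long)gpa, (unsigned long long)fixed);
        return 0;
}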
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2075036
Reported-by: Nana Liu nanliu@redhat.com
Reviewed-by: Peter Xu peterx@redhat.com
Tested-by: Peter Xu peterx@redhat.com
Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
---
 .../selftests/kvm/include/x86_64/processor.h |  15 ++
 .../selftests/kvm/lib/x86_64/processor.c     | 192 +++++++-----------
 2 files changed, 92 insertions(+), 115 deletions(-)
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index 05e65ca1c30c..23861c8faa61 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -58,6 +58,21 @@
 /* CPUID.0x8000_0001.EDX */
 #define CPUID_GBPAGES		(1ul << 26)
 
+/* Page table bitfield declarations */
+#define PTE_PRESENT_MASK	BIT_ULL(0)
+#define PTE_WRITABLE_MASK	BIT_ULL(1)
+#define PTE_USER_MASK		BIT_ULL(2)
+#define PTE_ACCESSED_MASK	BIT_ULL(5)
+#define PTE_DIRTY_MASK		BIT_ULL(6)
+#define PTE_LARGE_MASK		BIT_ULL(7)
+#define PTE_GLOBAL_MASK		BIT_ULL(8)
+#define PTE_NX_MASK		BIT_ULL(63)
+
+#define PAGE_SHIFT		12
+
+#define PHYSICAL_PAGE_MASK	GENMASK_ULL(51, 12)
+#define PTE_GET_PFN(pte)	(((pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
+
 /* General Registers in 64-Bit Mode */
 struct gpr64_regs {
 	u64 rax;
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index da73b97e1e6d..46057079d8bb 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -19,38 +19,6 @@
 
 vm_vaddr_t exception_handlers;
 
-/* Virtual translation table structure declarations */
-struct pageUpperEntry {
-	uint64_t present:1;
-	uint64_t writable:1;
-	uint64_t user:1;
-	uint64_t write_through:1;
-	uint64_t cache_disable:1;
-	uint64_t accessed:1;
-	uint64_t ignored_06:1;
-	uint64_t page_size:1;
-	uint64_t ignored_11_08:4;
-	uint64_t pfn:40;
-	uint64_t ignored_62_52:11;
-	uint64_t execute_disable:1;
-};
-
-struct pageTableEntry {
-	uint64_t present:1;
-	uint64_t writable:1;
-	uint64_t user:1;
-	uint64_t write_through:1;
-	uint64_t cache_disable:1;
-	uint64_t accessed:1;
-	uint64_t dirty:1;
-	uint64_t reserved_07:1;
-	uint64_t global:1;
-	uint64_t ignored_11_09:3;
-	uint64_t pfn:40;
-	uint64_t ignored_62_52:11;
-	uint64_t execute_disable:1;
-};
-
 void regs_dump(FILE *stream, struct kvm_regs *regs,
 	       uint8_t indent)
 {
@@ -195,23 +163,21 @@ static void *virt_get_pte(struct kvm_vm *vm, uint64_t pt_pfn, uint64_t vaddr,
 	return &page_table[index];
 }
 
-static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
-						    uint64_t pt_pfn,
-						    uint64_t vaddr,
-						    uint64_t paddr,
-						    int level,
-						    enum x86_page_size page_size)
+static uint64_t *virt_create_upper_pte(struct kvm_vm *vm,
+				       uint64_t pt_pfn,
+				       uint64_t vaddr,
+				       uint64_t paddr,
+				       int level,
+				       enum x86_page_size page_size)
 {
-	struct pageUpperEntry *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
-
-	if (!pte->present) {
-		pte->writable = true;
-		pte->present = true;
-		pte->page_size = (level == page_size);
-		if (pte->page_size)
-			pte->pfn = paddr >> vm->page_shift;
+	uint64_t *pte = virt_get_pte(vm, pt_pfn, vaddr, level);
+
+	if (!(*pte & PTE_PRESENT_MASK)) {
+		*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK;
+		if (level == page_size)
+			*pte |= PTE_LARGE_MASK | (paddr & PHYSICAL_PAGE_MASK);
 		else
-			pte->pfn = vm_alloc_page_table(vm) >> vm->page_shift;
+			*pte |= vm_alloc_page_table(vm) & PHYSICAL_PAGE_MASK;
 	} else {
 		/*
 		 * Entry already present. Assert that the caller doesn't want
@@ -221,7 +187,7 @@ static struct pageUpperEntry *virt_create_upper_pte(struct kvm_vm *vm,
 		TEST_ASSERT(level != page_size,
 			    "Cannot create hugepage at level: %u, vaddr: 0x%lx\n",
 			    page_size, vaddr);
-		TEST_ASSERT(!pte->page_size,
+		TEST_ASSERT(!(*pte & PTE_LARGE_MASK),
 			    "Cannot create page table at level: %u, vaddr: 0x%lx\n",
 			    level, vaddr);
 	}
@@ -232,8 +198,8 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 		   enum x86_page_size page_size)
 {
 	const uint64_t pg_size = 1ull << ((page_size * 9) + 12);
-	struct pageUpperEntry *pml4e, *pdpe, *pde;
-	struct pageTableEntry *pte;
+	uint64_t *pml4e, *pdpe, *pde;
+	uint64_t *pte;
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K,
 		    "Unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@@ -257,24 +223,22 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
 	 */
 	pml4e = virt_create_upper_pte(vm, vm->pgd >> vm->page_shift,
				      vaddr, paddr, 3, page_size);
-	if (pml4e->page_size)
+	if (*pml4e & PTE_LARGE_MASK)
 		return;
 
-	pdpe = virt_create_upper_pte(vm, pml4e->pfn, vaddr, paddr, 2, page_size);
-	if (pdpe->page_size)
+	pdpe = virt_create_upper_pte(vm, PTE_GET_PFN(*pml4e), vaddr, paddr, 2, page_size);
+	if (*pdpe & PTE_LARGE_MASK)
 		return;
 
-	pde = virt_create_upper_pte(vm, pdpe->pfn, vaddr, paddr, 1, page_size);
-	if (pde->page_size)
+	pde = virt_create_upper_pte(vm, PTE_GET_PFN(*pdpe), vaddr, paddr, 1, page_size);
+	if (*pde & PTE_LARGE_MASK)
 		return;
 
 	/* Fill in page table entry. */
-	pte = virt_get_pte(vm, pde->pfn, vaddr, 0);
-	TEST_ASSERT(!pte->present,
+	pte = virt_get_pte(vm, PTE_GET_PFN(*pde), vaddr, 0);
+	TEST_ASSERT(!(*pte & PTE_PRESENT_MASK),
 		    "PTE already present for 4k page at vaddr: 0x%lx\n", vaddr);
-	pte->pfn = paddr >> vm->page_shift;
-	pte->writable = true;
-	pte->present = 1;
+	*pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK);
 }
 
 void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
@@ -282,12 +246,12 @@ void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr)
 	__virt_pg_map(vm, vaddr, paddr, X86_PAGE_SIZE_4K);
 }
 
-static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
+static uint64_t *_vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid,
						  uint64_t vaddr)
 {
 	uint16_t index[4];
-	struct pageUpperEntry *pml4e, *pdpe, *pde;
-	struct pageTableEntry *pte;
+	uint64_t *pml4e, *pdpe, *pde;
+	uint64_t *pte;
 	struct kvm_cpuid_entry2 *entry;
 	struct kvm_sregs sregs;
 	int max_phy_addr;
@@ -329,30 +293,29 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
 	index[3] = (vaddr >> 39) & 0x1ffu;
 
 	pml4e = addr_gpa2hva(vm, vm->pgd);
-	TEST_ASSERT(pml4e[index[3]].present,
+	TEST_ASSERT(pml4e[index[3]] & PTE_PRESENT_MASK,
		"Expected pml4e to be present for gva: 0x%08lx", vaddr);
-	TEST_ASSERT((*(uint64_t*)(&pml4e[index[3]]) &
-		(rsvd_mask | (1ull << 7))) == 0,
+	TEST_ASSERT((pml4e[index[3]] & (rsvd_mask | PTE_LARGE_MASK)) == 0,
		"Unexpected reserved bits set.");
 
-	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
-	TEST_ASSERT(pdpe[index[2]].present,
+	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
+	TEST_ASSERT(pdpe[index[2]] & PTE_PRESENT_MASK,
		"Expected pdpe to be present for gva: 0x%08lx", vaddr);
-	TEST_ASSERT(pdpe[index[2]].page_size == 0,
+	TEST_ASSERT(!(pdpe[index[2]] & PTE_LARGE_MASK),
		"Expected pdpe to map a pde not a 1-GByte page.");
-	TEST_ASSERT((*(uint64_t*)(&pdpe[index[2]]) & rsvd_mask) == 0,
+	TEST_ASSERT((pdpe[index[2]] & rsvd_mask) == 0,
		"Unexpected reserved bits set.");
 
-	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
-	TEST_ASSERT(pde[index[1]].present,
+	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
+	TEST_ASSERT(pde[index[1]] & PTE_PRESENT_MASK,
		"Expected pde to be present for gva: 0x%08lx", vaddr);
-	TEST_ASSERT(pde[index[1]].page_size == 0,
+	TEST_ASSERT(!(pde[index[1]] & PTE_LARGE_MASK),
		"Expected pde to map a pte not a 2-MByte page.");
-	TEST_ASSERT((*(uint64_t*)(&pde[index[1]]) & rsvd_mask) == 0,
+	TEST_ASSERT((pde[index[1]] & rsvd_mask) == 0,
		"Unexpected reserved bits set.");
 
-	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
-	TEST_ASSERT(pte[index[0]].present,
+	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
+	TEST_ASSERT(pte[index[0]] & PTE_PRESENT_MASK,
		"Expected pte to be present for gva: 0x%08lx", vaddr);
 
 	return &pte[index[0]];
@@ -360,7 +323,7 @@ static struct pageTableEntry *_vm_get_page_table_entry(struct kvm_vm *vm, int vc
 
 uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
 {
-	struct pageTableEntry *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
+	uint64_t *pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
 
 	return *(uint64_t *)pte;
 }
@@ -368,18 +331,17 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr)
 void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr,
			     uint64_t pte)
 {
-	struct pageTableEntry *new_pte = _vm_get_page_table_entry(vm, vcpuid,
-								  vaddr);
+	uint64_t *new_pte = _vm_get_page_table_entry(vm, vcpuid, vaddr);
 
 	*(uint64_t *)new_pte = pte;
 }
 
 void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 {
-	struct pageUpperEntry *pml4e, *pml4e_start;
-	struct pageUpperEntry *pdpe, *pdpe_start;
-	struct pageUpperEntry *pde, *pde_start;
-	struct pageTableEntry *pte, *pte_start;
+	uint64_t *pml4e, *pml4e_start;
+	uint64_t *pdpe, *pdpe_start;
+	uint64_t *pde, *pde_start;
+	uint64_t *pte, *pte_start;
 
 	if (!vm->pgd_created)
 		return;
@@ -389,58 +351,58 @@ void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
 	fprintf(stream, "%*s      index hvaddr        gpaddr         "
		"addr         w exec dirty\n", indent, "");
-	pml4e_start = (struct pageUpperEntry *) addr_gpa2hva(vm, vm->pgd);
+	pml4e_start = (uint64_t *) addr_gpa2hva(vm, vm->pgd);
 	for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
 		pml4e = &pml4e_start[n1];
-		if (!pml4e->present)
+		if (!(*pml4e & PTE_PRESENT_MASK))
			continue;
-		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
+		fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10llx %u "
			" %u\n",
			indent, "",
			pml4e - pml4e_start, pml4e,
-			addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->pfn,
-			pml4e->writable, pml4e->execute_disable);
+			addr_hva2gpa(vm, pml4e), PTE_GET_PFN(*pml4e),
+			!!(*pml4e & PTE_WRITABLE_MASK), !!(*pml4e & PTE_NX_MASK));
 
-		pdpe_start = addr_gpa2hva(vm, pml4e->pfn * vm->page_size);
+		pdpe_start = addr_gpa2hva(vm, *pml4e & PHYSICAL_PAGE_MASK);
		for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
			pdpe = &pdpe_start[n2];
-			if (!pdpe->present)
+			if (!(*pdpe & PTE_PRESENT_MASK))
				continue;
-			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10lx "
+			fprintf(stream, "%*spdpe  0x%-3zx %p 0x%-12lx 0x%-10llx "
				"%u  %u\n",
				indent, "",
				pdpe - pdpe_start, pdpe,
				addr_hva2gpa(vm, pdpe),
-				(uint64_t) pdpe->pfn, pdpe->writable,
-				pdpe->execute_disable);
+				PTE_GET_PFN(*pdpe), !!(*pdpe & PTE_WRITABLE_MASK),
+				!!(*pdpe & PTE_NX_MASK));
 
-			pde_start = addr_gpa2hva(vm, pdpe->pfn * vm->page_size);
+			pde_start = addr_gpa2hva(vm, *pdpe & PHYSICAL_PAGE_MASK);
			for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
				pde = &pde_start[n3];
-				if (!pde->present)
+				if (!(*pde & PTE_PRESENT_MASK))
					continue;
				fprintf(stream, "%*spde   0x%-3zx %p "
-					"0x%-12lx 0x%-10lx %u  %u\n",
+					"0x%-12lx 0x%-10llx %u  %u\n",
					indent, "", pde - pde_start, pde,
					addr_hva2gpa(vm, pde),
-					(uint64_t) pde->pfn, pde->writable,
-					pde->execute_disable);
+					PTE_GET_PFN(*pde), !!(*pde & PTE_WRITABLE_MASK),
+					!!(*pde & PTE_NX_MASK));
 
-				pte_start = addr_gpa2hva(vm, pde->pfn * vm->page_size);
+				pte_start = addr_gpa2hva(vm, *pde & PHYSICAL_PAGE_MASK);
				for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
					pte = &pte_start[n4];
-					if (!pte->present)
+					if (!(*pte & PTE_PRESENT_MASK))
						continue;
					fprintf(stream, "%*spte   0x%-3zx %p "
-						"0x%-12lx 0x%-10lx %u  %u "
+						"0x%-12lx 0x%-10llx %u  %u "
						"    %u    0x%-10lx\n",
						indent, "",
						pte - pte_start, pte,
						addr_hva2gpa(vm, pte),
-						(uint64_t) pte->pfn,
-						pte->writable,
-						pte->execute_disable,
-						pte->dirty,
+						PTE_GET_PFN(*pte),
+						!!(*pte & PTE_WRITABLE_MASK),
+						!!(*pte & PTE_NX_MASK),
+						!!(*pte & PTE_DIRTY_MASK),
						((uint64_t) n1 << 27)
							| ((uint64_t) n2 << 18)
							| ((uint64_t) n3 << 9)
@@ -558,8 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
 vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 {
 	uint16_t index[4];
-	struct pageUpperEntry *pml4e, *pdpe, *pde;
-	struct pageTableEntry *pte;
+	uint64_t *pml4e, *pdpe, *pde;
+	uint64_t *pte;
 
 	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
		    "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
@@ -572,22 +534,22 @@ vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
 	if (!vm->pgd_created)
		goto unmapped_gva;
 	pml4e = addr_gpa2hva(vm, vm->pgd);
-	if (!pml4e[index[3]].present)
+	if (!(pml4e[index[3]] & PTE_PRESENT_MASK))
		goto unmapped_gva;
 
-	pdpe = addr_gpa2hva(vm, pml4e[index[3]].pfn * vm->page_size);
-	if (!pdpe[index[2]].present)
+	pdpe = addr_gpa2hva(vm, PTE_GET_PFN(pml4e[index[3]]) * vm->page_size);
+	if (!(pdpe[index[2]] & PTE_PRESENT_MASK))
		goto unmapped_gva;
 
-	pde = addr_gpa2hva(vm, pdpe[index[2]].pfn * vm->page_size);
-	if (!pde[index[1]].present)
+	pde = addr_gpa2hva(vm, PTE_GET_PFN(pdpe[index[2]]) * vm->page_size);
+	if (!(pde[index[1]] & PTE_PRESENT_MASK))
		goto unmapped_gva;
 
-	pte = addr_gpa2hva(vm, pde[index[1]].pfn * vm->page_size);
-	if (!pte[index[0]].present)
+	pte = addr_gpa2hva(vm, PTE_GET_PFN(pde[index[1]]) * vm->page_size);
+	if (!(pte[index[0]] & PTE_PRESENT_MASK))
		goto unmapped_gva;
 
-	return (pte[index[0]].pfn * vm->page_size) + (gva & 0xfffu);
+	return (PTE_GET_PFN(pte[index[0]]) * vm->page_size) + (gva & 0xfffu);
 
 unmapped_gva:
 	TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
From: Thomas Huth thuth@redhat.com
[ Upstream commit 266a19a0bc4fbfab4d981a47640ca98972a01865 ]
When compiling kvm_page_table_test.c, I get this compiler warning with gcc 11.2:
kvm_page_table_test.c: In function 'pre_init_before_test':
../../../../tools/include/linux/kernel.h:44:24: warning: comparison of distinct pointer types lacks a cast
   44 |                 (void) (&_max1 == &_max2);              \
      |                        ^~
kvm_page_table_test.c:281:21: note: in expansion of macro 'max'
  281 |                 alignment = max(0x100000, alignment);
      |                             ^~~
Fix it by adjusting the type of the constant so that both max() arguments have the same type.
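For reference, a standalone sketch of why the warning fires, using the same max() shape the warning points at in tools/include/linux/kernel.h (the surrounding program is illustrative, not the selftest):

#include <stdio.h>

/* The (void) (&_max1 == &_max2) comparison exists only to warn when
 * the two arguments have different types. */
#define max(x, y) ({				\
	typeof(x) _max1 = (x);			\
	typeof(y) _max2 = (y);			\
	(void) (&_max1 == &_max2);		\
	_max1 > _max2 ? _max1 : _max2; })

int main(void)
{
	unsigned long alignment = 0x200000UL;

	/* alignment = max(0x100000, alignment);   int vs unsigned long: warns */
	alignment = max(0x100000UL, alignment);    /* same type: no warning */
	printf("0x%lx\n", alignment);
	return 0;
}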
Signed-off-by: Thomas Huth thuth@redhat.com
Reviewed-by: Claudio Imbrenda imbrenda@linux.ibm.com
Message-Id: 20220414103031.565037-1-thuth@redhat.com
Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
---
 tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c
index 36407cb0ec85..f1ddfe4c4a03 100644
--- a/tools/testing/selftests/kvm/kvm_page_table_test.c
+++ b/tools/testing/selftests/kvm/kvm_page_table_test.c
@@ -278,7 +278,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
 	else
		guest_test_phys_mem = p->phys_offset;
 #ifdef __s390x__
-	alignment = max(0x100000, alignment);
+	alignment = max(0x100000UL, alignment);
 #endif
 	guest_test_phys_mem &= ~(alignment - 1);
On 4/27/22 17:54, Sasha Levin wrote:
Acked-by: Paolo Bonzini pbonzini@redhat.com
From: Wanpeng Li wanpengli@tencent.com
[ Upstream commit 0361bdfddca20c8855ea3bdbbbc9c999912b10ff ]
MSR_KVM_POLL_CONTROL is cleared on reset, thus reverting guests to host-side polling after suspend/resume. Non-bootstrap CPUs are restored correctly by the haltpoll driver because they are hot-unplugged during suspend and hot-plugged during resume; the BSP, however, is not hot-pluggable and remains in host-side polling mode after the guest resumes. This makes the guest pay the cost of a vmexit every time it goes idle.
Fix it by recording the BSP's haltpoll state and restoring it during guest resume.
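As a toy model (plain C, not kernel code; names are illustrative), the save/restore logic amounts to:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for MSR_KVM_POLL_CONTROL: bit 0 set means host-side
 * polling (the reset default); the guest haltpoll driver clears it. */
static unsigned long long poll_control = 1;
static bool has_guest_poll;

static void guest_suspend(void)
{
	/* Remember whether the guest had taken over polling... */
	has_guest_poll = !(poll_control & 1);
	/* ...because the MSR reverts to its default across suspend/resume. */
	poll_control = 1;
}

static void guest_resume(void)
{
	/* Re-disable host-side polling if the guest had disabled it. */
	if (has_guest_poll)
		poll_control = 0;
}

int main(void)
{
	poll_control = 0;	/* haltpoll driver enables guest-side polling */
	guest_suspend();
	guest_resume();
	printf("poll_control after resume: %llu\n", poll_control);	/* 0 */
	return 0;
}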
Cc: Marcelo Tosatti mtosatti@redhat.com
Signed-off-by: Wanpeng Li wanpengli@tencent.com
Message-Id: 1650267752-46796-1-git-send-email-wanpengli@tencent.com
Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
---
 arch/x86/kernel/kvm.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index bd7b65081eb0..d36b58e705b6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -66,6 +66,7 @@ static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __align
 DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
 
 static int has_steal_clock = 0;
+static int has_guest_poll = 0;
 
 /*
  * No need for any "IO delay" on KVM
  */
@@ -650,14 +651,26 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
 
 static int kvm_suspend(void)
 {
+	u64 val = 0;
+
 	kvm_guest_cpu_offline(false);
 
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
+		rdmsrl(MSR_KVM_POLL_CONTROL, val);
+	has_guest_poll = !(val & 1);
+#endif
 	return 0;
 }
 
 static void kvm_resume(void)
 {
 	kvm_cpu_online(raw_smp_processor_id());
+
+#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
+	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
+		wrmsrl(MSR_KVM_POLL_CONTROL, 0);
+#endif
 }
 
 static struct syscore_ops kvm_syscore_ops = {
On 4/27/22 17:54, Sasha Levin wrote:
Acked-by: Paolo Bonzini pbonzini@redhat.com
From: Paolo Bonzini pbonzini@redhat.com
[ Upstream commit 614f6970aa70242a3f8a8051b01244c029f77b2a ]
Remove the "shared" argument of for_each_tdp_mmu_root_yield_safe, thus ensuring that readers do not ever acquire a reference to an invalid root. After this patch, all readers except kvm_tdp_mmu_zap_invalidated_roots() treat refcount=0/valid, refcount=0/invalid and refcount=1/invalid in exactly the same way. kvm_tdp_mmu_zap_invalidated_roots() is different but it also does not acquire a reference to the invalid root, and it cannot see refcount=0/invalid because it is guaranteed to run after kvm_tdp_mmu_invalidate_all_roots().
Opportunistically add a lockdep assertion to the yield-safe iterator.
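The "if (...) { } else" construct inside the iterator macro is what lets the assertion piggyback on the existing filter; a minimal standalone sketch of the idiom (with placeholder names, not KVM code) looks like:

#include <assert.h>
#include <stdio.h>

/* Stand-in for kvm_lockdep_assert_mmu_lock_held(): it must return a
 * truthy value so it can be &&-chained into the filter condition and
 * therefore runs on every iteration. */
static int assert_lock_held(void)
{
	assert(1);	/* real code would assert the lock here */
	return 1;
}

/* The "if (skip) { } else" tail means the macro still binds correctly
 * to an unbraced statement supplied by the caller. */
#define for_each_even(_i, _n)						\
	for (_i = 0; _i < (_n); _i++)					\
		if (assert_lock_held() && (_i % 2) != 0) {		\
		} else

int main(void)
{
	int i;

	for_each_even(i, 6)
		printf("%d\n", i);	/* prints 0, 2, 4 */
	return 0;
}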
Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
---
 arch/x86/kvm/mmu/tdp_mmu.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 853780eb033b..7e854313ec3b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -155,14 +155,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
-		if (kvm_mmu_page_as_id(_root) != _as_id) {			\
+		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
+		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else
 
 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
 
-#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)		\
-	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
+#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
+	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
 
 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,		\
@@ -828,7 +829,7 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
 {
 	struct kvm_mmu_page *root;
 
-	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
+	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, false);
On 4/27/22 17:54, Sasha Levin wrote:
Acked-by: Paolo Bonzini pbonzini@redhat.com
On 4/27/22 17:54, Sasha Levin wrote:
Sorry no, this is a NACK.
Paolo
From: Paolo Bonzini pbonzini@redhat.com
[ Upstream commit d22a81b304a27fca6124174a8e842e826c193466 ]
Emulating writes to SELF_IPI with a write to ICR has an unwanted side effect: the value of ICR in the vAPIC page gets changed. The SDM lists SELF_IPI as write-only, with no associated MMIO offset, so any write should have no visible side effect in the vAPIC page.
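A toy model (plain C, not KVM code; names other than APIC_DEST_SELF are illustrative) of the difference between the two emulation strategies:

#include <stdint.h>
#include <stdio.h>

#define APIC_DEST_SELF	0x40000u	/* "self" destination shorthand */

static uint32_t vapic_icr;	/* ICR slot in the emulated vAPIC page */

static void send_ipi(uint32_t icr_value)
{
	(void)icr_value;	/* deliver the interrupt; writes no register */
}

static void self_ipi_via_icr_write(uint32_t vec)
{
	vapic_icr = APIC_DEST_SELF | vec;	/* side effect: ICR changes */
	send_ipi(vapic_icr);
}

static void self_ipi_direct(uint32_t vec)
{
	send_ipi(APIC_DEST_SELF | vec);		/* ICR is left untouched */
}

int main(void)
{
	self_ipi_via_icr_write(0x31);
	printf("ICR after emulating via an ICR write: 0x%x\n", vapic_icr);
	vapic_icr = 0;
	self_ipi_direct(0x31);
	printf("ICR after sending the IPI directly:   0x%x\n", vapic_icr);
	return 0;
}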
Reported-by: Chao Gao chao.gao@intel.com
Reviewed-by: Sean Christopherson seanjc@google.com
Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
---
 arch/x86/kvm/lapic.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4d92fb4fdf69..83d1743a1dd0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2125,10 +2125,9 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
		break;
 
	case APIC_SELF_IPI:
-		if (apic_x2apic_mode(apic)) {
-			kvm_lapic_reg_write(apic, APIC_ICR,
-					    APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
-		} else
+		if (apic_x2apic_mode(apic))
+			kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
+		else
			ret = 1;
		break;
	default:
On 4/27/22 17:54, Sasha Levin wrote:
Acked-by: Paolo Bonzini pbonzini@redhat.com
From: Paolo Bonzini pbonzini@redhat.com
[ Upstream commit 9191b8f0745e63edf519e4a54a4aaae1d3d46fbd ]
WARN and bail if KVM attempts to free a root that isn't backed by a shadow page. KVM allocates a bare page for "special" roots, e.g. when using PAE paging or shadowing 2/3/4-level page tables with 4/5-level, and so root_hpa will be valid but won't be backed by a shadow page. It's all too easy to blindly call mmu_free_root_page() on root_hpa; be nice and WARN instead of crashing KVM and possibly the kernel.
Reviewed-by: Sean Christopherson seanjc@google.com
Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
---
 arch/x86/kvm/mmu/mmu.c | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 34e828badc51..806f9d42bcce 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3314,6 +3314,8 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
		return;
 
	sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+	if (WARN_ON(!sp))
+		return;
 
	if (is_tdp_mmu_page(sp))
		kvm_tdp_mmu_put_root(kvm, sp, false);
On 4/27/22 17:54, Sasha Levin wrote:
Acked-by: Paolo Bonzini pbonzini@redhat.com
From: Wanpeng Li wanpengli@tencent.com
[ Upstream commit 1714a4eb6fb0cb79f182873cd011a8ed60ac65e8 ]
As commit 0c5f81dad46 ("KVM: LAPIC: Inject timer interrupt via posted interrupt") mentioned, the host admin should carefully tune the guest setup so that vCPUs are placed on isolated pCPUs, with several surplus pCPUs left for *busy* housekeeping. In this setup, it is preferable to disable mwait/hlt/pause vmexits to keep the vCPUs in non-root mode.

However, if only some guests are isolated this way and others are not, the non-isolated ones get no benefit from posted timer interrupts, and at the same time lose the VMX preemption timer fast paths, because kvm_can_post_timer_interrupt() returns true and therefore forces kvm_can_use_hv_timer() to false.

By guaranteeing that the posted-interrupt timer is only used when MWAIT or HLT execute without a vmexit, KVM can make the better choice and use the VMX preemption timer and its fast paths.
Reported-by: Aili Yao yaoaili@kingsoft.com
Reviewed-by: Sean Christopherson seanjc@google.com
Cc: Aili Yao yaoaili@kingsoft.com
Cc: Sean Christopherson seanjc@google.com
Signed-off-by: Wanpeng Li wanpengli@tencent.com
Message-Id: 1643112538-36743-1-git-send-email-wanpengli@tencent.com
Signed-off-by: Paolo Bonzini pbonzini@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
---
 arch/x86/kvm/lapic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 83d1743a1dd0..493d636e6231 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -113,7 +113,8 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
 
 static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
 {
-	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+		(kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
 }
 
 bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
On 4/27/22 17:54, Sasha Levin wrote:
Acked-by: Paolo Bonzini pbonzini@redhat.com
On 4/27/22 17:54, Sasha Levin wrote:
Acked-by: Paolo Bonzini pbonzini@redhat.com