From: Jack Thomson <jackabt@amazon.com>
This patch series adds arm64 support for the KVM_PRE_FAULT_MEMORY feature, which was previously only available on x86 [1]. This allows us to reduce the number of stage-2 faults during execution, which benefits post-copy migration scenarios, particularly for memory-intensive applications, where we see high latencies due to stage-2 faults.
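For context, here is a minimal sketch of how userspace typically drives the ioctl (not part of this series; vcpu_fd and the gpa/size values are placeholders for the VMM's own state):

#include <errno.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Pre-fault [gpa, gpa + size) on a vCPU before resuming it. */
static int pre_fault_range(int vcpu_fd, __u64 gpa, __u64 size)
{
	struct kvm_pre_fault_memory range;

	memset(&range, 0, sizeof(range));	/* flags/padding must be zero */
	range.gpa = gpa;
	range.size = size;

	/*
	 * KVM advances range.gpa and shrinks range.size as pages are
	 * mapped, so retry on EINTR/EAGAIN until the range is done.
	 */
	while (range.size) {
		if (ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &range) < 0) {
			if (errno == EINTR || errno == EAGAIN)
				continue;
			return -errno;
		}
	}
	return 0;
}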
Patch Overview:
- The first patch adds support for the KVM_PRE_FAULT_MEMORY ioctl on arm64.
- The second patch updates the pre_fault_memory_test to support arm64.
- The last patch extends the pre_fault_memory_test to cover different vm memory backings.
Regarding the additional parameter to `user_mem_abort`, noted in the v3 review: would you like this fixed in this series, or would a follow-up series be OK? I also found a series from Sean which looks to address this [2].
=== Changes Since v3 [3] ===
- Return -EOPNOTSUPP for pKVM. Previously this was not checked.
- When running a nested guest, properly resolve the L2 IPA to the L1 IPA before pre-faulting.
- Refactoring: page_size is now unsigned, and definitions are ordered at the top of the pre-fault function.
Thanks, Marc, for your review.
=== Changes Since v2 [4] ===
- Update the synthesized fault info value. Thanks Suzuki
- Remove the selftests change for unaligned mmap allocations. Thanks Sean
[1]: https://lore.kernel.org/kvm/20240710174031.312055-1-pbonzini@redhat.com
[2]: https://lore.kernel.org/linux-arm-kernel/20250821210042.3451147-1-seanjc@goo...
[3]: https://lore.kernel.org/linux-arm-kernel/20251119154910.97716-1-jackabt.amaz...
[4]: https://lore.kernel.org/linux-arm-kernel/20251013151502.6679-1-jackabt.amazo...
Jack Thomson (3):
  KVM: arm64: Add pre_fault_memory implementation
  KVM: selftests: Enable pre_fault_memory_test for arm64
  KVM: selftests: Add option for different backing in pre-fault tests
 Documentation/virt/kvm/api.rst                |   3 +-
 arch/arm64/kvm/Kconfig                        |   1 +
 arch/arm64/kvm/arm.c                          |   1 +
 arch/arm64/kvm/mmu.c                          |  79 +++++++++++-
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../selftests/kvm/pre_fault_memory_test.c     | 115 ++++++++++++++----
 6 files changed, 169 insertions(+), 31 deletions(-)
base-commit: 3611ca7c12b740e250d83f8bbe3554b740c503b0
From: Jack Thomson <jackabt@amazon.com>
Add kvm_arch_vcpu_pre_fault_memory() for arm64. The implementation hands off the stage-2 faulting logic to either gmem_abort() or user_mem_abort().
Add an optional page_size output parameter to user_mem_abort() to return the VMA page size, which is needed when pre-faulting.
Update the documentation to clarify x86-specific behaviour.
Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 Documentation/virt/kvm/api.rst |  3 +-
 arch/arm64/kvm/Kconfig         |  1 +
 arch/arm64/kvm/arm.c           |  1 +
 arch/arm64/kvm/mmu.c           | 79 ++++++++++++++++++++++++++++++++--
 4 files changed, 79 insertions(+), 5 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 01a3abef8abb..44cfd9e736bb 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6493,7 +6493,8 @@ Errors:
 KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
 for the current vCPU state. KVM maps memory as if the vCPU generated a
 stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
-CoW. However, KVM does not mark any newly created stage-2 PTE as Accessed.
+CoW. However, on x86, KVM does not mark any newly created stage-2 PTE as
+Accessed.
 
 In the case of confidential VM types where there is an initial set up of
 private guest memory before the guest is 'finalized'/measured, this ioctl
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 4f803fd1c99a..6872aaabe16c 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -25,6 +25,7 @@ menuconfig KVM
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select KVM_MMIO
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	select KVM_GENERIC_PRE_FAULT_MEMORY
 	select VIRT_XFER_TO_GUEST_WORK
 	select KVM_VFIO
 	select HAVE_KVM_DIRTY_RING_ACQ_REL
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 4f80da0c0d1d..19bac68f737f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -332,6 +332,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_COUNTER_OFFSET:
 	case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
 	case KVM_CAP_ARM_SEA_TO_USER:
+	case KVM_CAP_PRE_FAULT_MEMORY:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 48d7c372a4cd..499b131f794e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1642,8 +1642,8 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_s2_trans *nested,
-			  struct kvm_memory_slot *memslot, unsigned long hva,
-			  bool fault_is_perm)
+			  struct kvm_memory_slot *memslot, unsigned long *page_size,
+			  unsigned long hva, bool fault_is_perm)
 {
 	int ret = 0;
 	bool topup_memcache;
@@ -1923,6 +1923,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	kvm_release_faultin_page(kvm, page, !!ret, writable);
 	kvm_fault_unlock(kvm);
 
+	if (page_size)
+		*page_size = vma_pagesize;
+
 	/* Mark the page dirty only if the fault is handled successfully */
 	if (writable && !ret)
 		mark_page_dirty_in_slot(kvm, memslot, gfn);
@@ -2196,8 +2199,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
 				 esr_fsc_is_permission_fault(esr));
 	else
-		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
-				     esr_fsc_is_permission_fault(esr));
+		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, NULL,
+				     hva, esr_fsc_is_permission_fault(esr));
 	if (ret == 0)
 		ret = 1;
 out:
@@ -2573,3 +2576,71 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
 
 	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
 }
+
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+				    struct kvm_pre_fault_memory *range)
+{
+	struct kvm_vcpu_fault_info *fault_info = &vcpu->arch.fault;
+	struct kvm_s2_trans nested_trans, *nested = NULL;
+	unsigned long page_size = PAGE_SIZE;
+	struct kvm_memory_slot *memslot;
+	phys_addr_t ipa = range->gpa;
+	phys_addr_t end;
+	hva_t hva;
+	gfn_t gfn;
+	int ret;
+
+	if (vcpu_is_protected(vcpu))
+		return -EOPNOTSUPP;
+
+	/*
+	 * We may prefault on a shadow stage 2 page table if we are
+	 * running a nested guest. In this case, we have to resolve the L2
+	 * IPA to the L1 IPA first, before knowing what kind of memory should
+	 * back the L1 IPA.
+	 *
+	 * If the shadow stage 2 page table walk faults, then we return
+	 * -EFAULT.
+	 */
+	if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu) &&
+	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
+		ret = kvm_walk_nested_s2(vcpu, ipa, &nested_trans);
+		if (ret)
+			return -EFAULT;
+
+		ipa = kvm_s2_trans_output(&nested_trans);
+		nested = &nested_trans;
+	}
+
+	if (ipa >= kvm_phys_size(vcpu->arch.hw_mmu))
+		return -ENOENT;
+
+	/* Generate a synthetic abort for the pre-fault address */
+	fault_info->esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) |
+			      ESR_ELx_FSC_FAULT_L(KVM_PGTABLE_LAST_LEVEL);
+	fault_info->hpfar_el2 = HPFAR_EL2_NS |
+				FIELD_PREP(HPFAR_EL2_FIPA, ipa >> 12);
+
+	gfn = gpa_to_gfn(ipa);
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (!memslot)
+		return -ENOENT;
+
+	if (kvm_slot_has_gmem(memslot)) {
+		/* gmem currently only supports PAGE_SIZE mappings */
+		ret = gmem_abort(vcpu, ipa, nested, memslot, false);
+	} else {
+		hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
+		if (kvm_is_error_hva(hva))
+			return -EFAULT;
+
+		ret = user_mem_abort(vcpu, ipa, nested, memslot, &page_size,
+				     hva, false);
+	}
+
+	if (ret < 0)
+		return ret;
+
+	end = ALIGN_DOWN(range->gpa, page_size) + page_size;
+	return min(range->size, end - range->gpa);
+}
From: Jack Thomson <jackabt@amazon.com>
Enable the pre_fault_memory_test to run on arm64 by making it work with different guest page sizes and testing multiple guest configurations.
Update the exit-reason TEST_ASSERT to compare against UCALL_EXIT_REASON for portability, as arm64 ucalls exit with KVM_EXIT_MMIO while x86 uses KVM_EXIT_IO.
Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 tools/testing/selftests/kvm/Makefile.kvm      |  1 +
 .../selftests/kvm/pre_fault_memory_test.c     | 85 ++++++++++++++-----
 2 files changed, 63 insertions(+), 23 deletions(-)
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index ba5c2b643efa..6d6a74ddad30 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -187,6 +187,7 @@ TEST_GEN_PROGS_arm64 += memslot_perf_test
 TEST_GEN_PROGS_arm64 += mmu_stress_test
 TEST_GEN_PROGS_arm64 += rseq_test
 TEST_GEN_PROGS_arm64 += steal_time
+TEST_GEN_PROGS_arm64 += pre_fault_memory_test
 
 TEST_GEN_PROGS_s390 = $(TEST_GEN_PROGS_COMMON)
 TEST_GEN_PROGS_s390 += s390/memop
diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c
index 93e603d91311..be1a84a6c137 100644
--- a/tools/testing/selftests/kvm/pre_fault_memory_test.c
+++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c
@@ -11,19 +11,29 @@
 #include <kvm_util.h>
 #include <processor.h>
 #include <pthread.h>
+#include <guest_modes.h>
 
 /* Arbitrarily chosen values */
-#define TEST_SIZE	(SZ_2M + PAGE_SIZE)
-#define TEST_NPAGES	(TEST_SIZE / PAGE_SIZE)
+#define TEST_BASE_SIZE	SZ_2M
 #define TEST_SLOT	10
 
-static void guest_code(uint64_t base_gva)
+/* Storage of test info to share with guest code */
+struct test_config {
+	uint64_t page_size;
+	uint64_t test_size;
+	uint64_t test_num_pages;
+};
+
+static struct test_config test_config;
+
+static void guest_code(uint64_t base_gpa)
 {
 	volatile uint64_t val __used;
+	struct test_config *config = &test_config;
 	int i;
 
-	for (i = 0; i < TEST_NPAGES; i++) {
-		uint64_t *src = (uint64_t *)(base_gva + i * PAGE_SIZE);
+	for (i = 0; i < config->test_num_pages; i++) {
+		uint64_t *src = (uint64_t *)(base_gpa + i * config->page_size);
 
 		val = *src;
 	}
@@ -56,7 +66,7 @@ static void *delete_slot_worker(void *__data)
 		cpu_relax();
 
 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, data->gpa,
-				    TEST_SLOT, TEST_NPAGES, data->flags);
+				    TEST_SLOT, test_config.test_num_pages, data->flags);
 
 	return NULL;
 }
@@ -159,22 +169,35 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
 		       KVM_PRE_FAULT_MEMORY, ret, vcpu->vm);
 }
 
-static void __test_pre_fault_memory(unsigned long vm_type, bool private)
+struct test_params {
+	unsigned long vm_type;
+	bool private;
+};
+
+static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 {
 	uint64_t gpa, gva, alignment, guest_page_size;
+	struct test_params *p = arg;
 	const struct vm_shape shape = {
-		.mode = VM_MODE_DEFAULT,
-		.type = vm_type,
+		.mode = guest_mode,
+		.type = p->vm_type,
 	};
 	struct kvm_vcpu *vcpu;
 	struct kvm_run *run;
 	struct kvm_vm *vm;
 	struct ucall uc;
 
+	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(guest_mode));
+
 	vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code);
 
-	alignment = guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
-	gpa = (vm->max_gfn - TEST_NPAGES) * guest_page_size;
+	guest_page_size = vm_guest_mode_params[guest_mode].page_size;
+
+	test_config.page_size = guest_page_size;
+	test_config.test_size = TEST_BASE_SIZE + test_config.page_size;
+	test_config.test_num_pages = vm_calc_num_guest_pages(vm->mode, test_config.test_size);
+
+	gpa = (vm->max_gfn - test_config.test_num_pages) * test_config.page_size;
 #ifdef __s390x__
 	alignment = max(0x100000UL, guest_page_size);
 #else
@@ -183,23 +206,32 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private)
 	gpa = align_down(gpa, alignment);
 	gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1);
 
-	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, TEST_SLOT,
-				    TEST_NPAGES, private ? KVM_MEM_GUEST_MEMFD : 0);
-	virt_map(vm, gva, gpa, TEST_NPAGES);
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    gpa, TEST_SLOT, test_config.test_num_pages,
+				    p->private ? KVM_MEM_GUEST_MEMFD : 0);
+	virt_map(vm, gva, gpa, test_config.test_num_pages);
+
+	if (p->private)
+		vm_mem_set_private(vm, gpa, test_config.test_size);
+
+	pre_fault_memory(vcpu, gpa, 0, TEST_BASE_SIZE, 0, p->private);
+	/* Test pre-faulting over an already faulted range */
+	pre_fault_memory(vcpu, gpa, 0, TEST_BASE_SIZE, 0, p->private);
+	pre_fault_memory(vcpu, gpa, TEST_BASE_SIZE,
+			 test_config.page_size * 2, test_config.page_size, p->private);
+	pre_fault_memory(vcpu, gpa, test_config.test_size,
+			 test_config.page_size, test_config.page_size, p->private);
 
-	if (private)
-		vm_mem_set_private(vm, gpa, TEST_SIZE);
+	vcpu_args_set(vcpu, 1, gva);
 
-	pre_fault_memory(vcpu, gpa, 0, SZ_2M, 0, private);
-	pre_fault_memory(vcpu, gpa, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private);
-	pre_fault_memory(vcpu, gpa, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private);
+	/* Export the shared variables to the guest. */
+	sync_global_to_guest(vm, test_config);
 
-	vcpu_args_set(vcpu, 1, gva);
 	vcpu_run(vcpu);
 
 	run = vcpu->run;
-	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
-		    "Wanted KVM_EXIT_IO, got exit reason: %u (%s)",
+	TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON,
+		    "Wanted %s, got exit reason: %u (%s)",
+		    exit_reason_str(UCALL_EXIT_REASON),
 		    run->exit_reason, exit_reason_str(run->exit_reason));
 
 	switch (get_ucall(vcpu, &uc)) {
@@ -218,18 +250,25 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private)
 
 static void test_pre_fault_memory(unsigned long vm_type, bool private)
 {
+	struct test_params p = {
+		.vm_type = vm_type,
+		.private = private,
+	};
+
 	if (vm_type && !(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(vm_type))) {
 		pr_info("Skipping tests for vm_type 0x%lx\n", vm_type);
 		return;
 	}
 
-	__test_pre_fault_memory(vm_type, private);
+	for_each_guest_mode(__test_pre_fault_memory, &p);
 }
 
 int main(int argc, char *argv[])
 {
 	TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY));
 
+	guest_modes_append_default();
+
 	test_pre_fault_memory(0, false);
 #ifdef __x86_64__
 	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, false);
From: Jack Thomson <jackabt@amazon.com>
Add a -m option to specify different memory backing types for the pre-fault tests (e.g., anonymous, hugetlb), allowing testing of the pre-fault functionality across different memory configurations.
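For example (the accepted backing names come from the selftests' parse_backing_src_type() helper, so the exact spellings below are tree-dependent):

  ./pre_fault_memory_test                      # default: anonymous
  ./pre_fault_memory_test -m anonymous_hugetlb # hugetlb-backed memslot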
Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 .../selftests/kvm/pre_fault_memory_test.c     | 42 +++++++++++++++----
 1 file changed, 33 insertions(+), 9 deletions(-)
diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c
index be1a84a6c137..1a177f89bc43 100644
--- a/tools/testing/selftests/kvm/pre_fault_memory_test.c
+++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c
@@ -172,6 +172,7 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
 struct test_params {
 	unsigned long vm_type;
 	bool private;
+	enum vm_mem_backing_src_type mem_backing_src;
 };
 
 static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
@@ -187,14 +188,19 @@ static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 	struct kvm_vm *vm;
 	struct ucall uc;
 
+	size_t backing_src_pagesz = get_backing_src_pagesz(p->mem_backing_src);
+
 	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(guest_mode));
+	pr_info("Testing memory backing src type: %s\n",
+		vm_mem_backing_src_alias(p->mem_backing_src)->name);
 
 	vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code);
 
 	guest_page_size = vm_guest_mode_params[guest_mode].page_size;
 
 	test_config.page_size = guest_page_size;
-	test_config.test_size = TEST_BASE_SIZE + test_config.page_size;
+	test_config.test_size = align_up(TEST_BASE_SIZE + test_config.page_size,
+					 backing_src_pagesz);
 	test_config.test_num_pages = vm_calc_num_guest_pages(vm->mode, test_config.test_size);
 
 	gpa = (vm->max_gfn - test_config.test_num_pages) * test_config.page_size;
@@ -203,20 +209,23 @@ static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 #else
 	alignment = SZ_2M;
 #endif
+	alignment = max(alignment, backing_src_pagesz);
 	gpa = align_down(gpa, alignment);
 	gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1);
 
-	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+	vm_userspace_mem_region_add(vm, p->mem_backing_src,
 				    gpa, TEST_SLOT, test_config.test_num_pages,
 				    p->private ? KVM_MEM_GUEST_MEMFD : 0);
 	virt_map(vm, gva, gpa, test_config.test_num_pages);
 
 	if (p->private)
 		vm_mem_set_private(vm, gpa, test_config.test_size);
-	pre_fault_memory(vcpu, gpa, 0, TEST_BASE_SIZE, 0, p->private);
+
+	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0, p->private);
 	/* Test pre-faulting over an already faulted range */
-	pre_fault_memory(vcpu, gpa, 0, TEST_BASE_SIZE, 0, p->private);
-	pre_fault_memory(vcpu, gpa, TEST_BASE_SIZE,
+	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0, p->private);
+	pre_fault_memory(vcpu, gpa,
+			 test_config.test_size - test_config.page_size,
 			 test_config.page_size * 2, test_config.page_size, p->private);
 	pre_fault_memory(vcpu, gpa, test_config.test_size,
 			 test_config.page_size, test_config.page_size, p->private);
@@ -248,11 +257,13 @@ static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 	kvm_vm_free(vm);
 }
 
-static void test_pre_fault_memory(unsigned long vm_type, bool private)
+static void test_pre_fault_memory(unsigned long vm_type, enum vm_mem_backing_src_type backing_src,
+				  bool private)
 {
 	struct test_params p = {
 		.vm_type = vm_type,
 		.private = private,
+		.mem_backing_src = backing_src,
 	};
 
 	if (vm_type && !(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(vm_type))) {
@@ -265,14 +276,27 @@ static void test_pre_fault_memory(unsigned long vm_type, bool private)
 
 int main(int argc, char *argv[])
 {
+	enum vm_mem_backing_src_type backing = VM_MEM_SRC_ANONYMOUS;
+	int opt;
+
 	TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY));
 
 	guest_modes_append_default();
 
-	test_pre_fault_memory(0, false);
+	while ((opt = getopt(argc, argv, "m:")) != -1) {
+		switch (opt) {
+		case 'm':
+			backing = parse_backing_src_type(optarg);
+			break;
+		default:
+			break;
+		}
+	}
+
+	test_pre_fault_memory(0, backing, false);
 #ifdef __x86_64__
-	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, false);
-	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, true);
+	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, backing, false);
+	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, backing, true);
 #endif
 	return 0;
 }