[ based on kvm/next ]
Implement guest_memfd allocation and population via the write syscall. This is useful in non-CoCo use cases where the host can access guest memory. Even though the same can also be achieved by mapping the memory into userspace and memcpying from userspace, write provides a more performant option because it does not need to set up page tables and it does not cause a page fault for every page like memcpy would. Note that memcpy cannot be accelerated via MADV_POPULATE_WRITE, because MADV_POPULATE_WRITE relies on GUP and is not supported by guest_memfd.
Populating 512MiB of guest_memfd on an x86 machine: - via memcpy: 436 ms - via write: 202 ms (-54%)
v4: - Switch from implementing the write callback to write_iter - Remove conditional compilation - Rebase to kvm/next
v3: - https://lore.kernel.org/kvm/20250303130838.28812-1-kalyazin@amazon.com - David/Mike D: Only compile support for the write syscall if CONFIG_KVM_GMEM_SHARED_MEM (now gone) is enabled. v2: - https://lore.kernel.org/kvm/20241129123929.64790-1-kalyazin@amazon.com - Switch from an ioctl to the write syscall to implement population
v1: - https://lore.kernel.org/kvm/20241024095429.54052-1-kalyazin@amazon.com
Nikita Kalyazin (2): KVM: guest_memfd: add generic population via write KVM: selftests: update guest_memfd write tests
.../testing/selftests/kvm/guest_memfd_test.c | 85 +++++++++++++++++-- virt/kvm/guest_memfd.c | 64 +++++++++++++- 2 files changed, 142 insertions(+), 7 deletions(-)
base-commit: a6ad54137af92535cfe32e19e5f3bc1bb7dbd383
The write syscall populates guest_memfd with user-supplied data in a generic way, i.e. no vendor-specific preparation is performed. This is intended to be used in non-CoCo setups where guest memory is not hardware-encrypted.
The following behaviour is implemented: - only page-aligned count and offset are allowed - if the memory is already allocated, the call will successfully populate it - if the memory is not allocated, the call will both allocate and populate - if the memory is already populated, the call will not repopulate it
Signed-off-by: Nikita Kalyazin kalyazin@amazon.com --- virt/kvm/guest_memfd.c | 64 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 08a6bc7d25b6..1f6f85edace0 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -379,7 +379,9 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) }
static struct file_operations kvm_gmem_fops = { - .mmap = kvm_gmem_mmap, + .mmap = kvm_gmem_mmap, + .llseek = default_llseek, + .write_iter = generic_perform_write, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, @@ -390,6 +392,63 @@ void kvm_gmem_init(struct module *module) kvm_gmem_fops.owner = module; }
+static int kvm_kmem_gmem_write_begin(const struct kiocb *kiocb, + struct address_space *mapping, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata) +{ + struct file *file = kiocb->ki_filp; + pgoff_t index = pos >> PAGE_SHIFT; + struct folio *folio; + + if (!PAGE_ALIGNED(pos) || len != PAGE_SIZE) + return -EINVAL; + + if (pos + len > i_size_read(file_inode(file))) + return -EINVAL; + + folio = kvm_gmem_get_folio(file_inode(file), index); + if (IS_ERR(folio)) + return -EFAULT; + + if (WARN_ON_ONCE(folio_test_large(folio))) { + folio_unlock(folio); + folio_put(folio); + return -EFAULT; + } + + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); + return -ENOSPC; + } + + *foliop = folio; + return 0; +} + +static int kvm_kmem_gmem_write_end(const struct kiocb *kiocb, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct folio *folio, void *fsdata) +{ + int ret; + + if (copied == len) { + kvm_gmem_mark_prepared(folio); + ret = copied; + } else { + filemap_remove_folio(folio); + ret = 0; + } + + folio_unlock(folio); + folio_put(folio); + + return ret; +} + static int kvm_gmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -442,6 +501,8 @@ static void kvm_gmem_free_folio(struct folio *folio)
static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, + .write_begin = kvm_kmem_gmem_write_begin, + .write_end = kvm_kmem_gmem_write_end, .migrate_folio = kvm_gmem_migrate_folio, .error_remove_folio = kvm_gmem_error_folio, #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE @@ -489,6 +550,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) }
file->f_flags |= O_LARGEFILE; + file->f_mode |= FMODE_LSEEK | FMODE_PWRITE;
inode = file->f_inode; WARN_ON(file->f_mapping != inode->i_mapping);
On 28.08.25 17:31, Kalyazin, Nikita wrote:
write syscall populates guest_memfd with user-supplied data in a generic way, ie no vendor-specific preparation is performed. This is supposed to be used in non-CoCo setups where guest memory is not hardware-encrypted.
The following behaviour is implemented:
- only page-aligned count and offset are allowed
- if the memory is already allocated, the call will successfully populate it
- if the memory is not allocated, the call will both allocate and populate
- if the memory is already populated, the call will not repopulate it
Signed-off-by: Nikita Kalyazin kalyazin@amazon.com
Just noting that checkpatch complains about:
a) Usage of "unsigned" instead of "unsigned int"
b) The From doesn't completely match the SOB: "Kalyazin, Nikita" vs "Nikita Kalyazin"
Hi Nikita,
kernel test robot noticed the following build errors:
[auto build test ERROR on a6ad54137af92535cfe32e19e5f3bc1bb7dbd383]
url: https://github.com/intel-lab-lkp/linux/commits/Kalyazin-Nikita/KVM-guest_mem... base: a6ad54137af92535cfe32e19e5f3bc1bb7dbd383 patch link: https://lore.kernel.org/r/20250828153049.3922-2-kalyazin%40amazon.com patch subject: [PATCH v4 1/2] KVM: guest_memfd: add generic population via write config: x86_64-randconfig-001-20250830 (https://download.01.org/0day-ci/archive/20250831/202508310252.E5uFh1hx-lkp@i...) compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250831/202508310252.E5uFh1hx-lkp@i...)
If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add the following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202508310252.E5uFh1hx-lkp@intel.com/
All errors (new ones prefixed by >>, old ones prefixed by <<):
ERROR: modpost: "filemap_remove_folio" [arch/x86/kvm/kvm.ko] undefined!
This is to reflect that the write syscall is now implemented for guest_memfd.
Signed-off-by: Nikita Kalyazin kalyazin@amazon.com --- .../testing/selftests/kvm/guest_memfd_test.c | 85 +++++++++++++++++-- 1 file changed, 79 insertions(+), 6 deletions(-)
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index b3ca6737f304..7217a3232055 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -24,18 +24,90 @@ #include "test_util.h" #include "ucall_common.h"
-static void test_file_read_write(int fd) +static void test_file_read(int fd) { char buf[64];
TEST_ASSERT(read(fd, buf, sizeof(buf)) < 0, "read on a guest_mem fd should fail"); - TEST_ASSERT(write(fd, buf, sizeof(buf)) < 0, - "write on a guest_mem fd should fail"); TEST_ASSERT(pread(fd, buf, sizeof(buf), 0) < 0, "pread on a guest_mem fd should fail"); - TEST_ASSERT(pwrite(fd, buf, sizeof(buf), 0) < 0, - "pwrite on a guest_mem fd should fail"); +} + +static void test_file_write(int fd, size_t total_size) +{ + size_t page_size = getpagesize(); + void *buf = NULL; + int ret; + + ret = posix_memalign(&buf, page_size, total_size); + TEST_ASSERT_EQ(ret, 0); + + /* Check arguments correctness checks work as expected */ + + ret = pwrite(fd, buf, page_size - 1, 0); + TEST_ASSERT(ret == -1, "write unaligned count on a guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, EINVAL); + + ret = pwrite(fd, buf, page_size, 1); + TEST_ASSERT(ret == -1, "write unaligned offset on a guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, EINVAL); + + ret = pwrite(fd, buf, page_size, total_size); + TEST_ASSERT(ret == -1, "writing past the file size on a guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, EINVAL); + + ret = pwrite(fd, NULL, page_size, 0); + TEST_ASSERT(ret == -1, "supplying a NULL buffer when writing a guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, EFAULT); + + /* Check double population is not allowed */ + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == page_size, "page-aligned write on a guest_mem fd should succeed"); + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == -1, "write on already populated guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, ENOSPC); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + /* Check population is allowed again after punching a hole */ + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == page_size, "page-aligned write on a punched guest_mem fd should succeed"); + + ret = fallocate(fd, 
FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + /* Check population of already allocated memory is allowed */ + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, page_size); + TEST_ASSERT(!ret, "fallocate with aligned offset and size should succeed"); + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == page_size, "write on a preallocated guest_mem fd should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + /* Check population works until an already populated page is encountered */ + + ret = pwrite(fd, buf, total_size, 0); + TEST_ASSERT(ret == total_size, "page-aligned write on a guest_mem fd should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + ret = pwrite(fd, buf, total_size, 0); + TEST_ASSERT(ret == page_size, "write on a guest_mem fd should not overwrite data"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, total_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + + free(buf); }
static void test_mmap_supported(int fd, size_t page_size, size_t total_size) @@ -281,7 +353,8 @@ static void test_guest_memfd(unsigned long vm_type)
fd = vm_create_guest_memfd(vm, total_size, flags);
- test_file_read_write(fd); + test_file_read(fd); + test_file_write(fd, total_size);
if (flags & GUEST_MEMFD_FLAG_MMAP) { test_mmap_supported(fd, page_size, total_size);
On 28.08.25 17:30, Kalyazin, Nikita wrote:
[ based on kvm/next ]
Implement guest_memfd allocation and population via the write syscall. This is useful in non-CoCo use cases where the host can access guest memory. Even though the same can also be achieved via userspace mapping and memcpying from userspace, write provides a more performant option because it does not need to set page tables and it does not cause a page fault for every page like memcpy would. Note that memcpy cannot be accelerated via MADV_POPULATE_WRITE as it is not supported by guest_memfd and relies on GUP.
I also added this patch to the pile of guest_memfd preview patches located at
https://git.kernel.org/pub/scm/linux/kernel/git/david/linux.git/log/?h=guest...
There was only one minor conflict regarding setting file->f_mode.
linux-kselftest-mirror@lists.linaro.org