This series is rebased on top of Fuad's v4 for shared mapping of guest_memfd [1].
Changes since v2 [2]: - David/Mike D: Only compile support for the write syscall if CONFIG_KVM_GMEM_SHARED_MEM, introduced in [1], is enabled.
In non-CoCo use cases where the host can access guest memory, guest_memfd can be allocated and populated via the write syscall. Even though the same can also be achieved via userspace mapping and memcpy from userspace, write provides a more performant option because it 1) avoids double initialisation as the kernel does not need to zero pages and 2) does not require setting up page tables.
Nikita
[1] https://lore.kernel.org/kvm/20250218172500.807733-4-tabba@google.com/T/ [2] https://lore.kernel.org/kvm/20241129123929.64790-1-kalyazin@amazon.com/T/
Nikita Kalyazin (2): KVM: guest_memfd: add generic population via write KVM: selftests: update guest_memfd write tests
.../testing/selftests/kvm/guest_memfd_test.c | 85 +++++++++++++++-- virt/kvm/guest_memfd.c | 94 ++++++++++++++++++- 2 files changed, 170 insertions(+), 9 deletions(-)
base-commit: 005f6404708d430abab7fab9b422d0daf6e0c2fe
The write syscall populates guest_memfd with user-supplied data in a generic way, i.e. no vendor-specific preparation is performed. It is intended for non-CoCo setups where guest memory is not hardware-encrypted.
The following behaviour is implemented: - only page-aligned count and offset are allowed - if the memory is already allocated, the call will successfully populate it - if the memory is not allocated, the call will both allocate and populate it - if the memory is already populated, the call will not repopulate it: it fails with ENOSPC, or returns a short count if earlier pages in the same call were populated
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com> --- virt/kvm/guest_memfd.c | 94 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 3 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 30b47ff0e6d2..f93fe5835173 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -417,12 +417,97 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0; } -#else -#define kvm_gmem_mmap NULL + +static ssize_t kvm_kmem_gmem_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset) +{ + pgoff_t start, end, index; + ssize_t ret = 0; + + if (!PAGE_ALIGNED(*offset) || !PAGE_ALIGNED(count)) + return -EINVAL; + + if (*offset + count > i_size_read(file_inode(file))) + return -EINVAL; + + if (!buf) + return -EINVAL; + + start = *offset >> PAGE_SHIFT; + end = (*offset + count) >> PAGE_SHIFT; + + filemap_invalidate_lock_shared(file->f_mapping); + + for (index = start; index < end; ) { + struct folio *folio; + void *vaddr; + pgoff_t buf_offset = (index - start) << PAGE_SHIFT; + + if (signal_pending(current)) { + ret = -EINTR; + goto out; + } + + folio = kvm_gmem_get_folio(file_inode(file), index); + if (IS_ERR(folio)) { + ret = -EFAULT; + goto out; + } + + if (folio_test_hwpoison(folio)) { + folio_unlock(folio); + folio_put(folio); + ret = -EFAULT; + goto out; + } + + /* No support for huge pages. */ + if (WARN_ON_ONCE(folio_test_large(folio))) { + folio_unlock(folio); + folio_put(folio); + ret = -EFAULT; + goto out; + } + + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); + ret = -ENOSPC; + goto out; + } + + folio_unlock(folio); + + vaddr = kmap_local_folio(folio, 0); + ret = copy_from_user(vaddr, buf + buf_offset, PAGE_SIZE); + kunmap_local(vaddr); + if (ret) { + ret = -EINVAL; + folio_put(folio); + goto out; + } + + kvm_gmem_mark_prepared(folio); + folio_put(folio); + + index = folio_next_index(folio); + *offset += PAGE_SIZE; + } + +out: + filemap_invalidate_unlock_shared(file->f_mapping); + + return ret && start == (*offset >> PAGE_SHIFT) ? + ret : *offset - (start << PAGE_SHIFT); +} #endif /* CONFIG_KVM_GMEM_SHARED_MEM */
static struct file_operations kvm_gmem_fops = { - .mmap = kvm_gmem_mmap, +#ifdef CONFIG_KVM_GMEM_SHARED_MEM + .mmap = kvm_gmem_mmap, + .llseek = default_llseek, + .write = kvm_kmem_gmem_write, +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */ .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, @@ -538,6 +623,9 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) }
file->f_flags |= O_LARGEFILE; +#ifdef CONFIG_KVM_GMEM_SHARED_MEM + file->f_mode |= FMODE_LSEEK | FMODE_PWRITE; +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
inode = file->f_inode; WARN_ON(file->f_mapping != inode->i_mapping);
Update the guest_memfd selftests to reflect that the write syscall is now implemented for guest_memfd.
Signed-off-by: Nikita Kalyazin <kalyazin@amazon.com> --- .../testing/selftests/kvm/guest_memfd_test.c | 85 +++++++++++++++++-- 1 file changed, 79 insertions(+), 6 deletions(-)
diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 38c501e49e0e..b07221aa54c9 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -20,18 +20,90 @@ #include "kvm_util.h" #include "test_util.h"
-static void test_file_read_write(int fd) +static void test_file_read(int fd) { char buf[64];
TEST_ASSERT(read(fd, buf, sizeof(buf)) < 0, "read on a guest_mem fd should fail"); - TEST_ASSERT(write(fd, buf, sizeof(buf)) < 0, - "write on a guest_mem fd should fail"); TEST_ASSERT(pread(fd, buf, sizeof(buf), 0) < 0, "pread on a guest_mem fd should fail"); - TEST_ASSERT(pwrite(fd, buf, sizeof(buf), 0) < 0, - "pwrite on a guest_mem fd should fail"); +} + +static void test_file_write(int fd, size_t total_size) +{ + size_t page_size = getpagesize(); + void *buf = NULL; + int ret; + + ret = posix_memalign(&buf, page_size, total_size); + TEST_ASSERT_EQ(ret, 0); + + /* Check arguments correctness checks work as expected */ + + ret = pwrite(fd, buf, page_size - 1, 0); + TEST_ASSERT(ret == -1, "write unaligned count on a guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, EINVAL); + + ret = pwrite(fd, buf, page_size, 1); + TEST_ASSERT(ret == -1, "write unaligned offset on a guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, EINVAL); + + ret = pwrite(fd, buf, page_size, total_size); + TEST_ASSERT(ret == -1, "writing past the file size on a guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, EINVAL); + + ret = pwrite(fd, NULL, page_size, 0); + TEST_ASSERT(ret == -1, "supplying a NULL buffer when writing a guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, EINVAL); + + /* Check double population is not allowed */ + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == page_size, "page-aligned write on a guest_mem fd should succeed"); + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == -1, "write on already populated guest_mem fd should fail"); + TEST_ASSERT_EQ(errno, ENOSPC); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + /* Check population is allowed again after punching a hole */ + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == page_size, "page-aligned write on a punched guest_mem fd should succeed"); + + ret = fallocate(fd, 
FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + /* Check population of already allocated memory is allowed */ + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, page_size); + TEST_ASSERT(!ret, "fallocate with aligned offset and size should succeed"); + + ret = pwrite(fd, buf, page_size, 0); + TEST_ASSERT(ret == page_size, "write on a preallocated guest_mem fd should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + /* Check population works until an already populated page is encountered */ + + ret = pwrite(fd, buf, total_size, 0); + TEST_ASSERT(ret == total_size, "page-aligned write on a guest_mem fd should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + ret = pwrite(fd, buf, total_size, 0); + TEST_ASSERT(ret == page_size, "write on a guest_mem fd should not overwrite data"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, total_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed"); + + + free(buf); }
static void test_mmap_allowed(int fd, size_t total_size) @@ -233,7 +305,8 @@ void test_vm_type(unsigned long type, bool is_shared)
fd = vm_create_guest_memfd(vm, total_size, 0);
- test_file_read_write(fd); + test_file_read(fd); + test_file_write(fd, total_size);
if (is_shared) test_mmap_allowed(fd, total_size);
linux-kselftest-mirror@lists.linaro.org