The hardware XRSTOR instruction resets the PKRU register to its hardware
init value (namely 0) if the PKRU bit is not set in the xfeatures mask.
Emulating that here restores the pre-5.14 behavior for PTRACE_SET_REGSET
with NT_X86_XSTATE, and makes sigreturn (which still uses XRSTOR) and
ptrace behave identically. KVM has never used XRSTOR and never had this
behavior, so KVM opts-out of this emulation by passing a NULL pkru pointer
to copy_uabi_to_xstate().
Fixes: e84ba47e313d ("x86/fpu: Hook up PKRU into ptrace()")
Signed-off-by: Kyle Huey <me(a)kylehuey.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Borislav Petkov <bp(a)suse.de>
Cc: stable(a)vger.kernel.org # 5.14+
---
arch/x86/kernel/fpu/core.c | 8 ++++++++
arch/x86/kernel/fpu/xstate.c | 15 ++++++++++++++-
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 46b935bc87c8..8d0f6019c21d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -404,6 +404,14 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
if (ustate->xsave.header.xfeatures & ~xcr0)
return -EINVAL;
+ /*
+ * Nullify @vpkru to preserve its current value if PKRU's bit isn't set
+ * in the header. KVM's odd ABI is to leave PKRU untouched in this
+ * case (all other components are eventually re-initialized).
+ */
+ if (!(ustate->xsave.header.xfeatures & XFEATURE_MASK_PKRU))
+ vpkru = NULL;
+
return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
}
EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index bebc30c29ed3..193c6e95daa8 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1219,8 +1219,14 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
* it is harmless.
* 2. When called from ptrace the PKRU register will be restored from the
* thread_struct's pkru field. A pointer to that is passed in @pkru.
+ * The kernel will restore it manually, so the XRSTOR behavior that resets
+ * the PKRU register to the hardware init value (0) if the corresponding
+ * xfeatures bit is not set is emulated here.
* 3. When called from KVM the PKRU register will be restored from the vcpu's
- * pkru field. A pointer to that is passed in @pkru.
+ * pkru field. A pointer to that is passed in @pkru. KVM hasn't used
+ * XRSTOR and hasn't had the PKRU resetting behavior described above. To
+ * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures
+ * bit is not set.
*/
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
const void __user *ubuf, u32 *pkru)
@@ -1277,6 +1283,13 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
*pkru = xpkru->pkru;
+ } else {
+ /*
+ * KVM may pass NULL here to indicate that it does not need
+ * PKRU updated.
+ */
+ if (pkru)
+ *pkru = 0;
}
/*
--
2.38.1
Move KVM's PKRU handling code in fpu_copy_uabi_to_guest_fpstate() to
copy_uabi_to_xstate() so that it is shared with other APIs that write the
XSTATE such as PTRACE_SETREGSET with NT_X86_XSTATE.
This restores the pre-5.14 behavior of ptrace. The regression can be seen
by running gdb and executing `p $pkru`, `set $pkru = 42`, and `p $pkru`.
On affected kernels (5.14+) the write to the PKRU register (which gdb
performs through ptrace) is ignored.
Fixes: e84ba47e313d ("x86/fpu: Hook up PKRU into ptrace()")
Signed-off-by: Kyle Huey <me(a)kylehuey.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Borislav Petkov <bp(a)suse.de>
Cc: stable(a)vger.kernel.org # 5.14+
---
arch/x86/kernel/fpu/core.c | 13 +------------
arch/x86/kernel/fpu/xstate.c | 21 ++++++++++++++++++++-
2 files changed, 21 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 550157686323..46b935bc87c8 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -391,8 +391,6 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
{
struct fpstate *kstate = gfpu->fpstate;
const union fpregs_state *ustate = buf;
- struct pkru_state *xpkru;
- int ret;
if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) {
if (ustate->xsave.header.xfeatures & ~XFEATURE_MASK_FPSSE)
@@ -406,16 +404,7 @@ int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf,
if (ustate->xsave.header.xfeatures & ~xcr0)
return -EINVAL;
- ret = copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
- if (ret)
- return ret;
-
- /* Retrieve PKRU if not in init state */
- if (kstate->regs.xsave.header.xfeatures & XFEATURE_MASK_PKRU) {
- xpkru = get_xsave_addr(&kstate->regs.xsave, XFEATURE_PKRU);
- *vpkru = xpkru->pkru;
- }
- return 0;
+ return copy_uabi_from_kernel_to_xstate(kstate, ustate, vpkru);
}
EXPORT_SYMBOL_GPL(fpu_copy_uabi_to_guest_fpstate);
#endif /* CONFIG_KVM */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 3a6ced76e932..bebc30c29ed3 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1205,10 +1205,22 @@ static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size,
* @fpstate: The fpstate buffer to copy to
* @kbuf: The UABI format buffer, if it comes from the kernel
* @ubuf: The UABI format buffer, if it comes from userspace
- * @pkru: unused
+ * @pkru: The location to write the PKRU value to
*
* Converts from the UABI format into the kernel internal hardware
* dependent format.
+ *
+ * This function ultimately has three different callers with distinct PKRU
+ * behavior.
+ * 1. When called from sigreturn the PKRU register will be restored from
+ * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to
+ * @fpstate is sufficient to cover this case, but the caller will also
+ * pass a pointer to the thread_struct's pkru field in @pkru and updating
+ * it is harmless.
+ * 2. When called from ptrace the PKRU register will be restored from the
+ * thread_struct's pkru field. A pointer to that is passed in @pkru.
+ * 3. When called from KVM the PKRU register will be restored from the vcpu's
+ * pkru field. A pointer to that is passed in @pkru.
*/
static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
const void __user *ubuf, u32 *pkru)
@@ -1260,6 +1272,13 @@ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf,
}
}
+ if (hdr.xfeatures & XFEATURE_MASK_PKRU) {
+ struct pkru_state *xpkru;
+
+ xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU);
+ *pkru = xpkru->pkru;
+ }
+
/*
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
--
2.38.1
Ives van Hoorne from codesandbox.io reported an issue regarding possible
data loss of uffd-wp when applied to memfds on heavily loaded systems. The
sympton is some read page got data mismatch from the snapshot child VMs.
Here I can also reproduce with a Rust reproducer that was provided by Ives
that keeps taking snapshot of a 256MB VM, on a 32G system when I initiate
80 instances I can trigger the issues in ten minutes.
It turns out that we got some pages write-through even if uffd-wp is
applied to the pte.
The problem is, when removing migration entries, we didn't really worry
about write bit as long as we know it's not a write migration entry. That
may not be true, for some memory types (e.g. writable shmem) mk_pte can
return a pte with write bit set, then to recover the migration entry to its
original state we need to explicit wr-protect the pte or it'll has the
write bit set if it's a read migration entry.
For uffd it can cause write-through. I didn't verify, but I think it'll be
the same for mprotect()ed pages and after migration we can miss the sigbus
instead.
The relevant code on uffd was introduced in the anon support, which is
commit f45ec5ff16a7 ("userfaultfd: wp: support swap and page migration",
2020-04-07). However anon shouldn't suffer from this problem because anon
should already have the write bit cleared always, so that may not be a
proper Fixes target. To satisfy the need on the backport, I'm attaching
the Fixes tag to the uffd-wp shmem support. Since no one had issue with
mprotect, so I assume that's also the kernel version we should start to
backport for stable, and we shouldn't need to worry before that.
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
Reported-by: Ives van Hoorne <ives(a)codesandbox.io>
Signed-off-by: Peter Xu <peterx(a)redhat.com>
---
mm/migrate.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index dff333593a8a..8b6351c08c78 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -213,8 +213,14 @@ static bool remove_migration_pte(struct folio *folio,
pte = pte_mkdirty(pte);
if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
- else if (pte_swp_uffd_wp(*pvmw.pte))
+ else
+ /* NOTE: mk_pte can have write bit set */
+ pte = pte_wrprotect(pte);
+
+ if (pte_swp_uffd_wp(*pvmw.pte)) {
+ WARN_ON_ONCE(pte_write(pte));
pte = pte_mkuffd_wp(pte);
+ }
if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
rmap_flags |= RMAP_EXCLUSIVE;
--
2.37.3