Ives van Hoorne from codesandbox.io reported an issue regarding possible
data loss of uffd-wp when applied to memfds on heavily loaded systems. The
sympton is some read page got data mismatch from the snapshot child VMs.
Here I can also reproduce with a Rust reproducer that was provided by Ives
that keeps taking snapshot of a 256MB VM, on a 32G system when I initiate
80 instances I can trigger the issues in ten minutes.
It turns out that we got some pages write-through even if uffd-wp is
applied to the pte.
The problem is, when removing migration entries, we didn't really worry
about write bit as long as we know it's not a write migration entry. That
may not be true, for some memory types (e.g. writable shmem) mk_pte can
return a pte with write bit set, then to recover the migration entry to its
original state we need to explicit wr-protect the pte or it'll has the
write bit set if it's a read migration entry.
For uffd it can cause write-through. I didn't verify, but I think it'll be
the same for mprotect()ed pages and after migration we can miss the sigbus
instead.
The relevant code on uffd was introduced in the anon support, which is
commit f45ec5ff16a7 ("userfaultfd: wp: support swap and page migration",
2020-04-07). However anon shouldn't suffer from this problem because anon
should already have the write bit cleared always, so that may not be a
proper Fixes target. To satisfy the need on the backport, I'm attaching
the Fixes tag to the uffd-wp shmem support. Since no one had issue with
mprotect, so I assume that's also the kernel version we should start to
backport for stable, and we shouldn't need to worry before that.
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: stable(a)vger.kernel.org
Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
Reported-by: Ives van Hoorne <ives(a)codesandbox.io>
Signed-off-by: Peter Xu <peterx(a)redhat.com>
---
mm/migrate.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index dff333593a8a..8b6351c08c78 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -213,8 +213,14 @@ static bool remove_migration_pte(struct folio *folio,
pte = pte_mkdirty(pte);
if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
- else if (pte_swp_uffd_wp(*pvmw.pte))
+ else
+ /* NOTE: mk_pte can have write bit set */
+ pte = pte_wrprotect(pte);
+
+ if (pte_swp_uffd_wp(*pvmw.pte)) {
+ WARN_ON_ONCE(pte_write(pte));
pte = pte_mkuffd_wp(pte);
+ }
if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
rmap_flags |= RMAP_EXCLUSIVE;
--
2.37.3
From: Xiubo Li <xiubli(a)redhat.com>
The request's r_session maybe changed when it was forwarded or
resent. Both the forwarding and resending cases the requests will
be protected by the mdsc->mutex.
Cc: stable(a)vger.kernel.org
URL: https://bugzilla.redhat.com/show_bug.cgi?id=2137955
Signed-off-by: Xiubo Li <xiubli(a)redhat.com>
---
Changed in V5:
- simplify the code by removing the "unlikely(s->s_mds >= max_sessions)" check.
Changed in V4:
- move mdsc->mutex acquisition and max_sessions assignment into "if (req1 || req2)" branch
fs/ceph/caps.c | 48 ++++++++++++------------------------------------
1 file changed, 12 insertions(+), 36 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 894adfb4a092..065e9311b607 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2297,7 +2297,6 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
- unsigned int max_sessions;
int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
@@ -2315,28 +2314,24 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
}
spin_unlock(&ci->i_unsafe_lock);
- /*
- * The mdsc->max_sessions is unlikely to be changed
- * mostly, here we will retry it by reallocating the
- * sessions array memory to get rid of the mdsc->mutex
- * lock.
- */
-retry:
- max_sessions = mdsc->max_sessions;
-
/*
* Trigger to flush the journal logs in all the relevant MDSes
* manually, or in the worst case we must wait at most 5 seconds
* to wait the journal logs to be flushed by the MDSes periodically.
*/
- if ((req1 || req2) && likely(max_sessions)) {
- struct ceph_mds_session **sessions = NULL;
- struct ceph_mds_session *s;
+ if (req1 || req2) {
struct ceph_mds_request *req;
+ struct ceph_mds_session **sessions;
+ struct ceph_mds_session *s;
+ unsigned int max_sessions;
int i;
+ mutex_lock(&mdsc->mutex);
+ max_sessions = mdsc->max_sessions;
+
sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
if (!sessions) {
+ mutex_unlock(&mdsc->mutex);
err = -ENOMEM;
goto out;
}
@@ -2348,16 +2343,6 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
s = req->r_session;
if (!s)
continue;
- if (unlikely(s->s_mds >= max_sessions)) {
- spin_unlock(&ci->i_unsafe_lock);
- for (i = 0; i < max_sessions; i++) {
- s = sessions[i];
- if (s)
- ceph_put_mds_session(s);
- }
- kfree(sessions);
- goto retry;
- }
if (!sessions[s->s_mds]) {
s = ceph_get_mds_session(s);
sessions[s->s_mds] = s;
@@ -2370,16 +2355,6 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
s = req->r_session;
if (!s)
continue;
- if (unlikely(s->s_mds >= max_sessions)) {
- spin_unlock(&ci->i_unsafe_lock);
- for (i = 0; i < max_sessions; i++) {
- s = sessions[i];
- if (s)
- ceph_put_mds_session(s);
- }
- kfree(sessions);
- goto retry;
- }
if (!sessions[s->s_mds]) {
s = ceph_get_mds_session(s);
sessions[s->s_mds] = s;
@@ -2391,11 +2366,12 @@ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
/* the auth MDS */
spin_lock(&ci->i_ceph_lock);
if (ci->i_auth_cap) {
- s = ci->i_auth_cap->session;
- if (!sessions[s->s_mds])
- sessions[s->s_mds] = ceph_get_mds_session(s);
+ s = ci->i_auth_cap->session;
+ if (!sessions[s->s_mds])
+ sessions[s->s_mds] = ceph_get_mds_session(s);
}
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&mdsc->mutex);
/* send flush mdlog request to MDSes */
for (i = 0; i < max_sessions; i++) {
--
2.31.1
Hello,
I saw that the projected EOL of LTS 5.15 is Oct 2023.
How likely is it that the date will be extended? I'm guessing it's
pretty likely, given that Android uses it.
Thanks,
-- Suleiman
From: Eric Biggers <ebiggers(a)google.com>
Currently, registering an algorithm with the crypto API always causes a
notification to be posted to the "cryptomgr", which then creates a
kthread to self-test the algorithm. However, if self-tests are disabled
in the kconfig (as is the default option), then this kthread just
notifies waiters that the algorithm has been tested, then exits.
This causes a significant amount of overhead, especially in the kthread
creation and destruction, which is not necessary at all. For example,
in a quick test I found that booting a "minimum" x86_64 kernel with all
the crypto options enabled (except for the self-tests) takes about 400ms
until PID 1 can start. Of that, a full 13ms is spent just doing this
pointless dance, involving a kthread being created, run, and destroyed
over 200 times. That's over 3% of the entire kernel start time.
Fix this by just skipping the creation of the test larval and the
posting of the registration notification entirely, when self-tests are
disabled. Also compile out the unnecessary code in algboss.c.
While this patch is an optimization and not a "fix" per se, I've marked
it as for stable, due to the large improvement it can make to boot time.
Cc: stable(a)vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers(a)google.com>
---
crypto/algapi.c | 3 ++-
crypto/algboss.c | 11 +++++++----
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/crypto/algapi.c b/crypto/algapi.c
index 5c69ff8e8fa5c..018935cb8417d 100644
--- a/crypto/algapi.c
+++ b/crypto/algapi.c
@@ -226,7 +226,8 @@ static struct crypto_larval *crypto_alloc_test_larval(struct crypto_alg *alg)
{
struct crypto_larval *larval;
- if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER))
+ if (!IS_ENABLED(CONFIG_CRYPTO_MANAGER) ||
+ IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS))
return NULL;
larval = crypto_larval_alloc(alg->cra_name,
diff --git a/crypto/algboss.c b/crypto/algboss.c
index eb5fe84efb83e..e6f0443d08048 100644
--- a/crypto/algboss.c
+++ b/crypto/algboss.c
@@ -171,16 +171,18 @@ static int cryptomgr_schedule_probe(struct crypto_larval *larval)
return NOTIFY_OK;
}
+#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
+static int cryptomgr_schedule_test(struct crypto_alg *alg)
+{
+ return 0;
+}
+#else
static int cryptomgr_test(void *data)
{
struct crypto_test_param *param = data;
u32 type = param->type;
int err = 0;
-#ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
- goto skiptest;
-#endif
-
if (type & CRYPTO_ALG_TESTED)
goto skiptest;
@@ -229,6 +231,7 @@ static int cryptomgr_schedule_test(struct crypto_alg *alg)
err:
return NOTIFY_OK;
}
+#endif /* !CONFIG_CRYPTO_MANAGER_DISABLE_TESTS */
static int cryptomgr_notify(struct notifier_block *this, unsigned long msg,
void *data)
base-commit: f67dd6ce0723ad013395f20a3f79d8a437d3f455
--
2.38.1