From: Eric Biggers <ebiggers@google.com>

The full pagecache drop at the end of FS_IOC_ENABLE_VERITY is causing
performance problems and is hindering adoption of fsverity. It was
intended to solve a race condition where unverified pages might be left
in the pagecache, but it solves that race only partially.

Since this incomplete solution has too much performance impact to be
worth it, let's remove it for now.
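
For background, a minimal userspace sketch of enabling verity via this
ioctl (using the UAPI from <linux/fsverity.h>; error handling is mostly
omitted, and the optional salt and signature are left unset):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/fsverity.h>

    int main(int argc, char **argv)
    {
            struct fsverity_enable_arg arg = { 0 };
            int fd;

            if (argc != 2)
                    return 2;

            arg.version = 1;
            arg.hash_algorithm = FS_VERITY_HASH_ALG_SHA256;
            arg.block_size = 4096;

            /* The file must be opened read-only; writers are denied. */
            fd = open(argv[1], O_RDONLY);
            if (fd < 0 || ioctl(fd, FS_IOC_ENABLE_VERITY, &arg) != 0) {
                    perror(argv[1]);
                    return 1;
            }
            return 0;
    }

It is at the end of this ioctl's execution that the pagecache drop
removed below used to happen.
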
Fixes: 3fda4c617e84 ("fs-verity: implement FS_IOC_ENABLE_VERITY ioctl")
Cc: stable@vger.kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
fs/verity/enable.c | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index e13db6507b38b..7a0e3a84d370b 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -8,7 +8,6 @@
#include "fsverity_private.h"
#include <linux/mount.h>
-#include <linux/pagemap.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
@@ -367,25 +366,27 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
goto out_drop_write;
err = enable_verity(filp, &arg);
- if (err)
- goto out_allow_write_access;
/*
- * Some pages of the file may have been evicted from pagecache after
- * being used in the Merkle tree construction, then read into pagecache
- * again by another process reading from the file concurrently. Since
- * these pages didn't undergo verification against the file digest which
- * fs-verity now claims to be enforcing, we have to wipe the pagecache
- * to ensure that all future reads are verified.
+ * We no longer drop the inode's pagecache after enabling verity. This
+ * used to be done to try to avoid a race condition where pages could be
+ * evicted after being used in the Merkle tree construction, then
+ * re-instantiated by a concurrent read. Such pages are unverified, and
+ * the backing storage could have filled them with different content, so
+ * they shouldn't be used to fulfill reads once verity is enabled.
+ *
+ * But, dropping the pagecache has a big performance impact, and it
+ * doesn't fully solve the race condition anyway. So for those reasons,
+ * and also because this race condition isn't very important relatively
+ * speaking (especially for small-ish files, where the chance of a page
+ * being used, evicted, *and* re-instantiated all while enabling verity
+ * is quite small), we no longer drop the inode's pagecache.
*/
- filemap_write_and_wait(inode->i_mapping);
- invalidate_inode_pages2(inode->i_mapping);
/*
* allow_write_access() is needed to pair with deny_write_access().
* Regardless, the filesystem won't allow writing to verity files.
*/
-out_allow_write_access:
allow_write_access(filp);
out_drop_write:
mnt_drop_write_file(filp);
base-commit: f959325e6ac3f499450088b8d9c626d1177be160
--
2.39.2
From: Eric Biggers <ebiggers@google.com>

Once all I/O using a blk_crypto_key has completed, filesystems can call
blk_crypto_evict_key(). However, the block layer currently doesn't call
blk_crypto_put_keyslot() until the request is being freed, which happens
after upper layers have been told (via bio_endio()) the I/O has
completed. This causes a race condition where blk_crypto_evict_key()
can see 'slot_refs != 0' without there being an actual bug.
This makes __blk_crypto_evict_key() hit the
'WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)' and return without
doing anything, eventually causing a use-after-free in
blk_crypto_reprogram_all_keys(). (This is a very rare bug and has only
been seen when per-file keys are being used with fscrypt.)
There are two options to fix this: either release the keyslot before
bio_endio() is called on the request's last bio, or make
__blk_crypto_evict_key() ignore slot_refs. Let's go with the first
solution, since it preserves the ability to report bugs (via
WARN_ON_ONCE) where a key is evicted while still in use.
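
Concretely, the problematic ordering looks something like this
(simplified timeline; the exact call paths vary):

    (I/O completion path)             (filesystem, e.g. fscrypt)

    bio_endio() on the request's
    last bio
                                      sees the I/O has completed;
                                      calls blk_crypto_evict_key()
                                      -> slot_refs is still != 0,
                                         so it WARNs and bails out
    request freed;
    blk_crypto_put_keyslot()
    finally drops the reference

With this patch, the keyslot reference is dropped before the final
bio_endio(), so the eviction on the right-hand side finds slot_refs == 0
as expected.
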
Fixes: a892c8d52c02 ("block: Inline encryption support for blk-mq")
Cc: stable@vger.kernel.org
Reviewed-by: Nathan Huckleberry <nhuck@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
block/blk-crypto-internal.h | 25 +++++++++++++++++++++----
block/blk-crypto.c | 24 ++++++++++++------------
block/blk-merge.c | 2 ++
block/blk-mq.c | 15 ++++++++++++++-
4 files changed, 49 insertions(+), 17 deletions(-)
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index a8cdaf26851e1..4f1de2495f0c3 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -65,6 +65,11 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
return rq->crypt_ctx;
}
+static inline bool blk_crypto_rq_has_keyslot(struct request *rq)
+{
+ return rq->crypt_keyslot;
+}
+
blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key,
struct blk_crypto_keyslot **slot_ptr);
@@ -119,6 +124,11 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
return false;
}
+static inline bool blk_crypto_rq_has_keyslot(struct request *rq)
+{
+ return false;
+}
+
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
void __bio_crypt_advance(struct bio *bio, unsigned int bytes);
@@ -153,14 +163,21 @@ static inline bool blk_crypto_bio_prep(struct bio **bio_ptr)
return true;
}
-blk_status_t __blk_crypto_init_request(struct request *rq);
-static inline blk_status_t blk_crypto_init_request(struct request *rq)
+blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq);
+static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq)
{
if (blk_crypto_rq_is_encrypted(rq))
- return __blk_crypto_init_request(rq);
+ return __blk_crypto_rq_get_keyslot(rq);
return BLK_STS_OK;
}
+void __blk_crypto_rq_put_keyslot(struct request *rq);
+static inline void blk_crypto_rq_put_keyslot(struct request *rq)
+{
+ if (blk_crypto_rq_has_keyslot(rq))
+ __blk_crypto_rq_put_keyslot(rq);
+}
+
void __blk_crypto_free_request(struct request *rq);
static inline void blk_crypto_free_request(struct request *rq)
{
@@ -199,7 +216,7 @@ static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq)
{
if (blk_crypto_rq_is_encrypted(rq))
- return blk_crypto_init_request(rq);
+ return blk_crypto_rq_get_keyslot(rq);
return BLK_STS_OK;
}
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 45378586151f7..d0c7feb447e96 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -224,27 +224,27 @@ static bool bio_crypt_check_alignment(struct bio *bio)
return true;
}
-blk_status_t __blk_crypto_init_request(struct request *rq)
+blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq)
{
return blk_crypto_get_keyslot(rq->q->crypto_profile,
rq->crypt_ctx->bc_key,
&rq->crypt_keyslot);
}
-/**
- * __blk_crypto_free_request - Uninitialize the crypto fields of a request.
- *
- * @rq: The request whose crypto fields to uninitialize.
- *
- * Completely uninitializes the crypto fields of a request. If a keyslot has
- * been programmed into some inline encryption hardware, that keyslot is
- * released. The rq->crypt_ctx is also freed.
- */
-void __blk_crypto_free_request(struct request *rq)
+void __blk_crypto_rq_put_keyslot(struct request *rq)
{
blk_crypto_put_keyslot(rq->crypt_keyslot);
+ rq->crypt_keyslot = NULL;
+}
+
+void __blk_crypto_free_request(struct request *rq)
+{
+ /* The keyslot, if one was needed, should have been released earlier. */
+ if (WARN_ON_ONCE(rq->crypt_keyslot))
+ __blk_crypto_rq_put_keyslot(rq);
+
mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
- blk_crypto_rq_set_defaults(rq);
+ rq->crypt_ctx = NULL;
}
/**
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6460abdb24267..65e75efa9bd36 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -867,6 +867,8 @@ static struct request *attempt_merge(struct request_queue *q,
if (!blk_discard_mergable(req))
elv_merge_requests(q, req, next);
+ blk_crypto_rq_put_keyslot(next);
+
/*
* 'next' is going away, so update stats accordingly
*/
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d0cb2ef18fe21..49825538d932d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -840,6 +840,12 @@ static void blk_complete_request(struct request *req)
req->q->integrity.profile->complete_fn(req, total_bytes);
#endif
+ /*
+ * Upper layers may call blk_crypto_evict_key() anytime after the last
+ * bio_endio(). Therefore, the keyslot must be released before that.
+ */
+ blk_crypto_rq_put_keyslot(req);
+
blk_account_io_completion(req, total_bytes);
do {
@@ -905,6 +911,13 @@ bool blk_update_request(struct request *req, blk_status_t error,
req->q->integrity.profile->complete_fn(req, nr_bytes);
#endif
+ /*
+ * Upper layers may call blk_crypto_evict_key() anytime after the last
+ * bio_endio(). Therefore, the keyslot must be released before that.
+ */
+ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
+ __blk_crypto_rq_put_keyslot(req);
+
if (unlikely(error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)) &&
!test_bit(GD_DEAD, &req->q->disk->state)) {
@@ -2967,7 +2980,7 @@ void blk_mq_submit_bio(struct bio *bio)
blk_mq_bio_to_request(rq, bio, nr_segs);
- ret = blk_crypto_init_request(rq);
+ ret = blk_crypto_rq_get_keyslot(rq);
if (ret != BLK_STS_OK) {
bio->bi_status = ret;
bio_endio(bio);
--
2.39.2