The patch below does not apply to the 5.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b5b0ecb736f1ce1e68eb50613c0cfecff10198eb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Thu, 4 Mar 2021 21:02:58 -0700
Subject: [PATCH] io_uring: clear IOCB_WAITQ for non -EIOCBQUEUED return
The callback can only be armed, if we get -EIOCBQUEUED returned. It's
important that we clear the WAITQ bit for other cases, otherwise we can
queue for async retry and filemap will assume that we're armed and
return -EAGAIN instead of just blocking for the IO.
Cc: stable(a)vger.kernel.org # 5.9+
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 044170165402..5762750c666c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3286,6 +3286,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
+ kiocb->ki_flags &= ~IOCB_WAITQ;
} while (ret > 0 && ret < io_size);
done:
kiocb_done(kiocb, ret, issue_flags);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b5b0ecb736f1ce1e68eb50613c0cfecff10198eb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Thu, 4 Mar 2021 21:02:58 -0700
Subject: [PATCH] io_uring: clear IOCB_WAITQ for non -EIOCBQUEUED return
The callback can only be armed, if we get -EIOCBQUEUED returned. It's
important that we clear the WAITQ bit for other cases, otherwise we can
queue for async retry and filemap will assume that we're armed and
return -EAGAIN instead of just blocking for the IO.
Cc: stable(a)vger.kernel.org # 5.9+
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 044170165402..5762750c666c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3286,6 +3286,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EIOCBQUEUED)
return 0;
/* we got some bytes, but not all. retry. */
+ kiocb->ki_flags &= ~IOCB_WAITQ;
} while (ret > 0 && ret < io_size);
done:
kiocb_done(kiocb, ret, issue_flags);
The patch below does not apply to the 5.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From ca0a26511c679a797f86589894a4523db36d833e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Thu, 4 Mar 2021 17:15:48 -0700
Subject: [PATCH] io_uring: don't keep looping for more events if we can't
flush overflow
It doesn't make sense to wait for more events to come in, if we can't
even flush the overflow we already have to the ring. Return -EBUSY for
that condition, just like we do for attempts to submit with overflow
pending.
Cc: stable(a)vger.kernel.org # 5.11
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 76e243c056b5..044170165402 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1451,18 +1451,22 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
return all_flushed;
}
-static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
+static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force,
struct task_struct *tsk,
struct files_struct *files)
{
+ bool ret = true;
+
if (test_bit(0, &ctx->cq_check_overflow)) {
/* iopoll syncs against uring_lock, not completion_lock */
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&ctx->uring_lock);
- __io_cqring_overflow_flush(ctx, force, tsk, files);
+ ret = __io_cqring_overflow_flush(ctx, force, tsk, files);
if (ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&ctx->uring_lock);
}
+
+ return ret;
}
static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
@@ -6883,11 +6887,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
trace_io_uring_cqring_wait(ctx, min_events);
do {
- io_cqring_overflow_flush(ctx, false, NULL, NULL);
+ /* if we can't even flush overflow, don't wait for more */
+ if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) {
+ ret = -EBUSY;
+ break;
+ }
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
finish_wait(&ctx->wait, &iowq.wq);
+ cond_resched();
} while (ret > 0);
restore_saved_sigmask_unless(ret == -EINTR);
The patch below does not apply to the 5.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3e6a0d3c7571ce3ed0d25c5c32543a54a7ebcd75 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Mon, 1 Mar 2021 13:56:00 -0700
Subject: [PATCH] io_uring: fix -EAGAIN retry with IOPOLL
We no longer revert the iovec on -EIOCBQUEUED, see commit ab2125df921d,
and this started causing issues for IOPOLL on devies that run out of
request slots. Turns out what outside of needing a revert for those, we
also had a bug where we didn't properly setup retry inside the submission
path. That could cause re-import of the iovec, if any, and that could lead
to spurious results if the application had those allocated on the stack.
Catch -EAGAIN retry and make the iovec stable for IOPOLL, just like we do
for !IOPOLL retries.
Cc: <stable(a)vger.kernel.org> # 5.9+
Reported-by: Abaci Robot <abaci(a)linux.alibaba.com>
Reported-by: Xiaoguang Wang <xiaoguang.wang(a)linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f060dcc1cc86..361befaae28f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2423,23 +2423,32 @@ static bool io_resubmit_prep(struct io_kiocb *req)
return false;
return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
}
-#endif
-static bool io_rw_reissue(struct io_kiocb *req)
+static bool io_rw_should_reissue(struct io_kiocb *req)
{
-#ifdef CONFIG_BLOCK
umode_t mode = file_inode(req->file)->i_mode;
+ struct io_ring_ctx *ctx = req->ctx;
if (!S_ISBLK(mode) && !S_ISREG(mode))
return false;
- if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
+ if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
+ !(ctx->flags & IORING_SETUP_IOPOLL)))
return false;
/*
* If ref is dying, we might be running poll reap from the exit work.
* Don't attempt to reissue from that path, just let it fail with
* -EAGAIN.
*/
- if (percpu_ref_is_dying(&req->ctx->refs))
+ if (percpu_ref_is_dying(&ctx->refs))
+ return false;
+ return true;
+}
+#endif
+
+static bool io_rw_reissue(struct io_kiocb *req)
+{
+#ifdef CONFIG_BLOCK
+ if (!io_rw_should_reissue(req))
return false;
lockdep_assert_held(&req->ctx->uring_lock);
@@ -2482,6 +2491,19 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+#ifdef CONFIG_BLOCK
+ /* Rewind iter, if we have one. iopoll path resubmits as usual */
+ if (res == -EAGAIN && io_rw_should_reissue(req)) {
+ struct io_async_rw *rw = req->async_data;
+
+ if (rw)
+ iov_iter_revert(&rw->iter,
+ req->result - iov_iter_count(&rw->iter));
+ else if (!io_resubmit_prep(req))
+ res = -EIO;
+ }
+#endif
+
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
@@ -3230,6 +3252,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = io_iter_do_read(req, iter);
if (ret == -EIOCBQUEUED) {
+ if (req->async_data)
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
goto out_free;
} else if (ret == -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
@@ -3361,6 +3385,8 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
goto done;
+ if (ret2 == -EIOCBQUEUED && req->async_data)
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3e6a0d3c7571ce3ed0d25c5c32543a54a7ebcd75 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe(a)kernel.dk>
Date: Mon, 1 Mar 2021 13:56:00 -0700
Subject: [PATCH] io_uring: fix -EAGAIN retry with IOPOLL
We no longer revert the iovec on -EIOCBQUEUED, see commit ab2125df921d,
and this started causing issues for IOPOLL on devies that run out of
request slots. Turns out what outside of needing a revert for those, we
also had a bug where we didn't properly setup retry inside the submission
path. That could cause re-import of the iovec, if any, and that could lead
to spurious results if the application had those allocated on the stack.
Catch -EAGAIN retry and make the iovec stable for IOPOLL, just like we do
for !IOPOLL retries.
Cc: <stable(a)vger.kernel.org> # 5.9+
Reported-by: Abaci Robot <abaci(a)linux.alibaba.com>
Reported-by: Xiaoguang Wang <xiaoguang.wang(a)linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
diff --git a/fs/io_uring.c b/fs/io_uring.c
index f060dcc1cc86..361befaae28f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2423,23 +2423,32 @@ static bool io_resubmit_prep(struct io_kiocb *req)
return false;
return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
}
-#endif
-static bool io_rw_reissue(struct io_kiocb *req)
+static bool io_rw_should_reissue(struct io_kiocb *req)
{
-#ifdef CONFIG_BLOCK
umode_t mode = file_inode(req->file)->i_mode;
+ struct io_ring_ctx *ctx = req->ctx;
if (!S_ISBLK(mode) && !S_ISREG(mode))
return false;
- if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker())
+ if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
+ !(ctx->flags & IORING_SETUP_IOPOLL)))
return false;
/*
* If ref is dying, we might be running poll reap from the exit work.
* Don't attempt to reissue from that path, just let it fail with
* -EAGAIN.
*/
- if (percpu_ref_is_dying(&req->ctx->refs))
+ if (percpu_ref_is_dying(&ctx->refs))
+ return false;
+ return true;
+}
+#endif
+
+static bool io_rw_reissue(struct io_kiocb *req)
+{
+#ifdef CONFIG_BLOCK
+ if (!io_rw_should_reissue(req))
return false;
lockdep_assert_held(&req->ctx->uring_lock);
@@ -2482,6 +2491,19 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+#ifdef CONFIG_BLOCK
+ /* Rewind iter, if we have one. iopoll path resubmits as usual */
+ if (res == -EAGAIN && io_rw_should_reissue(req)) {
+ struct io_async_rw *rw = req->async_data;
+
+ if (rw)
+ iov_iter_revert(&rw->iter,
+ req->result - iov_iter_count(&rw->iter));
+ else if (!io_resubmit_prep(req))
+ res = -EIO;
+ }
+#endif
+
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
@@ -3230,6 +3252,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = io_iter_do_read(req, iter);
if (ret == -EIOCBQUEUED) {
+ if (req->async_data)
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
goto out_free;
} else if (ret == -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
@@ -3361,6 +3385,8 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
goto done;
+ if (ret2 == -EIOCBQUEUED && req->async_data)
+ iov_iter_revert(iter, io_size - iov_iter_count(iter));
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 4d14c5cde5c268a2bc26addecf09489cb953ef64 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Mon, 22 Feb 2021 18:40:44 +0200
Subject: [PATCH] btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
Calling btrfs_qgroup_reserve_meta_prealloc from
btrfs_delayed_inode_reserve_metadata can result in flushing delalloc
while holding a transaction and delayed node locks. This is deadlock
prone. In the past multiple commits:
* ae5e070eaca9 ("btrfs: qgroup: don't try to wait flushing if we're
already holding a transaction")
* 6f23277a49e6 ("btrfs: qgroup: don't commit transaction when we already
hold the handle")
Tried to solve various aspects of this but this was always a
whack-a-mole game. Unfortunately those 2 fixes don't solve a deadlock
scenario involving btrfs_delayed_node::mutex. Namely, one thread
can call btrfs_dirty_inode as a result of reading a file and modifying
its atime:
PID: 6963 TASK: ffff8c7f3f94c000 CPU: 2 COMMAND: "test"
#0 __schedule at ffffffffa529e07d
#1 schedule at ffffffffa529e4ff
#2 schedule_timeout at ffffffffa52a1bdd
#3 wait_for_completion at ffffffffa529eeea <-- sleeps with delayed node mutex held
#4 start_delalloc_inodes at ffffffffc0380db5
#5 btrfs_start_delalloc_snapshot at ffffffffc0393836
#6 try_flush_qgroup at ffffffffc03f04b2
#7 __btrfs_qgroup_reserve_meta at ffffffffc03f5bb6 <-- tries to reserve space and starts delalloc inodes.
#8 btrfs_delayed_update_inode at ffffffffc03e31aa <-- acquires delayed node mutex
#9 btrfs_update_inode at ffffffffc0385ba8
#10 btrfs_dirty_inode at ffffffffc038627b <-- TRANSACTIION OPENED
#11 touch_atime at ffffffffa4cf0000
#12 generic_file_read_iter at ffffffffa4c1f123
#13 new_sync_read at ffffffffa4ccdc8a
#14 vfs_read at ffffffffa4cd0849
#15 ksys_read at ffffffffa4cd0bd1
#16 do_syscall_64 at ffffffffa4a052eb
#17 entry_SYSCALL_64_after_hwframe at ffffffffa540008c
This will cause an asynchronous work to flush the delalloc inodes to
happen which can try to acquire the same delayed_node mutex:
PID: 455 TASK: ffff8c8085fa4000 CPU: 5 COMMAND: "kworker/u16:30"
#0 __schedule at ffffffffa529e07d
#1 schedule at ffffffffa529e4ff
#2 schedule_preempt_disabled at ffffffffa529e80a
#3 __mutex_lock at ffffffffa529fdcb <-- goes to sleep, never wakes up.
#4 btrfs_delayed_update_inode at ffffffffc03e3143 <-- tries to acquire the mutex
#5 btrfs_update_inode at ffffffffc0385ba8 <-- this is the same inode that pid 6963 is holding
#6 cow_file_range_inline.constprop.78 at ffffffffc0386be7
#7 cow_file_range at ffffffffc03879c1
#8 btrfs_run_delalloc_range at ffffffffc038894c
#9 writepage_delalloc at ffffffffc03a3c8f
#10 __extent_writepage at ffffffffc03a4c01
#11 extent_write_cache_pages at ffffffffc03a500b
#12 extent_writepages at ffffffffc03a6de2
#13 do_writepages at ffffffffa4c277eb
#14 __filemap_fdatawrite_range at ffffffffa4c1e5bb
#15 btrfs_run_delalloc_work at ffffffffc0380987 <-- starts running delayed nodes
#16 normal_work_helper at ffffffffc03b706c
#17 process_one_work at ffffffffa4aba4e4
#18 worker_thread at ffffffffa4aba6fd
#19 kthread at ffffffffa4ac0a3d
#20 ret_from_fork at ffffffffa54001ff
To fully address those cases the complete fix is to never issue any
flushing while holding the transaction or the delayed node lock. This
patch achieves it by calling qgroup_reserve_meta directly which will
either succeed without flushing or will fail and return -EDQUOT. In the
latter case that return value is going to be propagated to
btrfs_dirty_inode which will fallback to start a new transaction. That's
fine as the majority of time we expect the inode will have
BTRFS_DELAYED_NODE_INODE_DIRTY flag set which will result in directly
copying the in-memory state.
Fixes: c53e9653605d ("btrfs: qgroup: try to flush qgroup space when we get -EDQUOT")
CC: stable(a)vger.kernel.org # 5.10+
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ac9966e76a2f..bf25401c9768 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -627,7 +627,8 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC, true);
if (ret < 0)
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f2f1e932751..c35b724a5611 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6081,7 +6081,7 @@ static int btrfs_dirty_inode(struct inode *inode)
return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
- if (ret && ret == -ENOSPC) {
+ if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 5011c5a663b9c6d6aff3d394f11049b371199627 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dancarpenter(a)oracle.com>
Date: Wed, 17 Feb 2021 09:04:34 +0300
Subject: [PATCH] btrfs: validate qgroup inherit for SNAP_CREATE_V2 ioctl
The problem is we're copying "inherit" from user space but we don't
necessarily know that we're copying enough data for a 64 byte
struct. Then the next problem is that 'inherit' has a variable size
array at the end, and we have to verify that array is the size we
expected.
Fixes: 6f72c7e20dba ("Btrfs: add qgroup inheritance")
CC: stable(a)vger.kernel.org # 4.4+
Signed-off-by: Dan Carpenter <dan.carpenter(a)oracle.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a8c60d46d19c..1b837c08ca90 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1935,7 +1935,10 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
- if (vol_args->size > PAGE_SIZE) {
+ u64 nums;
+
+ if (vol_args->size < sizeof(*inherit) ||
+ vol_args->size > PAGE_SIZE) {
ret = -EINVAL;
goto free_args;
}
@@ -1944,6 +1947,20 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
ret = PTR_ERR(inherit);
goto free_args;
}
+
+ if (inherit->num_qgroups > PAGE_SIZE ||
+ inherit->num_ref_copies > PAGE_SIZE ||
+ inherit->num_excl_copies > PAGE_SIZE) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
+
+ nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
+ 2 * inherit->num_excl_copies;
+ if (vol_args->size != struct_size(inherit, qgroups, nums)) {
+ ret = -EINVAL;
+ goto free_inherit;
+ }
}
ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd,
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3c17916510428dbccdf657de050c34e208347089 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Mon, 8 Feb 2021 10:26:54 +0200
Subject: [PATCH] btrfs: fix race between extent freeing/allocation when using
bitmaps
During allocation the allocator will try to allocate an extent using
cluster policy. Once the current cluster is exhausted it will remove the
entry under btrfs_free_cluster::lock and subsequently acquire
btrfs_free_space_ctl::tree_lock to dispose of the already-deleted entry
and adjust btrfs_free_space_ctl::total_bitmap. This poses a problem
because there exists a race condition between removing the entry under
one lock and doing the necessary accounting holding a different lock
since extent freeing only uses the 2nd lock. This can result in the
following situation:
T1: T2:
btrfs_alloc_from_cluster insert_into_bitmap <holds tree_lock>
if (entry->bytes == 0) if (block_group && !list_empty(&block_group->cluster_list)) {
rb_erase(entry)
spin_unlock(&cluster->lock);
(total_bitmaps is still 4) spin_lock(&cluster->lock);
<doesn't find entry in cluster->root>
spin_lock(&ctl->tree_lock); <goes to new_bitmap label, adds
<blocked since T2 holds tree_lock> <a new entry and calls add_new_bitmap>
recalculate_thresholds <crashes,
due to total_bitmaps
becoming 5 and triggering
an ASSERT>
To fix this ensure that once depleted, the cluster entry is deleted when
both cluster lock and tree locks are held in the allocator (T1), this
ensures that even if there is a race with a concurrent
insert_into_bitmap call it will correctly find the entry in the cluster
and add the new space to it.
CC: <stable(a)vger.kernel.org> # 4.4+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5400294bd271..abcf951e6b44 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3125,8 +3125,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
entry->bytes -= bytes;
}
- if (entry->bytes == 0)
- rb_erase(&entry->offset_index, &cluster->root);
break;
}
out:
@@ -3143,7 +3141,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
ctl->free_space -= bytes;
if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
+
+ spin_lock(&cluster->lock);
if (entry->bytes == 0) {
+ rb_erase(&entry->offset_index, &cluster->root);
ctl->free_extents--;
if (entry->bitmap) {
kmem_cache_free(btrfs_free_space_bitmap_cachep,
@@ -3156,6 +3157,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
kmem_cache_free(btrfs_free_space_cachep, entry);
}
+ spin_unlock(&cluster->lock);
spin_unlock(&ctl->tree_lock);
return ret;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 3c17916510428dbccdf657de050c34e208347089 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov(a)suse.com>
Date: Mon, 8 Feb 2021 10:26:54 +0200
Subject: [PATCH] btrfs: fix race between extent freeing/allocation when using
bitmaps
During allocation the allocator will try to allocate an extent using
cluster policy. Once the current cluster is exhausted it will remove the
entry under btrfs_free_cluster::lock and subsequently acquire
btrfs_free_space_ctl::tree_lock to dispose of the already-deleted entry
and adjust btrfs_free_space_ctl::total_bitmap. This poses a problem
because there exists a race condition between removing the entry under
one lock and doing the necessary accounting holding a different lock
since extent freeing only uses the 2nd lock. This can result in the
following situation:
T1: T2:
btrfs_alloc_from_cluster insert_into_bitmap <holds tree_lock>
if (entry->bytes == 0) if (block_group && !list_empty(&block_group->cluster_list)) {
rb_erase(entry)
spin_unlock(&cluster->lock);
(total_bitmaps is still 4) spin_lock(&cluster->lock);
<doesn't find entry in cluster->root>
spin_lock(&ctl->tree_lock); <goes to new_bitmap label, adds
<blocked since T2 holds tree_lock> <a new entry and calls add_new_bitmap>
recalculate_thresholds <crashes,
due to total_bitmaps
becoming 5 and triggering
an ASSERT>
To fix this ensure that once depleted, the cluster entry is deleted when
both cluster lock and tree locks are held in the allocator (T1), this
ensures that even if there is a race with a concurrent
insert_into_bitmap call it will correctly find the entry in the cluster
and add the new space to it.
CC: <stable(a)vger.kernel.org> # 4.4+
Reviewed-by: Josef Bacik <josef(a)toxicpanda.com>
Signed-off-by: Nikolay Borisov <nborisov(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5400294bd271..abcf951e6b44 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3125,8 +3125,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
entry->bytes -= bytes;
}
- if (entry->bytes == 0)
- rb_erase(&entry->offset_index, &cluster->root);
break;
}
out:
@@ -3143,7 +3141,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
ctl->free_space -= bytes;
if (!entry->bitmap && !btrfs_free_space_trimmed(entry))
ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes;
+
+ spin_lock(&cluster->lock);
if (entry->bytes == 0) {
+ rb_erase(&entry->offset_index, &cluster->root);
ctl->free_extents--;
if (entry->bitmap) {
kmem_cache_free(btrfs_free_space_bitmap_cachep,
@@ -3156,6 +3157,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group,
kmem_cache_free(btrfs_free_space_cachep, entry);
}
+ spin_unlock(&cluster->lock);
spin_unlock(&ctl->tree_lock);
return ret;