Patch 1-2 and 5 helps to fix the conflicts smoothly. Patch 3-4 and 6 fixes the regression as reported by the test case btrfs/153 and comes from the below patch-set (btrfs: qgroup: Fix the long-existing regression of btrfs/153) Patch 7 fixes lockdep Warning as in the commit log now reported by the test case btrfs/153 on 5.4.y
Filipe Manana (1): btrfs: fix lockdep splat when enabling and disabling qgroups
Nikolay Borisov (2): btrfs: make qgroup_free_reserved_data take btrfs_inode btrfs: make btrfs_qgroup_reserve_data take btrfs_inode
Qu Wenruo (4): btrfs: qgroup: allow to unreserve range without releasing other ranges btrfs: qgroup: try to flush qgroup space when we get -EDQUOT btrfs: transaction: Cleanup unused TRANS_STATE_BLOCKED btrfs: qgroup: remove ASYNC_COMMIT mechanism in favor of reserve retry-after-EDQUOT
fs/btrfs/ctree.h | 13 +- fs/btrfs/delalloc-space.c | 2 +- fs/btrfs/disk-io.c | 4 +- fs/btrfs/file.c | 7 +- fs/btrfs/qgroup.c | 308 ++++++++++++++++++++++++++++---------- fs/btrfs/qgroup.h | 2 +- fs/btrfs/transaction.c | 16 +- fs/btrfs/transaction.h | 15 -- 8 files changed, 246 insertions(+), 121 deletions(-)
From: Nikolay Borisov nborisov@suse.com
commit df2cfd131fd33dbef1ce33be8b332b1f3d645f35 upstream
It only uses btrfs_inode so can just as easily take it as an argument.
Signed-off-by: Nikolay Borisov nborisov@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com --- fs/btrfs/qgroup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index cd8e81c02f63..1f214f80d664 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3481,10 +3481,10 @@ int btrfs_qgroup_reserve_data(struct inode *inode, }
/* Free ranges specified by @reserved, normally in error path */ -static int qgroup_free_reserved_data(struct inode *inode, +static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; @@ -3520,8 +3520,8 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, - free_start, free_start + free_len - 1, + ret = clear_record_extent_bits(&inode->io_tree, free_start, + free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -3550,7 +3550,8 @@ static int __btrfs_qgroup_release_data(struct inode *inode, /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(inode, reserved, start, len); + return qgroup_free_reserved_data(BTRFS_I(inode), reserved, + start, len); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
From: Nikolay Borisov nborisov@suse.com
commit 7661a3e033ab782366e0e1f4b6aad0df3555fcbd upstream
There's only a single use of vfs_inode in a tracepoint so let's take btrfs_inode directly.
Signed-off-by: Nikolay Borisov nborisov@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com --- fs/btrfs/delalloc-space.c | 2 +- fs/btrfs/file.c | 7 ++++--- fs/btrfs/qgroup.c | 10 +++++----- fs/btrfs/qgroup.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index db9f2c58eb4a..f4f531c4aa96 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -151,7 +151,7 @@ int btrfs_check_data_free_space(struct inode *inode, return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), reserved, start, len); if (ret < 0) btrfs_free_reserved_data_space_noquota(inode, start, len); else diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f6308a7b761d..400b0717b9d4 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3149,7 +3149,7 @@ static int btrfs_zero_range(struct inode *inode, &cached_state); if (ret) goto out; - ret = btrfs_qgroup_reserve_data(inode, &data_reserved, + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, @@ -3322,8 +3322,9 @@ static long btrfs_fallocate(struct file *file, int mode, free_extent_map(em); break; } - ret = btrfs_qgroup_reserve_data(inode, &data_reserved, - cur_offset, last_byte - cur_offset); + ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), + &data_reserved, cur_offset, + last_byte - cur_offset); if (ret < 0) { cur_offset = last_byte; free_extent_map(em); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1f214f80d664..3d8cc8d56274 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3425,11 +3425,11 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * same @reserved, caller must ensure when error happens it's OK * to free *ALL* reserved space. */ -int btrfs_qgroup_reserve_data(struct inode *inode, +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset *reserved; @@ -3452,12 +3452,12 @@ int btrfs_qgroup_reserve_data(struct inode *inode, reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; - ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + ret = set_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, reserved);
/* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; - trace_btrfs_qgroup_reserve_data(inode, start, len, + trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, to_reserve, QGROUP_RESERVE); if (ret < 0) goto cleanup; @@ -3471,7 +3471,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, /* cleanup *ALL* already reserved ranges */ ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(&reserved->range_changed, &uiter))) - clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + clear_extent_bit(&inode->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); /* Also free data bytes of already reserved one */ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index b0420c4f5d0e..d1f93585f217 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -344,7 +344,7 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, #endif
/* New io_tree based accurate qgroup reserve API */ -int btrfs_qgroup_reserve_data(struct inode *inode, +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); int btrfs_qgroup_free_data(struct inode *inode,
From: Qu Wenruo wqu@suse.com
commit 263da812e87bac4098a4778efaa32c54275641db upstream
[PROBLEM] Before this patch, when btrfs_qgroup_reserve_data() fails, we free all reserved space of the changeset.
For example: ret = btrfs_qgroup_reserve_data(inode, changeset, 0, SZ_1M); ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_1M, SZ_1M); ret = btrfs_qgroup_reserve_data(inode, changeset, SZ_2M, SZ_1M);
If the last btrfs_qgroup_reserve_data() failed, it will release the entire [0, 3M) range.
This behavior is kind of OK for now, as when we hit -EDQUOT, we normally go error handling and need to release all reserved ranges anyway.
But this also means the following call is not possible:
ret = btrfs_qgroup_reserve_data(); if (ret == -EDQUOT) { /* Do something to free some qgroup space */ ret = btrfs_qgroup_reserve_data(); }
As if the first btrfs_qgroup_reserve_data() fails, it will free all reserved qgroup space.
[CAUSE] This is because we release all reserved ranges when btrfs_qgroup_reserve_data() fails.
[FIX] This patch will implement a new function, qgroup_unreserve_range(), to iterate through the ulist nodes, to find any nodes in the failure range, and remove the EXTENT_QGROUP_RESERVED bits from the io_tree, and decrease the extent_changeset::bytes_changed, so that we can revert to previous state.
This allows later patches to retry btrfs_qgroup_reserve_data() if EDQUOT happens.
Suggested-by: Josef Bacik josef@toxicpanda.com Reviewed-by: Josef Bacik josef@toxicpanda.com Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com --- fs/btrfs/qgroup.c | 92 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 15 deletions(-)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3d8cc8d56274..50c45b4fcfd4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3411,6 +3411,73 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) } }
+#define rbtree_iterate_from_safe(node, next, start) \ + for (node = start; node && ({ next = rb_next(node); 1;}); node = next) + +static int qgroup_unreserve_range(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, + u64 len) +{ + struct rb_node *node; + struct rb_node *next; + struct ulist_node *entry = NULL; + int ret = 0; + + node = reserved->range_changed.root.rb_node; + while (node) { + entry = rb_entry(node, struct ulist_node, rb_node); + if (entry->val < start) + node = node->rb_right; + else if (entry) + node = node->rb_left; + else + break; + } + + /* Empty changeset */ + if (!entry) + return 0; + + if (entry->val > start && rb_prev(&entry->rb_node)) + entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, + rb_node); + + rbtree_iterate_from_safe(node, next, &entry->rb_node) { + u64 entry_start; + u64 entry_end; + u64 entry_len; + int clear_ret; + + entry = rb_entry(node, struct ulist_node, rb_node); + entry_start = entry->val; + entry_end = entry->aux; + entry_len = entry_end - entry_start + 1; + + if (entry_start >= start + len) + break; + if (entry_start + entry_len <= start) + continue; + /* + * Now the entry is in [start, start + len), revert the + * EXTENT_QGROUP_RESERVED bit. + */ + clear_ret = clear_extent_bits(&inode->io_tree, entry_start, + entry_end, EXTENT_QGROUP_RESERVED); + if (!ret && clear_ret < 0) + ret = clear_ret; + + ulist_del(&reserved->range_changed, entry->val, entry->aux); + if (likely(reserved->bytes_changed >= entry_len)) { + reserved->bytes_changed -= entry_len; + } else { + WARN_ON(1); + reserved->bytes_changed = 0; + } + } + + return ret; +} + /* * Reserve qgroup space for range [start, start + len). * @@ -3421,18 +3488,14 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) * Return <0 for error (including -EQUOT) * * NOTE: this function may sleep for memory allocation. - * if btrfs_qgroup_reserve_data() is called multiple times with - * same @reserved, caller must ensure when error happens it's OK - * to free *ALL* reserved space. */ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { struct btrfs_root *root = inode->root; - struct ulist_node *unode; - struct ulist_iterator uiter; struct extent_changeset *reserved; + bool new_reserved = false; u64 orig_reserved; u64 to_reserve; int ret; @@ -3445,6 +3508,7 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, if (WARN_ON(!reserved_ret)) return -EINVAL; if (!*reserved_ret) { + new_reserved = true; *reserved_ret = extent_changeset_alloc(); if (!*reserved_ret) return -ENOMEM; @@ -3460,7 +3524,7 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, to_reserve, QGROUP_RESERVE); if (ret < 0) - goto cleanup; + goto out; ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); if (ret < 0) goto cleanup; @@ -3468,15 +3532,13 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, return ret;
cleanup: - /* cleanup *ALL* already reserved ranges */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(&reserved->range_changed, &uiter))) - clear_extent_bit(&inode->io_tree, unode->val, - unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); - /* Also free data bytes of already reserved one */ - btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, - orig_reserved, BTRFS_QGROUP_RSV_DATA); - extent_changeset_release(reserved); + qgroup_unreserve_range(inode, reserved, start, len); +out: + if (new_reserved) { + extent_changeset_release(reserved); + kfree(reserved); + *reserved_ret = NULL; + } return ret; }
From: Qu Wenruo wqu@suse.com
commit c53e9653605dbf708f5be02902de51831be4b009 upstream
[PROBLEM] There are known problem related to how btrfs handles qgroup reserved space. One of the most obvious case is the the test case btrfs/153, which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad) --- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800 +++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800 @@ -1,2 +1,5 @@ QA output created by 153 +pwrite: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded Silence is golden ... (Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE] Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"), we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already, and since we also reserve data space for qgroup at buffered write time, it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check, qgroup reserved space is shared between data/metadata. The EDQUOT can happen at the metadata reservation, so doing NODATACOW check after qgroup reservation failure is not a solution.
[FIX] To solve the problem, we don't return -EDQUOT directly, but every time we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root NODATACOW writes will free the qgroup reserved at run_dealloc_range(). However we don't have the infrastructure to only flush NODATACOW inodes, here we flush all inodes anyway.
- Wait for ordered extents This would convert the preallocated metadata space into per-trans metadata, which can be freed in later transaction commit.
- Commit transaction This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce a per-root wait list and a new root status, to ensure only one thread starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to") Reviewed-by: Josef Bacik josef@toxicpanda.com Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com --- fs/btrfs/ctree.h | 3 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 100 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 8 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7960359dbc70..5448dc62e915 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -945,6 +945,8 @@ enum { BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used only for subvolume roots. */ BTRFS_ROOT_HAS_LOG_TREE, + /* Qgroup flushing is in progress */ + BTRFS_ROOT_QGROUP_FLUSHING, };
/* @@ -1097,6 +1099,7 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */ atomic_t nr_swapfiles; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6aa94a583e9..e3bcab38a166 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1154,6 +1154,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); + init_waitqueue_head(&root->qgroup_flush_wait); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 50c45b4fcfd4..b312ac645e08 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3479,17 +3479,58 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, }
/* - * Reserve qgroup space for range [start, start + len). + * Try to free some space for qgroup. * - * This function will either reserve space from related qgroups or doing - * nothing if the range is already reserved. + * For qgroup, there are only 3 ways to free qgroup space: + * - Flush nodatacow write + * Any nodatacow write will free its reserved data space at run_delalloc_range(). + * In theory, we should only flush nodatacow inodes, but it's not yet + * possible, so we need to flush the whole root. * - * Return 0 for successful reserve - * Return <0 for error (including -EQUOT) + * - Wait for ordered extents + * When ordered extents are finished, their reserved metadata is finally + * converted to per_trans status, which can be freed by later commit + * transaction. * - * NOTE: this function may sleep for memory allocation. + * - Commit transaction + * This would free the meta_per_trans space. + * In theory this shouldn't provide much space, but any more qgroup space + * is needed. */ -int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, +static int try_flush_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + /* + * We don't want to run flush again and again, so if there is a running + * one, we won't try to start a new flush, but exit directly. + */ + if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { + wait_event(root->qgroup_flush_wait, + !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); + return 0; + } + + ret = btrfs_start_delalloc_snapshot(root); + if (ret < 0) + goto out; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); + wake_up(&root->qgroup_flush_wait); + return ret; +} + +static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { @@ -3542,6 +3583,34 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, return ret; }
+/* + * Reserve qgroup space for range [start, start + len). + * + * This function will either reserve space from related qgroups or do nothing + * if the range is already reserved. + * + * Return 0 for successful reservation + * Return <0 for error (including -EQUOT) + * + * NOTE: This function may sleep for memory allocation, dirty page flushing and + * commit transaction. So caller should not hold any dirty page locked. + */ +int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) +{ + int ret; + + ret = qgroup_reserve_data(inode, reserved_ret, start, len); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(inode->root); + if (ret < 0) + return ret; + return qgroup_reserve_data(inode, reserved_ret, start, len); +} + /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) @@ -3712,7 +3781,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; }
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3739,6 +3808,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return ret; }
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) +{ + int ret; + + ret = qgroup_reserve_meta(root, num_bytes, type, enforce); + if (ret <= 0 && ret != -EDQUOT) + return ret; + + ret = try_flush_qgroup(root); + if (ret < 0) + return ret; + return qgroup_reserve_meta(root, num_bytes, type, enforce); +} + void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info;
On 2021/8/13 下午5:55, Anand Jain wrote:
From: Qu Wenruo wqu@suse.com
commit c53e9653605dbf708f5be02902de51831be4b009 upstream
This lacks certain upstream fixes for it:
f9baa501b4fd6962257853d46ddffbc21f27e344 btrfs: fix deadlock when cloning inline extents and using qgroups
4d14c5cde5c268a2bc26addecf09489cb953ef64 btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
6f23277a49e68f8a9355385c846939ad0b1261e7 btrfs: qgroup: don't commit transaction when we already hold the handle
All these fixes are to ensure we don't try to flush in context where we shouldn't.
Without them, it can hit various deadlock.
Thanks, Qu
[PROBLEM] There are known problem related to how btrfs handles qgroup reserved space. One of the most obvious case is the the test case btrfs/153, which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad) --- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800 +++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800 @@ -1,2 +1,5 @@ QA output created by 153 +pwrite: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded Silence is golden ... (Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE] Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"), we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already, and since we also reserve data space for qgroup at buffered write time, it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check, qgroup reserved space is shared between data/metadata. The EDQUOT can happen at the metadata reservation, so doing NODATACOW check after qgroup reservation failure is not a solution.
[FIX] To solve the problem, we don't return -EDQUOT directly, but every time we got a -EDQUOT, we try to flush qgroup space:
Flush all inodes of the root NODATACOW writes will free the qgroup reserved at run_dealloc_range(). However we don't have the infrastructure to only flush NODATACOW inodes, here we flush all inodes anyway.
Wait for ordered extents This would convert the preallocated metadata space into per-trans metadata, which can be freed in later transaction commit.
Commit transaction This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce a per-root wait list and a new root status, to ensure only one thread starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to") Reviewed-by: Josef Bacik josef@toxicpanda.com Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com
fs/btrfs/ctree.h | 3 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 100 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 8 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7960359dbc70..5448dc62e915 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -945,6 +945,8 @@ enum { BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used only for subvolume roots. */ BTRFS_ROOT_HAS_LOG_TREE,
/* Qgroup flushing is in progress */
BTRFS_ROOT_QGROUP_FLUSHING, };
/*
@@ -1097,6 +1099,7 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc;
wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */ atomic_t nr_swapfiles;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6aa94a583e9..e3bcab38a166 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1154,6 +1154,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex);
- init_waitqueue_head(&root->qgroup_flush_wait); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 50c45b4fcfd4..b312ac645e08 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3479,17 +3479,58 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, }
/*
- Reserve qgroup space for range [start, start + len).
- Try to free some space for qgroup.
- This function will either reserve space from related qgroups or doing
- nothing if the range is already reserved.
- For qgroup, there are only 3 ways to free qgroup space:
- Flush nodatacow write
- Any nodatacow write will free its reserved data space at run_delalloc_range().
- In theory, we should only flush nodatacow inodes, but it's not yet
- possible, so we need to flush the whole root.
- Return 0 for successful reserve
- Return <0 for error (including -EQUOT)
- Wait for ordered extents
- When ordered extents are finished, their reserved metadata is finally
- converted to per_trans status, which can be freed by later commit
- transaction.
- NOTE: this function may sleep for memory allocation.
- Commit transaction
- This would free the meta_per_trans space.
- In theory this shouldn't provide much space, but any more qgroup space
*/
- is needed.
-int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, +static int try_flush_qgroup(struct btrfs_root *root) +{
- struct btrfs_trans_handle *trans;
- int ret;
- /*
* We don't want to run flush again and again, so if there is a running
* one, we won't try to start a new flush, but exit directly.
*/
- if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
wait_event(root->qgroup_flush_wait,
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
return 0;
- }
- ret = btrfs_start_delalloc_snapshot(root);
- if (ret < 0)
goto out;
- btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
- }
- ret = btrfs_commit_transaction(trans);
+out:
- clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
- wake_up(&root->qgroup_flush_wait);
- return ret;
+}
+static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { @@ -3542,6 +3583,34 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, return ret; }
+/*
- Reserve qgroup space for range [start, start + len).
- This function will either reserve space from related qgroups or do nothing
- if the range is already reserved.
- Return 0 for successful reservation
- Return <0 for error (including -EQUOT)
- NOTE: This function may sleep for memory allocation, dirty page flushing and
commit transaction. So caller should not hold any dirty page locked.
- */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
+{
- int ret;
- ret = qgroup_reserve_data(inode, reserved_ret, start, len);
- if (ret <= 0 && ret != -EDQUOT)
return ret;
- ret = try_flush_qgroup(inode->root);
- if (ret < 0)
return ret;
- return qgroup_reserve_data(inode, reserved_ret, start, len);
+}
- /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len)
@@ -3712,7 +3781,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; }
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3739,6 +3808,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return ret; }
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce)
+{
- int ret;
- ret = qgroup_reserve_meta(root, num_bytes, type, enforce);
- if (ret <= 0 && ret != -EDQUOT)
return ret;
- ret = try_flush_qgroup(root);
- if (ret < 0)
return ret;
- return qgroup_reserve_meta(root, num_bytes, type, enforce);
+}
- void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info;
On 13/08/2021 18:26, Qu Wenruo wrote:
On 2021/8/13 下午5:55, Anand Jain wrote:
From: Qu Wenruo wqu@suse.com
commit c53e9653605dbf708f5be02902de51831be4b009 upstream
This lacks certain upstream fixes for it:
f9baa501b4fd6962257853d46ddffbc21f27e344 btrfs: fix deadlock when cloning inline extents and using qgroups
4d14c5cde5c268a2bc26addecf09489cb953ef64 btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
6f23277a49e68f8a9355385c846939ad0b1261e7 btrfs: qgroup: don't commit transaction when we already hold the handle
All these fixes are to ensure we don't try to flush in context where we shouldn't.
Without them, it can hit various deadlock.
Qu,
Thanks for taking a look. I will send it in v2.
-Anand
Thanks, Qu
[PROBLEM] There are known problem related to how btrfs handles qgroup reserved space. One of the most obvious case is the the test case btrfs/153, which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad) --- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800 +++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800 @@ -1,2 +1,5 @@ QA output created by 153 +pwrite: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded Silence is golden ... (Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE] Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"), we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already, and since we also reserve data space for qgroup at buffered write time, it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check, qgroup reserved space is shared between data/metadata. The EDQUOT can happen at the metadata reservation, so doing NODATACOW check after qgroup reservation failure is not a solution.
[FIX] To solve the problem, we don't return -EDQUOT directly, but every time we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range(). However we don't have the infrastructure to only flush NODATACOW inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce a per-root wait list and a new root status, to ensure only one thread starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to") Reviewed-by: Josef Bacik josef@toxicpanda.com Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com
fs/btrfs/ctree.h | 3 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 100 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 8 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7960359dbc70..5448dc62e915 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -945,6 +945,8 @@ enum { BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used only for subvolume roots. */ BTRFS_ROOT_HAS_LOG_TREE, + /* Qgroup flushing is in progress */ + BTRFS_ROOT_QGROUP_FLUSHING, };
/* @@ -1097,6 +1099,7 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */ atomic_t nr_swapfiles; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6aa94a583e9..e3bcab38a166 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1154,6 +1154,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); + init_waitqueue_head(&root->qgroup_flush_wait); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 50c45b4fcfd4..b312ac645e08 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3479,17 +3479,58 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, }
/*
- Reserve qgroup space for range [start, start + len).
- Try to free some space for qgroup.
*
- This function will either reserve space from related qgroups or doing
- nothing if the range is already reserved.
- For qgroup, there are only 3 ways to free qgroup space:
- Flush nodatacow write
- * Any nodatacow write will free its reserved data space at
run_delalloc_range().
- * In theory, we should only flush nodatacow inodes, but it's not yet
- * possible, so we need to flush the whole root.
*
- Return 0 for successful reserve
- Return <0 for error (including -EQUOT)
- Wait for ordered extents
- * When ordered extents are finished, their reserved metadata is
finally
- * converted to per_trans status, which can be freed by later commit
- * transaction.
*
- NOTE: this function may sleep for memory allocation.
- Commit transaction
- * This would free the meta_per_trans space.
- * In theory this shouldn't provide much space, but any more qgroup
space
- * is needed.
*/ -int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, +static int try_flush_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret;
+ /* + * We don't want to run flush again and again, so if there is a running + * one, we won't try to start a new flush, but exit directly. + */ + if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { + wait_event(root->qgroup_flush_wait, + !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); + return 0; + }
+ ret = btrfs_start_delalloc_snapshot(root); + if (ret < 0) + goto out; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+ trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + }
+ ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); + wake_up(&root->qgroup_flush_wait); + return ret; +}
+static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { @@ -3542,6 +3583,34 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, return ret; }
+/*
- Reserve qgroup space for range [start, start + len).
- This function will either reserve space from related qgroups or do
nothing
- if the range is already reserved.
- Return 0 for successful reservation
- Return <0 for error (including -EQUOT)
- NOTE: This function may sleep for memory allocation, dirty page
flushing and
- * commit transaction. So caller should not hold any dirty page
locked.
- */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) +{ + int ret;
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len); + if (ret <= 0 && ret != -EDQUOT) + return ret;
+ ret = try_flush_qgroup(inode->root); + if (ret < 0) + return ret; + return qgroup_reserve_data(inode, reserved_ret, start, len); +}
/* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) @@ -3712,7 +3781,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; }
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3739,6 +3808,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return ret; }
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) +{ + int ret;
+ ret = qgroup_reserve_meta(root, num_bytes, type, enforce); + if (ret <= 0 && ret != -EDQUOT) + return ret;
+ ret = try_flush_qgroup(root); + if (ret < 0) + return ret; + return qgroup_reserve_meta(root, num_bytes, type, enforce); +}
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info;
On 2021/8/13 下午6:30, Anand Jain wrote:
On 13/08/2021 18:26, Qu Wenruo wrote:
On 2021/8/13 下午5:55, Anand Jain wrote:
From: Qu Wenruo wqu@suse.com
commit c53e9653605dbf708f5be02902de51831be4b009 upstream
This lacks certain upstream fixes for it:
f9baa501b4fd6962257853d46ddffbc21f27e344 btrfs: fix deadlock when cloning inline extents and using qgroups
4d14c5cde5c268a2bc26addecf09489cb953ef64 btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
6f23277a49e68f8a9355385c846939ad0b1261e7 btrfs: qgroup: don't commit transaction when we already hold the handle
All these fixes are to ensure we don't try to flush in context where we shouldn't.
Without them, it can hit various deadlock.
Qu,
Thanks for taking a look. I will send it in v2.
I guess you only need to add the missing fixes?
Thanks, Qu
-Anand
Thanks, Qu
[PROBLEM] There are known problem related to how btrfs handles qgroup reserved space. One of the most obvious case is the the test case btrfs/153, which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad) --- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800 +++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800 @@ -1,2 +1,5 @@ QA output created by 153 +pwrite: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded Silence is golden ... (Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE] Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"), we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already, and since we also reserve data space for qgroup at buffered write time, it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check, qgroup reserved space is shared between data/metadata. The EDQUOT can happen at the metadata reservation, so doing NODATACOW check after qgroup reservation failure is not a solution.
[FIX] To solve the problem, we don't return -EDQUOT directly, but every time we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range(). However we don't have the infrastructure to only flush NODATACOW inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce a per-root wait list and a new root status, to ensure only one thread starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to") Reviewed-by: Josef Bacik josef@toxicpanda.com Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com
fs/btrfs/ctree.h | 3 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 100 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 8 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7960359dbc70..5448dc62e915 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -945,6 +945,8 @@ enum { BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used only for subvolume roots. */ BTRFS_ROOT_HAS_LOG_TREE, + /* Qgroup flushing is in progress */ + BTRFS_ROOT_QGROUP_FLUSHING, };
/* @@ -1097,6 +1099,7 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */ atomic_t nr_swapfiles; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6aa94a583e9..e3bcab38a166 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1154,6 +1154,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); + init_waitqueue_head(&root->qgroup_flush_wait); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 50c45b4fcfd4..b312ac645e08 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3479,17 +3479,58 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, }
/*
- Reserve qgroup space for range [start, start + len).
- Try to free some space for qgroup.
*
- This function will either reserve space from related qgroups or
doing
- nothing if the range is already reserved.
- For qgroup, there are only 3 ways to free qgroup space:
- Flush nodatacow write
- * Any nodatacow write will free its reserved data space at
run_delalloc_range().
- * In theory, we should only flush nodatacow inodes, but it's not yet
- * possible, so we need to flush the whole root.
*
- Return 0 for successful reserve
- Return <0 for error (including -EQUOT)
- Wait for ordered extents
- * When ordered extents are finished, their reserved metadata is
finally
- * converted to per_trans status, which can be freed by later commit
- * transaction.
*
- NOTE: this function may sleep for memory allocation.
- Commit transaction
- * This would free the meta_per_trans space.
- * In theory this shouldn't provide much space, but any more
qgroup space
- * is needed.
*/ -int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, +static int try_flush_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret;
+ /* + * We don't want to run flush again and again, so if there is a running + * one, we won't try to start a new flush, but exit directly. + */ + if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { + wait_event(root->qgroup_flush_wait, + !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); + return 0; + }
+ ret = btrfs_start_delalloc_snapshot(root); + if (ret < 0) + goto out; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+ trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + }
+ ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); + wake_up(&root->qgroup_flush_wait); + return ret; +}
+static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { @@ -3542,6 +3583,34 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, return ret; }
+/*
- Reserve qgroup space for range [start, start + len).
- This function will either reserve space from related qgroups or
do nothing
- if the range is already reserved.
- Return 0 for successful reservation
- Return <0 for error (including -EQUOT)
- NOTE: This function may sleep for memory allocation, dirty page
flushing and
- * commit transaction. So caller should not hold any dirty page
locked.
- */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) +{ + int ret;
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len); + if (ret <= 0 && ret != -EDQUOT) + return ret;
+ ret = try_flush_qgroup(inode->root); + if (ret < 0) + return ret; + return qgroup_reserve_data(inode, reserved_ret, start, len); +}
/* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) @@ -3712,7 +3781,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; }
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3739,6 +3808,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return ret; }
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) +{ + int ret;
+ ret = qgroup_reserve_meta(root, num_bytes, type, enforce); + if (ret <= 0 && ret != -EDQUOT) + return ret;
+ ret = try_flush_qgroup(root); + if (ret < 0) + return ret; + return qgroup_reserve_meta(root, num_bytes, type, enforce); +}
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info;
On 13/08/2021 18:39, Qu Wenruo wrote:
On 2021/8/13 下午6:30, Anand Jain wrote:
On 13/08/2021 18:26, Qu Wenruo wrote:
On 2021/8/13 下午5:55, Anand Jain wrote:
From: Qu Wenruo wqu@suse.com
commit c53e9653605dbf708f5be02902de51831be4b009 upstream
This lacks certain upstream fixes for it:
f9baa501b4fd6962257853d46ddffbc21f27e344 btrfs: fix deadlock when cloning inline extents and using qgroups
4d14c5cde5c268a2bc26addecf09489cb953ef64 btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
6f23277a49e68f8a9355385c846939ad0b1261e7 btrfs: qgroup: don't commit transaction when we already hold the handle
All these fixes are to ensure we don't try to flush in context where we shouldn't.
Without them, it can hit various deadlock.
Qu,
Thanks for taking a look. I will send it in v2.
I guess you only need to add the missing fixes?
Yeah, maybe it's better to send it as a new set.
Thx. Anand
Thanks, Qu
-Anand
Thanks, Qu
[PROBLEM] There are known problem related to how btrfs handles qgroup reserved space. One of the most obvious case is the the test case btrfs/153, which do fallocate, then write into the preallocated range.
btrfs/153 1s ... - output mismatch (see xfstests-dev/results//btrfs/153.out.bad) --- tests/btrfs/153.out 2019-10-22 15:18:14.068965341 +0800 +++ xfstests-dev/results//btrfs/153.out.bad 2020-07-01 20:24:40.730000089 +0800 @@ -1,2 +1,5 @@ QA output created by 153 +pwrite: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded +/mnt/scratch/testfile2: Disk quota exceeded Silence is golden ... (Run 'diff -u xfstests-dev/tests/btrfs/153.out xfstests-dev/results//btrfs/153.out.bad' to see the entire diff)
[CAUSE] Since commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to"), we always reserve space no matter if it's COW or not.
Such behavior change is mostly for performance, and reverting it is not a good idea anyway.
For preallcoated extent, we reserve qgroup data space for it already, and since we also reserve data space for qgroup at buffered write time, it needs twice the space for us to write into preallocated space.
This leads to the -EDQUOT in buffered write routine.
And we can't follow the same solution, unlike data/meta space check, qgroup reserved space is shared between data/metadata. The EDQUOT can happen at the metadata reservation, so doing NODATACOW check after qgroup reservation failure is not a solution.
[FIX] To solve the problem, we don't return -EDQUOT directly, but every time we got a -EDQUOT, we try to flush qgroup space:
- Flush all inodes of the root
NODATACOW writes will free the qgroup reserved at run_dealloc_range(). However we don't have the infrastructure to only flush NODATACOW inodes, here we flush all inodes anyway.
- Wait for ordered extents
This would convert the preallocated metadata space into per-trans metadata, which can be freed in later transaction commit.
- Commit transaction
This will free all per-trans metadata space.
Also we don't want to trigger flush multiple times, so here we introduce a per-root wait list and a new root status, to ensure only one thread starts the flushing.
Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to") Reviewed-by: Josef Bacik josef@toxicpanda.com Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com
fs/btrfs/ctree.h | 3 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/qgroup.c | 100 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 8 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7960359dbc70..5448dc62e915 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -945,6 +945,8 @@ enum { BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used only for subvolume roots. */ BTRFS_ROOT_HAS_LOG_TREE, + /* Qgroup flushing is in progress */ + BTRFS_ROOT_QGROUP_FLUSHING, };
/* @@ -1097,6 +1099,7 @@ struct btrfs_root { spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; + wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */ atomic_t nr_swapfiles; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e6aa94a583e9..e3bcab38a166 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1154,6 +1154,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); mutex_init(&root->delalloc_mutex); + init_waitqueue_head(&root->qgroup_flush_wait); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 50c45b4fcfd4..b312ac645e08 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3479,17 +3479,58 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, }
/*
- Reserve qgroup space for range [start, start + len).
- Try to free some space for qgroup.
*
- This function will either reserve space from related qgroups or
doing
- nothing if the range is already reserved.
- For qgroup, there are only 3 ways to free qgroup space:
- Flush nodatacow write
- * Any nodatacow write will free its reserved data space at
run_delalloc_range().
- * In theory, we should only flush nodatacow inodes, but it's not
yet
- * possible, so we need to flush the whole root.
*
- Return 0 for successful reserve
- Return <0 for error (including -EQUOT)
- Wait for ordered extents
- * When ordered extents are finished, their reserved metadata is
finally
- * converted to per_trans status, which can be freed by later commit
- * transaction.
*
- NOTE: this function may sleep for memory allocation.
- Commit transaction
- * This would free the meta_per_trans space.
- * In theory this shouldn't provide much space, but any more
qgroup space
- * is needed.
*/ -int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, +static int try_flush_qgroup(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret;
+ /* + * We don't want to run flush again and again, so if there is a running + * one, we won't try to start a new flush, but exit directly. + */ + if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { + wait_event(root->qgroup_flush_wait, + !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); + return 0; + }
+ ret = btrfs_start_delalloc_snapshot(root); + if (ret < 0) + goto out; + btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+ trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + }
+ ret = btrfs_commit_transaction(trans); +out: + clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); + wake_up(&root->qgroup_flush_wait); + return ret; +}
+static int qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved_ret, u64 start, u64 len) { @@ -3542,6 +3583,34 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, return ret; }
+/*
- Reserve qgroup space for range [start, start + len).
- This function will either reserve space from related qgroups or
do nothing
- if the range is already reserved.
- Return 0 for successful reservation
- Return <0 for error (including -EQUOT)
- NOTE: This function may sleep for memory allocation, dirty page
flushing and
- * commit transaction. So caller should not hold any dirty page
locked.
- */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved_ret, u64 start, + u64 len) +{ + int ret;
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len); + if (ret <= 0 && ret != -EDQUOT) + return ret;
+ ret = try_flush_qgroup(inode->root); + if (ret < 0) + return ret; + return qgroup_reserve_data(inode, reserved_ret, start, len); +}
/* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len) @@ -3712,7 +3781,7 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; }
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +static int qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3739,6 +3808,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return ret; }
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) +{ + int ret;
+ ret = qgroup_reserve_meta(root, num_bytes, type, enforce); + if (ret <= 0 && ret != -EDQUOT) + return ret;
+ ret = try_flush_qgroup(root); + if (ret < 0) + return ret; + return qgroup_reserve_meta(root, num_bytes, type, enforce); +}
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info;
On Fri, Aug 13, 2021 at 06:41:53PM +0800, Anand Jain wrote:
On 13/08/2021 18:39, Qu Wenruo wrote:
On 2021/8/13 下午6:30, Anand Jain wrote:
On 13/08/2021 18:26, Qu Wenruo wrote:
On 2021/8/13 下午5:55, Anand Jain wrote:
From: Qu Wenruo wqu@suse.com
commit c53e9653605dbf708f5be02902de51831be4b009 upstream
This lacks certain upstream fixes for it:
f9baa501b4fd6962257853d46ddffbc21f27e344 btrfs: fix deadlock when cloning inline extents and using qgroups
4d14c5cde5c268a2bc26addecf09489cb953ef64 btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
6f23277a49e68f8a9355385c846939ad0b1261e7 btrfs: qgroup: don't commit transaction when we already hold the handle
All these fixes are to ensure we don't try to flush in context where we shouldn't.
Without them, it can hit various deadlock.
Qu,
Thanks for taking a look. I will send it in v2.
I guess you only need to add the missing fixes?
Yeah, maybe it's better to send it as a new set.
So should I drop the existing patches and wait for a whole new series, or will you send these as an additional set?
And at least one of the above commits needs to go to the 5.10.y tree, I did not check them all...
thanks,
greg k-h
On 13/08/2021 18:56, Greg KH wrote:
On Fri, Aug 13, 2021 at 06:41:53PM +0800, Anand Jain wrote:
On 13/08/2021 18:39, Qu Wenruo wrote:
On 2021/8/13 下午6:30, Anand Jain wrote:
On 13/08/2021 18:26, Qu Wenruo wrote:
On 2021/8/13 下午5:55, Anand Jain wrote:
From: Qu Wenruo wqu@suse.com
commit c53e9653605dbf708f5be02902de51831be4b009 upstream
This lacks certain upstream fixes for it:
f9baa501b4fd6962257853d46ddffbc21f27e344 btrfs: fix deadlock when cloning inline extents and using qgroups
4d14c5cde5c268a2bc26addecf09489cb953ef64 btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
6f23277a49e68f8a9355385c846939ad0b1261e7 btrfs: qgroup: don't commit transaction when we already hold the handle
All these fixes are to ensure we don't try to flush in context where we shouldn't.
Without them, it can hit various deadlock.
Qu,
Thanks for taking a look. I will send it in v2.
I guess you only need to add the missing fixes?
Yeah, maybe it's better to send it as a new set.
So should I drop the existing patches and wait for a whole new series, or will you send these as an additional set?
Greg, I am sending it as an additional set.
And at least one of the above commits needs to go to the 5.10.y tree, I did not check them all...
I need to look into it.
Thanks, Anand
thanks,
greg k-h
On 13/08/2021 19:06, Anand Jain wrote:
On 13/08/2021 18:56, Greg KH wrote:
On Fri, Aug 13, 2021 at 06:41:53PM +0800, Anand Jain wrote:
On 13/08/2021 18:39, Qu Wenruo wrote:
On 2021/8/13 下午6:30, Anand Jain wrote:
On 13/08/2021 18:26, Qu Wenruo wrote:
On 2021/8/13 下午5:55, Anand Jain wrote: > From: Qu Wenruo wqu@suse.com > > commit c53e9653605dbf708f5be02902de51831be4b009 upstream
This lacks certain upstream fixes for it:
f9baa501b4fd6962257853d46ddffbc21f27e344 btrfs: fix deadlock when cloning inline extents and using qgroups
4d14c5cde5c268a2bc26addecf09489cb953ef64 btrfs: don't flush from btrfs_delayed_inode_reserve_metadata
6f23277a49e68f8a9355385c846939ad0b1261e7 btrfs: qgroup: don't commit transaction when we already hold the handle
All these fixes are to ensure we don't try to flush in context where we shouldn't.
Without them, it can hit various deadlock.
Qu,
Thanks for taking a look. I will send it in v2.
I guess you only need to add the missing fixes?
Yeah, maybe it's better to send it as a new set.
So should I drop the existing patches and wait for a whole new series, or will you send these as an additional set?
Greg, I am sending it as an additional set.
And at least one of the above commits needs to go to the 5.10.y tree, I did not check them all...
I need to look into it.
We don't need 1/7 in 5.10.y it was a preparatory patch in 5.4.y [PATCH 1/7] btrfs: make qgroup_free_reserved_data take btrfs_inode
The rest of the patches (in patchset 1 and 2) are already in the stable-5.10.y.
Thx, Anand
Thanks, Anand
thanks,
greg k-h
From: Qu Wenruo wqu@suse.com
commit 3296bf562443a8ca35aaad959a76a49e9b412760 upstream
The state was introduced in commit 4a9d8bdee368 ("Btrfs: make the state of the transaction more readable"), then in commit 302167c50b32 ("btrfs: don't end the transaction for delayed refs in throttle") the state is completely removed.
So we can just clean up the state since it's only compared but never set.
Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/transaction.c | 15 +++------------ fs/btrfs/transaction.h | 1 - 3 files changed, 4 insertions(+), 14 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e3bcab38a166..6d6e7b0e3676 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1748,7 +1748,7 @@ static int transaction_kthread(void *arg) }
now = ktime_get_seconds(); - if (cur->state < TRANS_STATE_BLOCKED && + if (cur->state < TRANS_STATE_COMMIT_START && !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index aca6c467d776..c314f26d1f15 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -27,7 +27,6 @@
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, - [TRANS_STATE_BLOCKED] = __TRANS_START, [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | @@ -388,7 +387,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
static inline int is_transaction_blocked(struct btrfs_transaction *trans) { - return (trans->state >= TRANS_STATE_BLOCKED && + return (trans->state >= TRANS_STATE_COMMIT_START && trans->state < TRANS_STATE_UNBLOCKED && !TRANS_ABORTED(trans)); } @@ -580,7 +579,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, INIT_LIST_HEAD(&h->new_bgs);
smp_mb(); - if (cur_trans->state >= TRANS_STATE_BLOCKED && + if (cur_trans->state >= TRANS_STATE_COMMIT_START && may_wait_transaction(fs_info, type)) { current->journal_info = h; btrfs_commit_transaction(h); @@ -797,7 +796,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction;
smp_mb(); - if (cur_trans->state >= TRANS_STATE_BLOCKED || + if (cur_trans->state >= TRANS_STATE_COMMIT_START || cur_trans->delayed_refs.flushing) return 1;
@@ -830,7 +829,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0;
if (refcount_read(&trans->use_count) > 1) { @@ -846,13 +844,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_chunk_metadata(trans);
- if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { - if (throttle) - return btrfs_commit_transaction(trans); - else - wake_up_process(info->transaction_kthread); - } - if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(info->sb);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 7291a2a93075..761cc65a7264 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -13,7 +13,6 @@
enum btrfs_trans_state { TRANS_STATE_RUNNING, - TRANS_STATE_BLOCKED, TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED,
From: Qu Wenruo wqu@suse.com
commit adca4d945c8dca28a85df45c5b117e6dac2e77f1 upstream
commit a514d63882c3 ("btrfs: qgroup: Commit transaction in advance to reduce early EDQUOT") tries to reduce the early EDQUOT problems by checking the qgroup free against threshold and tries to wake up commit kthread to free some space.
The problem of that mechanism is, it can only free qgroup per-trans metadata space, can't do anything to data, nor prealloc qgroup space.
Now since we have the ability to flush qgroup space, and implemented retry-after-EDQUOT behavior, such mechanism can be completely replaced.
So this patch will cleanup such mechanism in favor of retry-after-EDQUOT.
Reviewed-by: Josef Bacik josef@toxicpanda.com Signed-off-by: Qu Wenruo wqu@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com --- fs/btrfs/ctree.h | 5 ----- fs/btrfs/disk-io.c | 1 - fs/btrfs/qgroup.c | 43 ++---------------------------------------- fs/btrfs/transaction.c | 1 - fs/btrfs/transaction.h | 14 -------------- 5 files changed, 2 insertions(+), 62 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5448dc62e915..1dd36965cd08 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -504,11 +504,6 @@ enum { * (device replace, resize, device add/delete, balance) */ BTRFS_FS_EXCL_OP, - /* - * To info transaction_kthread we need an immediate commit so it - * doesn't need to wait for commit_interval - */ - BTRFS_FS_NEED_ASYNC_COMMIT, /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6d6e7b0e3676..9373b4805da2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1749,7 +1749,6 @@ static int transaction_kthread(void *arg)
now = ktime_get_seconds(); if (cur->state < TRANS_STATE_COMMIT_START && - !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) && (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { spin_unlock(&fs_info->trans_lock); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b312ac645e08..4720e477c482 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -11,7 +11,6 @@ #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/btrfs.h> -#include <linux/sizes.h>
#include "ctree.h" #include "transaction.h" @@ -2840,20 +2839,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, return ret; }
-/* - * Two limits to commit transaction in advance. - * - * For RATIO, it will be 1/RATIO of the remaining limit as threshold. - * For SIZE, it will be in byte unit as threshold. - */ -#define QGROUP_FREE_RATIO 32 -#define QGROUP_FREE_SIZE SZ_32M -static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, - const struct btrfs_qgroup *qg, u64 num_bytes) +static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) { - u64 free; - u64 threshold; - if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) return false; @@ -2862,32 +2849,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) return false;
- /* - * Even if we passed the check, it's better to check if reservation - * for meta_pertrans is pushing us near limit. - * If there is too much pertrans reservation or it's near the limit, - * let's try commit transaction to free some, using transaction_kthread - */ - if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER | - BTRFS_QGROUP_LIMIT_MAX_EXCL))) { - if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { - free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl; - threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO, - QGROUP_FREE_SIZE); - } else { - free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer; - threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO, - QGROUP_FREE_SIZE); - } - - /* - * Use transaction_kthread to commit transaction, so we no - * longer need to bother nested transaction nor lock context. - */ - if (free < threshold) - btrfs_commit_transaction_locksafe(fs_info); - } - return true; }
@@ -2937,7 +2898,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) { + if (enforce && !qgroup_check_limits(qg, num_bytes)) { ret = -EDQUOT; goto out; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c314f26d1f15..948b11748fe6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2295,7 +2295,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); - clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 761cc65a7264..d8a7d460e436 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -207,20 +207,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, int wait_for_unblock); - -/* - * Try to commit transaction asynchronously, so this is safe to call - * even holding a spinlock. - * - * It's done by informing transaction_kthread to commit transaction without - * waiting for commit interval. - */ -static inline void btrfs_commit_transaction_locksafe( - struct btrfs_fs_info *fs_info) -{ - set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags); - wake_up_process(fs_info->transaction_kthread); -} int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans); void btrfs_throttle(struct btrfs_fs_info *fs_info);
From: Filipe Manana fdmanana@suse.com
commit a855fbe69229078cd8aecd8974fb996a5ca651e6 upstream
When running test case btrfs/017 from fstests, lockdep reported the following splat:
[ 1297.067385] ====================================================== [ 1297.067708] WARNING: possible circular locking dependency detected [ 1297.068022] 5.10.0-rc4-btrfs-next-73 #1 Not tainted [ 1297.068322] ------------------------------------------------------ [ 1297.068629] btrfs/189080 is trying to acquire lock: [ 1297.068929] ffff9f2725731690 (sb_internal#2){.+.+}-{0:0}, at: btrfs_quota_enable+0xaf/0xa70 [btrfs] [ 1297.069274] but task is already holding lock: [ 1297.069868] ffff9f2702b61a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_enable+0x3b/0xa70 [btrfs] [ 1297.070219] which lock already depends on the new lock.
[ 1297.071131] the existing dependency chain (in reverse order) is: [ 1297.071721] -> #1 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}: [ 1297.072375] lock_acquire+0xd8/0x490 [ 1297.072710] __mutex_lock+0xa3/0xb30 [ 1297.073061] btrfs_qgroup_inherit+0x59/0x6a0 [btrfs] [ 1297.073421] create_subvol+0x194/0x990 [btrfs] [ 1297.073780] btrfs_mksubvol+0x3fb/0x4a0 [btrfs] [ 1297.074133] __btrfs_ioctl_snap_create+0x119/0x1a0 [btrfs] [ 1297.074498] btrfs_ioctl_snap_create+0x58/0x80 [btrfs] [ 1297.074872] btrfs_ioctl+0x1a90/0x36f0 [btrfs] [ 1297.075245] __x64_sys_ioctl+0x83/0xb0 [ 1297.075617] do_syscall_64+0x33/0x80 [ 1297.075993] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1297.076380] -> #0 (sb_internal#2){.+.+}-{0:0}: [ 1297.077166] check_prev_add+0x91/0xc60 [ 1297.077572] __lock_acquire+0x1740/0x3110 [ 1297.077984] lock_acquire+0xd8/0x490 [ 1297.078411] start_transaction+0x3c5/0x760 [btrfs] [ 1297.078853] btrfs_quota_enable+0xaf/0xa70 [btrfs] [ 1297.079323] btrfs_ioctl+0x2c60/0x36f0 [btrfs] [ 1297.079789] __x64_sys_ioctl+0x83/0xb0 [ 1297.080232] do_syscall_64+0x33/0x80 [ 1297.080680] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1297.081139] other info that might help us debug this:
[ 1297.082536] Possible unsafe locking scenario:
[ 1297.083510] CPU0 CPU1 [ 1297.084005] ---- ---- [ 1297.084500] lock(&fs_info->qgroup_ioctl_lock); [ 1297.084994] lock(sb_internal#2); [ 1297.085485] lock(&fs_info->qgroup_ioctl_lock); [ 1297.085974] lock(sb_internal#2); [ 1297.086454] *** DEADLOCK *** [ 1297.087880] 3 locks held by btrfs/189080: [ 1297.088324] #0: ffff9f2725731470 (sb_writers#14){.+.+}-{0:0}, at: btrfs_ioctl+0xa73/0x36f0 [btrfs] [ 1297.088799] #1: ffff9f2702b60cc0 (&fs_info->subvol_sem){++++}-{3:3}, at: btrfs_ioctl+0x1f4d/0x36f0 [btrfs] [ 1297.089284] #2: ffff9f2702b61a08 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_enable+0x3b/0xa70 [btrfs] [ 1297.089771] stack backtrace: [ 1297.090662] CPU: 5 PID: 189080 Comm: btrfs Not tainted 5.10.0-rc4-btrfs-next-73 #1 [ 1297.091132] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 1297.092123] Call Trace: [ 1297.092629] dump_stack+0x8d/0xb5 [ 1297.093115] check_noncircular+0xff/0x110 [ 1297.093596] check_prev_add+0x91/0xc60 [ 1297.094076] ? kvm_clock_read+0x14/0x30 [ 1297.094553] ? kvm_sched_clock_read+0x5/0x10 [ 1297.095029] __lock_acquire+0x1740/0x3110 [ 1297.095510] lock_acquire+0xd8/0x490 [ 1297.095993] ? btrfs_quota_enable+0xaf/0xa70 [btrfs] [ 1297.096476] start_transaction+0x3c5/0x760 [btrfs] [ 1297.096962] ? btrfs_quota_enable+0xaf/0xa70 [btrfs] [ 1297.097451] btrfs_quota_enable+0xaf/0xa70 [btrfs] [ 1297.097941] ? btrfs_ioctl+0x1f4d/0x36f0 [btrfs] [ 1297.098429] btrfs_ioctl+0x2c60/0x36f0 [btrfs] [ 1297.098904] ? do_user_addr_fault+0x20c/0x430 [ 1297.099382] ? kvm_clock_read+0x14/0x30 [ 1297.099854] ? kvm_sched_clock_read+0x5/0x10 [ 1297.100328] ? sched_clock+0x5/0x10 [ 1297.100801] ? sched_clock_cpu+0x12/0x180 [ 1297.101272] ? __x64_sys_ioctl+0x83/0xb0 [ 1297.101739] __x64_sys_ioctl+0x83/0xb0 [ 1297.102207] do_syscall_64+0x33/0x80 [ 1297.102673] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1297.103148] RIP: 0033:0x7f773ff65d87
This is because during the quota enable ioctl we lock first the mutex qgroup_ioctl_lock and then start a transaction, and starting a transaction acquires a fs freeze semaphore (at the VFS level). However, every other code path, except for the quota disable ioctl path, we do the opposite: we start a transaction and then lock the mutex.
So fix this by making the quota enable and disable paths to start the transaction without having the mutex locked, and then, after starting the transaction, lock the mutex and check if some other task already enabled or disabled the quotas, bailing with success if that was the case.
Signed-off-by: Filipe Manana fdmanana@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Anand Jain anand.jain@oracle.com
Conflicts: fs/btrfs/qgroup.c --- fs/btrfs/ctree.h | 5 ++++- fs/btrfs/qgroup.c | 56 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 9 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1dd36965cd08..cd77c0621a55 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -827,7 +827,10 @@ struct btrfs_fs_info { */ struct ulist *qgroup_ulist;
- /* protect user change for quota operations */ + /* + * Protect user change for quota operations. If a transaction is needed, + * it must be started before locking this lock. + */ struct mutex qgroup_ioctl_lock;
/* list of dirty qgroups to be written at next commit */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4720e477c482..2f119adbc808 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -886,6 +886,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_key found_key; struct btrfs_qgroup *qgroup = NULL; struct btrfs_trans_handle *trans = NULL; + struct ulist *ulist = NULL; int ret = 0; int slot;
@@ -893,12 +894,27 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) if (fs_info->quota_root) goto out;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); - if (!fs_info->qgroup_ulist) { + ulist = ulist_alloc(GFP_KERNEL); + if (!ulist) { ret = -ENOMEM; goto out; }
+ /* + * Unlock qgroup_ioctl_lock before starting the transaction. This is to + * avoid lock acquisition inversion problems (reported by lockdep) between + * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we + * start a transaction. + * After we started the transaction lock qgroup_ioctl_lock again and + * check if someone else created the quota root in the meanwhile. If so, + * just return success and release the transaction handle. + * + * Also we don't need to worry about someone else calling + * btrfs_sysfs_add_qgroups() after we unlock and getting an error because + * that function returns 0 (success) when the sysfs entries already exist. + */ + mutex_unlock(&fs_info->qgroup_ioctl_lock); + /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item @@ -908,12 +924,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) * would be a lot of overkill. */ trans = btrfs_start_transaction(tree_root, 2); + + mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; }
+ if (fs_info->quota_root) + goto out; + + fs_info->qgroup_ulist = ulist; + ulist = NULL; + /* * initially create the quota tree */ @@ -1046,10 +1070,13 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) if (ret) { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; - if (trans) - btrfs_end_transaction(trans); } mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (ret && trans) + btrfs_end_transaction(trans); + else if (trans) + ret = btrfs_end_transaction(trans); + ulist_free(ulist); return ret; }
@@ -1062,19 +1089,29 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) goto out; + mutex_unlock(&fs_info->qgroup_ioctl_lock);
/* * 1 For the root item * * We should also reserve enough items for the quota tree deletion in * btrfs_clean_quota_tree but this is not done. + * + * Also, we must always start a transaction without holding the mutex + * qgroup_ioctl_lock, see btrfs_quota_enable(). */ trans = btrfs_start_transaction(fs_info->tree_root, 1); + + mutex_lock(&fs_info->qgroup_ioctl_lock); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + trans = NULL; goto out; }
+ if (!fs_info->quota_root) + goto out; + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); @@ -1088,13 +1125,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) ret = btrfs_clean_quota_tree(trans, quota_root); if (ret) { btrfs_abort_transaction(trans, ret); - goto end_trans; + goto out; }
ret = btrfs_del_root(trans, "a_root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); - goto end_trans; + goto out; }
list_del("a_root->dirty_list); @@ -1108,10 +1145,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) free_extent_buffer(quota_root->commit_root); kfree(quota_root);
-end_trans: - ret = btrfs_end_transaction(trans); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (ret && trans) + btrfs_end_transaction(trans); + else if (trans) + ret = btrfs_end_transaction(trans); + return ret; }
On Fri, Aug 13, 2021 at 05:55:23PM +0800, Anand Jain wrote:
Patch 1-2 and 5 helps to fix the conflicts smoothly. Patch 3-4 and 6 fixes the regression as reported by the test case btrfs/153 and comes from the below patch-set (btrfs: qgroup: Fix the long-existing regression of btrfs/153) Patch 7 fixes lockdep Warning as in the commit log now reported by the test case btrfs/153 on 5.4.y
All queued up, thanks!
greg k-h
linux-stable-mirror@lists.linaro.org