Hi Greg and stable team,
Here's a backport of relocation fixes that went into 5.16 aimed at the 5.15.x series of stable kernels. It's a problem people are currently running into when using btrfs on a zoned block device.
The following patches have been backported: 960a3166aed0 ("btrfs: zoned: allow preallocation for relocation inodes") 2adada886b26 ("btrfs: check for relocation inodes on zoned btrfs in should_nocow") e6d261e3b1f7 ("btrfs: zoned: use regular writes for relocation") 35156d852762 ("btrfs: zoned: only allow one process to add pages to a relocation inode") c2707a255623 ("btrfs: zoned: add a dedicated data relocation block group") 37f00a6d2e9c ("btrfs: introduce btrfs_is_data_reloc_root")
The backport has seen the usual regression testing with xfstests.
Johannes Thumshirn (6): btrfs: introduce btrfs_is_data_reloc_root btrfs: zoned: add a dedicated data relocation block group btrfs: zoned: only allow one process to add pages to a relocation inode btrfs: zoned: use regular writes for relocation btrfs: check for relocation inodes on zoned btrfs in should_nocow btrfs: zoned: allow preallocation for relocation inodes
fs/btrfs/block-group.c | 1 + fs/btrfs/ctree.h | 12 +++++++++ fs/btrfs/disk-io.c | 3 ++- fs/btrfs/extent-tree.c | 56 +++++++++++++++++++++++++++++++++++++++--- fs/btrfs/extent_io.c | 11 +++++++++ fs/btrfs/inode.c | 29 +++++++++++++--------- fs/btrfs/relocation.c | 38 +++------------------------- fs/btrfs/zoned.c | 21 ++++++++++++++++ fs/btrfs/zoned.h | 3 +++ 9 files changed, 123 insertions(+), 51 deletions(-)
commit 37f00a6d2e9c97d6e7b5c3d47c49b714c3d0b99f upstream
There are several places in our codebase where we check if a root is the root of the data reloc tree and subsequent patches will introduce more.
Factor out the check into a small helper function instead of open coding it multiple times.
Reviewed-by: Naohiro Aota naohiro.aota@wdc.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/inode.c | 19 ++++++++----------- fs/btrfs/relocation.c | 3 +-- 5 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c0cebcf745ce..5b990881052c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3842,6 +3842,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zoned != 0; }
+static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) +{ + return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; +} + /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6965ed081346..f63d8a7e6dae 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1500,7 +1500,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) goto fail;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + !btrfs_is_data_reloc_root(root)) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0ab456cb4bf8..45e020c9fdc9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2376,7 +2376,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
out: btrfs_free_path(path); - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7c096ab9bb5e..85663bccde8a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1151,7 +1151,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * fails during the stage where it updates the bytenr of file extent * items. */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) min_alloc_size = num_bytes; else min_alloc_size = fs_info->sectorsize; @@ -1187,8 +1187,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_drop_extent_cache;
- if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_is_data_reloc_root(root)) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); /* @@ -1504,8 +1503,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, int *page_started, unsigned long *nr_written) { const bool is_space_ino = btrfs_is_free_space_inode(inode); - const bool is_reloc_ino = (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID); + const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; @@ -1867,8 +1865,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, btrfs_dec_nocow_writers(fs_info, disk_bytenr); nocow = false;
- if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent * extent_clear_unlock_delalloc() in error handler @@ -2207,7 +2204,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (btrfs_is_testing(fs_info)) return;
- if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2532,7 +2529,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, goto mapit; } else if (async && !skip_sum) { /* csum items have already been cloned */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, @@ -3304,7 +3301,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, u64 file_offset = pg_off + page_offset(page); int ret;
- if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + if (btrfs_is_data_reloc_root(root) && test_range_bit(io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM, 1, NULL)) { @@ -4005,7 +4002,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * without delay */ if (!btrfs_is_free_space_inode(inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && !btrfs_is_data_reloc_root(root) && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 914d403b4415..2e4f109723af 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4386,8 +4386,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, if (!rc) return 0;
- BUG_ON(rc->stage == UPDATE_DATA_PTRS && - root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <=
commit c2707a25562343511bf9a3a6a636a16a822204eb upstream
Relocation in a zoned filesystem can fail with a transaction abort with error -22 (EINVAL). This happens because the relocation code assumes that the extents we relocated the data to have the same size the source extents had and ensures this by preallocating the extents.
But in a zoned filesystem we currently can't preallocate the extents as this would break the sequential write required rule. Therefore it can happen that the writeback process kicks in while we're still adding pages to a delalloc range and starts writing out dirty pages.
This then creates destination extents that are smaller than the source extents, triggering the following safety check in get_new_location():
1034 if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { 1035 ret = -EINVAL; 1036 goto out; 1037 }
Temporarily create a dedicated block group for the relocation process, so no non-relocation data writes can interfere with the relocation writes.
This is needed that we can switch the relocation process on a zoned filesystem from the REQ_OP_ZONE_APPEND writing we use for data to a scheme like in a non-zoned filesystem using REQ_OP_WRITE and preallocation.
Fixes: 32430c614844 ("btrfs: zoned: enable relocation on a zoned filesystem") Reviewed-by: Naohiro Aota naohiro.aota@wdc.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com --- fs/btrfs/block-group.c | 1 + fs/btrfs/ctree.h | 7 ++++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 54 ++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/zoned.c | 10 ++++++++ fs/btrfs/zoned.h | 3 +++ 6 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a3b830b8410a..a53ebc52bd51 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -902,6 +902,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cluster->refill_lock);
btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group);
path = btrfs_alloc_path(); if (!path) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5b990881052c..ae06ad559353 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1017,6 +1017,13 @@ struct btrfs_fs_info { spinlock_t treelog_bg_lock; u64 treelog_bg;
+ /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f63d8a7e6dae..e00c4c1f622f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2883,6 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); + spin_lock_init(&fs_info->relocation_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reclaim_bgs_lock); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 45e020c9fdc9..87c23c5c0f26 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3495,6 +3495,9 @@ struct find_free_extent_ctl { /* Allocation is called for tree-log */ bool for_treelog;
+ /* Allocation is called for data relocation */ + bool for_data_reloc; + /* RAID index, converted from flags */ int index;
@@ -3756,6 +3759,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, u64 avail; u64 bytenr = block_group->start; u64 log_bytenr; + u64 data_reloc_bytenr; int ret = 0; bool skip;
@@ -3773,13 +3777,31 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, if (skip) return 1;
+ /* + * Do not allow non-relocation blocks in the dedicated relocation block + * group, and vice versa. + */ + spin_lock(&fs_info->relocation_bg_lock); + data_reloc_bytenr = fs_info->data_reloc_bg; + if (data_reloc_bytenr && + ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || + (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) + skip = true; + spin_unlock(&fs_info->relocation_bg_lock); + if (skip) + return 1; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); spin_lock(&fs_info->treelog_bg_lock); + spin_lock(&fs_info->relocation_bg_lock);
ASSERT(!ffe_ctl->for_treelog || block_group->start == fs_info->treelog_bg || fs_info->treelog_bg == 0); + ASSERT(!ffe_ctl->for_data_reloc || + block_group->start == fs_info->data_reloc_bg || + fs_info->data_reloc_bg == 0);
if (block_group->ro) { ret = 1; @@ -3796,6 +3818,16 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, goto out; }
+ /* + * Do not allow currently used block group to be the data relocation + * dedicated block group. + */ + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + avail = block_group->length - block_group->alloc_offset; if (avail < num_bytes) { if (ffe_ctl->max_extent_size < avail) { @@ -3813,6 +3845,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start;
+ if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; spin_lock(&ctl->tree_lock); @@ -3829,6 +3864,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; + if (ret && ffe_ctl->for_data_reloc) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); @@ -4085,6 +4123,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); } + if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } return 0; default: BUG(); @@ -4129,6 +4173,8 @@ static noinline int find_free_extent(struct btrfs_root *root, struct btrfs_space_info *space_info; bool full_search = false; bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_data_reloc = (btrfs_is_data_reloc_root(root) && + flags & BTRFS_BLOCK_GROUP_DATA);
WARN_ON(num_bytes < fs_info->sectorsize);
@@ -4143,6 +4189,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl.found_offset = 0; ffe_ctl.hint_byte = hint_byte_orig; ffe_ctl.for_treelog = for_treelog; + ffe_ctl.for_data_reloc = for_data_reloc; ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
/* For clustered allocation */ @@ -4220,6 +4267,8 @@ static noinline int find_free_extent(struct btrfs_root *root, if (unlikely(block_group->ro)) { if (for_treelog) btrfs_clear_treelog_bg(block_group); + if (ffe_ctl.for_data_reloc) + btrfs_clear_data_reloc_bg(block_group); continue; }
@@ -4408,6 +4457,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 flags; int ret; bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);
flags = get_alloc_profile_by_root(root, is_data); again: @@ -4431,8 +4481,8 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu tree-log %d", - flags, num_bytes, for_treelog); + "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", + flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 47af1ab3bf12..18ee85e1c9a2 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1530,3 +1530,13 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
return device; } + +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg == bg->start) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 4b299705bb12..70b3be517599 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -66,6 +66,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -199,6 +200,8 @@ static inline struct btrfs_device *btrfs_zoned_get_device( return ERR_PTR(-EOPNOTSUPP); }
+static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } + #endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
commit 35156d852762b58855f513b4f8bb7f32d69dc9c5 upstream
Don't allow more than one process to add pages to a relocation inode on a zoned filesystem, otherwise we cannot guarantee the sequential write rule once we're filling preallocated extents on a zoned filesystem.
Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com --- fs/btrfs/extent_io.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aaddd7225348..a40fb9c74dda 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5120,6 +5120,9 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct inode *inode = mapping->host; + const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root); + const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info); int ret = 0; struct extent_page_data epd = { .bio_ctrl = { 0 }, @@ -5127,7 +5130,15 @@ int extent_writepages(struct address_space *mapping, .sync_io = wbc->sync_mode == WB_SYNC_ALL, };
+ /* + * Allow only a single thread to do the reloc work in zoned mode to + * protect the write pointer updates. + */ + if (data_reloc && zoned) + btrfs_inode_lock(inode, 0); ret = extent_write_cache_pages(mapping, wbc, &epd); + if (data_reloc && zoned) + btrfs_inode_unlock(inode, 0); ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret);
commit e6d261e3b1f777b499ce8f535ed44dd1b69278b7 upstream
Now that we have a dedicated block group for relocation, we can use REQ_OP_WRITE instead of REQ_OP_ZONE_APPEND for writing out the data on relocation.
Reviewed-by: Naohiro Aota naohiro.aota@wdc.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com --- fs/btrfs/zoned.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 18ee85e1c9a2..5672c24a2d58 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1304,6 +1304,17 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!is_data_inode(&inode->vfs_inode)) return false;
+ /* + * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * extent layout the relocation code has. + * Furthermore we have set aside own block-group from which only the + * relocation "process" can allocate and make sure only one process at a + * time can add pages to an extent that gets relocated, so it's safe to + * use regular REQ_OP_WRITE for this special case. + */ + if (btrfs_is_data_reloc_root(inode->root)) + return false; + cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); if (!cache)
commit 2adada886b26e998b5a624e72f0834ebfdc54cc7 upstream
Prepare for allowing preallocation for relocation inodes.
Reviewed-by: Naohiro Aota naohiro.aota@wdc.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com --- fs/btrfs/inode.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 85663bccde8a..61b4651f008d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1945,7 +1945,15 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page const bool zoned = btrfs_is_zoned(inode->root->fs_info);
if (should_nocow(inode, start, end)) { - ASSERT(!zoned); + /* + * Normally on a zoned device we're only doing COW writes, but + * in case of relocation on a zoned filesystem we have taken + * precaution, that we're only writing sequentially. It's safe + * to use run_delalloc_nocow() here, like for regular + * preallocated inodes. + */ + ASSERT(!zoned || + (zoned && btrfs_is_data_reloc_root(inode->root))); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); } else if (!inode_can_compress(inode) ||
commit 960a3166aed015887cd54423a6589ae4d0b65bd5 upstream
Now that we use a dedicated block group and regular writes for data relocation, we can preallocate the space needed for a relocated inode, just like we do in regular mode.
Essentially this reverts commit 32430c614844 ("btrfs: zoned: enable relocation on a zoned filesystem") as it is not needed anymore.
Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com --- fs/btrfs/relocation.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2e4f109723af..d81bee621d37 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2852,31 +2852,6 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret;
- /* - * On a zoned filesystem, we cannot preallocate the file region. - * Instead, we dirty and fiemap_write the region. - */ - if (btrfs_is_zoned(inode->root->fs_info)) { - struct btrfs_root *root = inode->root; - struct btrfs_trans_handle *trans; - - end = cluster->end - offset + 1; - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - i_size_write(&inode->vfs_inode, end); - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - return btrfs_end_transaction(trans); - } - btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; @@ -3084,7 +3059,6 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, static int relocate_file_extent_cluster(struct inode *inode, struct file_extent_cluster *cluster) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; @@ -3114,8 +3088,6 @@ static int relocate_file_extent_cluster(struct inode *inode, for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); - if (btrfs_is_zoned(fs_info) && !ret) - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3770,12 +3742,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_inode_item *item; struct extent_buffer *leaf; - u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret;
- if (btrfs_is_zoned(trans->fs_info)) - flags &= ~BTRFS_INODE_PREALLOC; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3790,7 +3758,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path);
On Thu, Nov 18, 2021 at 05:58:12PM +0900, Johannes Thumshirn wrote:
Hi Greg and stable team,
Here's a backport of relocation fixes that went into 5.16 aimed at the 5.15.x series of stable kernels. It's a problem people are currently running into when using btrfs on a zoned block device.
The following patches have been backported: 960a3166aed0 ("btrfs: zoned: allow preallocation for relocation inodes") 2adada886b26 ("btrfs: check for relocation inodes on zoned btrfs in should_nocow") e6d261e3b1f7 ("btrfs: zoned: use regular writes for relocation") 35156d852762 ("btrfs: zoned: only allow one process to add pages to a relocation inode") c2707a255623 ("btrfs: zoned: add a dedicated data relocation block group") 37f00a6d2e9c ("btrfs: introduce btrfs_is_data_reloc_root")
The backport has seen the usual regression testing with xfstests.
Now queued up, thanks.
greg k-h
linux-stable-mirror@lists.linaro.org