April 2022 - Linux-stable-mirror

FAILED: patch "[PATCH] btrfs: remove device item and update super block in the same" failed to apply to 4.19-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 4.19-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From bbac58698a55cc0a6f0c0d69a6dcd3f9f3134c11 Mon Sep 17 00:00:00 2001 From: Qu Wenruo <wqu(a)suse.com> Date: Tue, 8 Mar 2022 13:36:38 +0800 Subject: [PATCH] btrfs: remove device item and update super block in the same transaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a report that a btrfs has a bad super block num devices. This makes btrfs to reject the fs completely. BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here BTRFS error (device sdd3): failed to read chunk tree: -22 BTRFS error (device sdd3): open_ctree failed [CAUSE] During btrfs device removal, chunk tree and super block num devs are updated in two different transactions: btrfs_rm_device() |- btrfs_rm_dev_item(device) | |- trans = btrfs_start_transaction() | | Now we got transaction X | | | |- btrfs_del_item() | | Now device item is removed from chunk tree | | | |- btrfs_commit_transaction() | Transaction X got committed, super num devs untouched, | but device item removed from chunk tree. | (AKA, super num devs is already incorrect) | |- cur_devices->num_devices--; |- cur_devices->total_devices--; |- btrfs_set_super_num_devices() All those operations are not in transaction X, thus it will only be written back to disk in next transaction. So after the transaction X in btrfs_rm_dev_item() committed, but before transaction X+1 (which can be minutes away), a power loss happen, then we got the super num mismatch. [FIX] Instead of starting and committing a transaction inside btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and pass it to btrfs_rm_dev_item(). And only commit the transaction after everything is done. Reported-by: Luca Béla Palkovics <luca.bela.palkovics(a)gmail.com> Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46C… CC: stable(a)vger.kernel.org # 4.14+ Reviewed-by: Anand Jain <anand.jain(a)oracle.com> Signed-off-by: Qu Wenruo <wqu(a)suse.com> Signed-off-by: David Sterba <dsterba(a)suse.com> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1be7cb2f955f..2cfbc74a3b4e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path) path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_trans_handle *trans; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) if (ret) { if (ret > 0) ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - } - out: btrfs_free_path(path); - if (!ret) - ret = btrfs_commit_transaction(trans); return ret; } @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode) { + struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) - goto out; + return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; - goto out; + return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", rcu_str_deref(device->name), device->devid); - ret = -ETXTBSY; - goto out; + return -ETXTBSY; } - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto out; - } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - fs_info->fs_devices->rw_devices == 1) { - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto out; - } + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (ret) goto error_undo; - /* - * TODO: the superblock still includes this device in its num_devices - * counter although write_all_supers() is not locked out. This - * could give a filesystem state which requires a degraded mount. - */ - ret = btrfs_rm_dev_item(device); - if (ret) + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } -out: + ret = btrfs_commit_transaction(trans); + return ret; error_undo: @@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } - goto out; + return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)

3 years, 3 months

1
0
0 0

FAILED: patch "[PATCH] btrfs: remove device item and update super block in the same" failed to apply to 5.4-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.4-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From bbac58698a55cc0a6f0c0d69a6dcd3f9f3134c11 Mon Sep 17 00:00:00 2001 From: Qu Wenruo <wqu(a)suse.com> Date: Tue, 8 Mar 2022 13:36:38 +0800 Subject: [PATCH] btrfs: remove device item and update super block in the same transaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a report that a btrfs has a bad super block num devices. This makes btrfs to reject the fs completely. BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here BTRFS error (device sdd3): failed to read chunk tree: -22 BTRFS error (device sdd3): open_ctree failed [CAUSE] During btrfs device removal, chunk tree and super block num devs are updated in two different transactions: btrfs_rm_device() |- btrfs_rm_dev_item(device) | |- trans = btrfs_start_transaction() | | Now we got transaction X | | | |- btrfs_del_item() | | Now device item is removed from chunk tree | | | |- btrfs_commit_transaction() | Transaction X got committed, super num devs untouched, | but device item removed from chunk tree. | (AKA, super num devs is already incorrect) | |- cur_devices->num_devices--; |- cur_devices->total_devices--; |- btrfs_set_super_num_devices() All those operations are not in transaction X, thus it will only be written back to disk in next transaction. So after the transaction X in btrfs_rm_dev_item() committed, but before transaction X+1 (which can be minutes away), a power loss happen, then we got the super num mismatch. [FIX] Instead of starting and committing a transaction inside btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and pass it to btrfs_rm_dev_item(). And only commit the transaction after everything is done. Reported-by: Luca Béla Palkovics <luca.bela.palkovics(a)gmail.com> Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46C… CC: stable(a)vger.kernel.org # 4.14+ Reviewed-by: Anand Jain <anand.jain(a)oracle.com> Signed-off-by: Qu Wenruo <wqu(a)suse.com> Signed-off-by: David Sterba <dsterba(a)suse.com> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1be7cb2f955f..2cfbc74a3b4e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path) path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_trans_handle *trans; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) if (ret) { if (ret > 0) ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - } - out: btrfs_free_path(path); - if (!ret) - ret = btrfs_commit_transaction(trans); return ret; } @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode) { + struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) - goto out; + return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; - goto out; + return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", rcu_str_deref(device->name), device->devid); - ret = -ETXTBSY; - goto out; + return -ETXTBSY; } - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto out; - } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - fs_info->fs_devices->rw_devices == 1) { - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto out; - } + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (ret) goto error_undo; - /* - * TODO: the superblock still includes this device in its num_devices - * counter although write_all_supers() is not locked out. This - * could give a filesystem state which requires a degraded mount. - */ - ret = btrfs_rm_dev_item(device); - if (ret) + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } -out: + ret = btrfs_commit_transaction(trans); + return ret; error_undo: @@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } - goto out; + return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)

3 years, 3 months

1
0
0 0

FAILED: patch "[PATCH] btrfs: remove device item and update super block in the same" failed to apply to 5.10-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.10-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From bbac58698a55cc0a6f0c0d69a6dcd3f9f3134c11 Mon Sep 17 00:00:00 2001 From: Qu Wenruo <wqu(a)suse.com> Date: Tue, 8 Mar 2022 13:36:38 +0800 Subject: [PATCH] btrfs: remove device item and update super block in the same transaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a report that a btrfs has a bad super block num devices. This makes btrfs to reject the fs completely. BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here BTRFS error (device sdd3): failed to read chunk tree: -22 BTRFS error (device sdd3): open_ctree failed [CAUSE] During btrfs device removal, chunk tree and super block num devs are updated in two different transactions: btrfs_rm_device() |- btrfs_rm_dev_item(device) | |- trans = btrfs_start_transaction() | | Now we got transaction X | | | |- btrfs_del_item() | | Now device item is removed from chunk tree | | | |- btrfs_commit_transaction() | Transaction X got committed, super num devs untouched, | but device item removed from chunk tree. | (AKA, super num devs is already incorrect) | |- cur_devices->num_devices--; |- cur_devices->total_devices--; |- btrfs_set_super_num_devices() All those operations are not in transaction X, thus it will only be written back to disk in next transaction. So after the transaction X in btrfs_rm_dev_item() committed, but before transaction X+1 (which can be minutes away), a power loss happen, then we got the super num mismatch. [FIX] Instead of starting and committing a transaction inside btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and pass it to btrfs_rm_dev_item(). And only commit the transaction after everything is done. Reported-by: Luca Béla Palkovics <luca.bela.palkovics(a)gmail.com> Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46C… CC: stable(a)vger.kernel.org # 4.14+ Reviewed-by: Anand Jain <anand.jain(a)oracle.com> Signed-off-by: Qu Wenruo <wqu(a)suse.com> Signed-off-by: David Sterba <dsterba(a)suse.com> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1be7cb2f955f..2cfbc74a3b4e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path) path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_trans_handle *trans; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) if (ret) { if (ret > 0) ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - } - out: btrfs_free_path(path); - if (!ret) - ret = btrfs_commit_transaction(trans); return ret; } @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode) { + struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) - goto out; + return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; - goto out; + return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", rcu_str_deref(device->name), device->devid); - ret = -ETXTBSY; - goto out; + return -ETXTBSY; } - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto out; - } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - fs_info->fs_devices->rw_devices == 1) { - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto out; - } + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (ret) goto error_undo; - /* - * TODO: the superblock still includes this device in its num_devices - * counter although write_all_supers() is not locked out. This - * could give a filesystem state which requires a degraded mount. - */ - ret = btrfs_rm_dev_item(device); - if (ret) + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } -out: + ret = btrfs_commit_transaction(trans); + return ret; error_undo: @@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } - goto out; + return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)

3 years, 3 months

1
0
0 0

FAILED: patch "[PATCH] btrfs: remove device item and update super block in the same" failed to apply to 5.15-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.15-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From bbac58698a55cc0a6f0c0d69a6dcd3f9f3134c11 Mon Sep 17 00:00:00 2001 From: Qu Wenruo <wqu(a)suse.com> Date: Tue, 8 Mar 2022 13:36:38 +0800 Subject: [PATCH] btrfs: remove device item and update super block in the same transaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [BUG] There is a report that a btrfs has a bad super block num devices. This makes btrfs to reject the fs completely. BTRFS error (device sdd3): super_num_devices 3 mismatch with num_devices 2 found here BTRFS error (device sdd3): failed to read chunk tree: -22 BTRFS error (device sdd3): open_ctree failed [CAUSE] During btrfs device removal, chunk tree and super block num devs are updated in two different transactions: btrfs_rm_device() |- btrfs_rm_dev_item(device) | |- trans = btrfs_start_transaction() | | Now we got transaction X | | | |- btrfs_del_item() | | Now device item is removed from chunk tree | | | |- btrfs_commit_transaction() | Transaction X got committed, super num devs untouched, | but device item removed from chunk tree. | (AKA, super num devs is already incorrect) | |- cur_devices->num_devices--; |- cur_devices->total_devices--; |- btrfs_set_super_num_devices() All those operations are not in transaction X, thus it will only be written back to disk in next transaction. So after the transaction X in btrfs_rm_dev_item() committed, but before transaction X+1 (which can be minutes away), a power loss happen, then we got the super num mismatch. [FIX] Instead of starting and committing a transaction inside btrfs_rm_dev_item(), start a transaction in side btrfs_rm_device() and pass it to btrfs_rm_dev_item(). And only commit the transaction after everything is done. Reported-by: Luca Béla Palkovics <luca.bela.palkovics(a)gmail.com> Link: https://lore.kernel.org/linux-btrfs/CA+8xDSpvdm_U0QLBAnrH=zqDq_cWCOH5TiV46C… CC: stable(a)vger.kernel.org # 4.14+ Reviewed-by: Anand Jain <anand.jain(a)oracle.com> Signed-off-by: Qu Wenruo <wqu(a)suse.com> Signed-off-by: David Sterba <dsterba(a)suse.com> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1be7cb2f955f..2cfbc74a3b4e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path) path_put(&path); } -static int btrfs_rm_dev_item(struct btrfs_device *device) +static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, + struct btrfs_device *device) { struct btrfs_root *root = device->fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_trans_handle *trans; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - btrfs_free_path(path); - return PTR_ERR(trans); - } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) if (ret) { if (ret > 0) ret = -ENOENT; - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - } - out: btrfs_free_path(path); - if (!ret) - ret = btrfs_commit_transaction(trans); return ret; } @@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode) { + struct btrfs_trans_handle *trans; struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) - goto out; + return ret; device = btrfs_find_device(fs_info->fs_devices, args); if (!device) { @@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = -ENOENT; - goto out; + return ret; } if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", rcu_str_deref(device->name), device->devid); - ret = -ETXTBSY; - goto out; + return -ETXTBSY; } - if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { - ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto out; - } + if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + return BTRFS_ERROR_DEV_TGT_REPLACE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && - fs_info->fs_devices->rw_devices == 1) { - ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto out; - } + fs_info->fs_devices->rw_devices == 1) + return BTRFS_ERROR_DEV_ONLY_WRITABLE; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); @@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (ret) goto error_undo; - /* - * TODO: the superblock still includes this device in its num_devices - * counter although write_all_supers() is not locked out. This - * could give a filesystem state which requires a degraded mount. - */ - ret = btrfs_rm_dev_item(device); - if (ret) + trans = btrfs_start_transaction(fs_info->chunk_root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto error_undo; + } + + ret = btrfs_rm_dev_item(trans, device); + if (ret) { + /* Any error in dev item removal is critical */ + btrfs_crit(fs_info, + "failed to remove device item for devid %llu: %d", + device->devid, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + return ret; + } clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); btrfs_scrub_cancel_dev(device); @@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, free_fs_devices(cur_devices); } -out: + ret = btrfs_commit_transaction(trans); + return ret; error_undo: @@ -2240,7 +2231,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, device->fs_devices->rw_devices++; mutex_unlock(&fs_info->chunk_mutex); } - goto out; + return ret; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)

3 years, 3 months

1
0
0 0

Queue up 5.10 io_uring backport

by Jens Axboe

Hi, Can you queue up this one for 5.10 stable? It has an extra hunk that's needed for 5.10. 5.15/16/17 will be a direct port of the 5.18-rc patch. commit e677edbcabee849bfdd43f1602bccbecf736a646 Author: Jens Axboe <axboe(a)kernel.dk> Date: Fri Apr 8 11:08:58 2022 -0600 io_uring: fix race between timeout flush and removal commit e677edbcabee849bfdd43f1602bccbecf736a646 upstream. io_flush_timeouts() assumes the timeout isn't in progress of triggering or being removed/canceled, so it unconditionally removes it from the timeout list and attempts to cancel it. Leave it on the list and let the normal timeout cancelation take care of it. Cc: stable(a)vger.kernel.org # 5.5+ Signed-off-by: Jens Axboe <axboe(a)kernel.dk> diff --git a/fs/io_uring.c b/fs/io_uring.c index 5959b0359524..bc6bf17566f8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1556,6 +1556,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) static void io_flush_timeouts(struct io_ring_ctx *ctx) { + struct io_kiocb *req, *tmp; u32 seq; if (list_empty(&ctx->timeout_list)) @@ -1563,10 +1564,8 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - do { + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { u32 events_needed, events_got; - struct io_kiocb *req = list_first_entry(&ctx->timeout_list, - struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; @@ -1583,9 +1582,8 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - list_del_init(&req->timeout.list); io_kill_timeout(req, 0); - } while (!list_empty(&ctx->timeout_list)); + } ctx->cq_last_tm_flush = seq; } @@ -5639,6 +5637,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, else data->mode = HRTIMER_MODE_REL; + INIT_LIST_HEAD(&req->timeout.list); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); return 0; } @@ -6282,12 +6281,12 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!list_empty(&req->link_list)) { prev = list_entry(req->link_list.prev, struct io_kiocb, link_list); - if (refcount_inc_not_zero(&prev->refs)) - list_del_init(&req->link_list); - else + list_del_init(&req->link_list); + if (!refcount_inc_not_zero(&prev->refs)) prev = NULL; } + list_del(&req->timeout.list); spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { -- Jens Axboe

3 years, 3 months

2
2
0 0

FAILED: patch "[PATCH] io_uring: move read/write file prep state into actual opcode" failed to apply to 5.16-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.16-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From 584b0180f0f4d67d7145950fe68c625f06c88b10 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe(a)kernel.dk> Date: Tue, 29 Mar 2022 10:48:05 -0600 Subject: [PATCH] io_uring: move read/write file prep state into actual opcode handler In preparation for not necessarily having a file assigned at prep time, defer any initialization associated with the file to when the opcode handler is run. Cc: stable(a)vger.kernel.org # v5.15+ Signed-off-by: Jens Axboe <axboe(a)kernel.dk> diff --git a/fs/io_uring.c b/fs/io_uring.c index 0152ef49cf46..969f65de9972 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -592,7 +592,8 @@ struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; u64 addr; - u64 len; + u32 len; + u32 flags; }; struct io_connect { @@ -3178,42 +3179,11 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req) static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw.kiocb; - struct file *file = req->file; unsigned ioprio; int ret; - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - kiocb->ki_pos = READ_ONCE(sqe->off); - kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); - if (unlikely(ret)) - return ret; - - /* - * If the file is marked O_NONBLOCK, still allow retry for it if it - * supports async. Otherwise it's impossible to use O_NONBLOCK files - * reliably. If not, or it IOCB_NOWAIT is set, don't retry. - */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) - req->flags |= REQ_F_NOWAIT; - - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) - return -EOPNOTSUPP; - - kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; - kiocb->ki_complete = io_complete_rw_iopoll; - req->iopoll_completed = 0; - } else { - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - kiocb->ki_complete = io_complete_rw; - } ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { @@ -3229,6 +3199,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = NULL; req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); + req->rw.flags = READ_ONCE(sqe->rw_flags); req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -3732,13 +3703,6 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) return 0; } -static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (unlikely(!(req->file->f_mode & FMODE_READ))) - return -EBADF; - return io_prep_rw(req, sqe); -} - /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. @@ -3826,6 +3790,49 @@ static bool need_read_all(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +{ + struct kiocb *kiocb = &req->rw.kiocb; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (unlikely(!file || !(file->f_mode & mode))) + return -EBADF; + + if (!io_req_ffs_set(req)) + req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + + kiocb->ki_flags = iocb_flags(file); + ret = kiocb_set_rw_flags(kiocb, req->rw.flags); + if (unlikely(ret)) + return ret; + + /* + * If the file is marked O_NONBLOCK, still allow retry for it if it + * supports async. Otherwise it's impossible to use O_NONBLOCK files + * reliably. If not, or it IOCB_NOWAIT is set, don't retry. + */ + if ((kiocb->ki_flags & IOCB_NOWAIT) || + ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) + req->flags |= REQ_F_NOWAIT; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) + return -EOPNOTSUPP; + + kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; + kiocb->ki_complete = io_complete_rw_iopoll; + req->iopoll_completed = 0; + } else { + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + kiocb->ki_complete = io_complete_rw; + } + + return 0; +} + static int io_read(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw_state __s, *s = &__s; @@ -3861,6 +3868,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } + ret = io_rw_init_file(req, FMODE_READ); + if (unlikely(ret)) + return ret; req->result = iov_iter_count(&s->iter); if (force_nonblock) { @@ -3964,13 +3974,6 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) - return -EBADF; - return io_prep_rw(req, sqe); -} - static int io_write(struct io_kiocb *req, unsigned int issue_flags) { struct io_rw_state __s, *s = &__s; @@ -3991,6 +3994,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } + ret = io_rw_init_file(req, FMODE_WRITE); + if (unlikely(ret)) + return ret; req->result = iov_iter_count(&s->iter); if (force_nonblock) { @@ -6987,11 +6993,10 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_READV: case IORING_OP_READ_FIXED: case IORING_OP_READ: - return io_read_prep(req, sqe); case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE: - return io_write_prep(req, sqe); + return io_prep_rw(req, sqe); case IORING_OP_POLL_ADD: return io_poll_add_prep(req, sqe); case IORING_OP_POLL_REMOVE:

3 years, 3 months

1
0
0 0

FAILED: patch "[PATCH] io_uring: defer file assignment" failed to apply to 5.17-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.17-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From 6bf9c47a398911e0ab920e362115153596c80432 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe(a)kernel.dk> Date: Tue, 29 Mar 2022 10:10:08 -0600 Subject: [PATCH] io_uring: defer file assignment If an application uses direct open or accept, it knows in advance what direct descriptor value it will get as it picks it itself. This allows combined requests such as: sqe = io_uring_get_sqe(ring); io_uring_prep_openat_direct(sqe, ..., file_slot); sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS; sqe = io_uring_get_sqe(ring); io_uring_prep_read(sqe,file_slot, buf, buf_size, 0); sqe->flags |= IOSQE_FIXED_FILE; io_uring_submit(ring); where we prepare both a file open and read, and only get a completion event for the read when both have completed successfully. Currently links are fully prepared before the head is issued, but that fails if the dependent link needs a file assigned that isn't valid until the head has completed. Conversely, if the same chain is performed but the fixed file slot is already valid, then we would be unexpectedly returning data from the old file slot rather than the newly opened one. Make sure we're consistent here. Allow deferral of file setup, which makes this documented case work. Cc: stable(a)vger.kernel.org # v5.15+ Signed-off-by: Jens Axboe <axboe(a)kernel.dk> diff --git a/fs/io-wq.h b/fs/io-wq.h index dbecd27656c7..04d374e65e54 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) struct io_wq_work { struct io_wq_work_node list; unsigned flags; + int fd; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 398128db9728..bdc090fec29c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7240,6 +7240,23 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } +static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +{ + if (req->file || !io_op_defs[req->opcode].needs_file) + return true; + + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->work.fd, issue_flags); + else + req->file = io_file_get_normal(req, req->work.fd); + if (req->file) + return true; + + req_set_fail(req); + req->result = -EBADF; + return false; +} + static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct cred *creds = NULL; @@ -7250,6 +7267,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!io_op_defs[req->opcode].audit_skip) audit_uring_entry(req->opcode); + if (unlikely(!io_assign_file(req, issue_flags))) + return -EBADF; switch (req->opcode) { case IORING_OP_NOP: @@ -7394,10 +7413,11 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + const struct io_op_def *def = &io_op_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED; bool needs_poll = false; struct io_kiocb *timeout; - int ret = 0; + int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) @@ -7409,14 +7429,18 @@ static void io_wq_submit_work(struct io_wq_work *work) if (timeout) io_queue_linked_timeout(timeout); + if (!io_assign_file(req, issue_flags)) { + err = -EBADF; + work->flags |= IO_WQ_WORK_CANCEL; + } + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { - io_req_task_queue_fail(req, -ECANCELED); + io_req_task_queue_fail(req, err); return; } if (req->flags & REQ_F_FORCE_ASYNC) { - const struct io_op_def *def = &io_op_defs[req->opcode]; bool opcode_poll = def->pollin || def->pollout; if (opcode_poll && file_can_poll(req->file)) { @@ -7749,6 +7773,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; + req->work.fd = READ_ONCE(sqe->fd); + /* * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. @@ -7758,13 +7784,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); } - - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, READ_ONCE(sqe->fd), 0); - else - req->file = io_file_get_normal(req, READ_ONCE(sqe->fd)); - if (unlikely(!req->file)) - return -EBADF; } personality = READ_ONCE(sqe->personality);

3 years, 3 months

1
0
0 0

FAILED: patch "[PATCH] io_uring: defer file assignment" failed to apply to 5.16-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.16-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From 6bf9c47a398911e0ab920e362115153596c80432 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe(a)kernel.dk> Date: Tue, 29 Mar 2022 10:10:08 -0600 Subject: [PATCH] io_uring: defer file assignment If an application uses direct open or accept, it knows in advance what direct descriptor value it will get as it picks it itself. This allows combined requests such as: sqe = io_uring_get_sqe(ring); io_uring_prep_openat_direct(sqe, ..., file_slot); sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS; sqe = io_uring_get_sqe(ring); io_uring_prep_read(sqe,file_slot, buf, buf_size, 0); sqe->flags |= IOSQE_FIXED_FILE; io_uring_submit(ring); where we prepare both a file open and read, and only get a completion event for the read when both have completed successfully. Currently links are fully prepared before the head is issued, but that fails if the dependent link needs a file assigned that isn't valid until the head has completed. Conversely, if the same chain is performed but the fixed file slot is already valid, then we would be unexpectedly returning data from the old file slot rather than the newly opened one. Make sure we're consistent here. Allow deferral of file setup, which makes this documented case work. Cc: stable(a)vger.kernel.org # v5.15+ Signed-off-by: Jens Axboe <axboe(a)kernel.dk> diff --git a/fs/io-wq.h b/fs/io-wq.h index dbecd27656c7..04d374e65e54 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) struct io_wq_work { struct io_wq_work_node list; unsigned flags; + int fd; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 398128db9728..bdc090fec29c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7240,6 +7240,23 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } +static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +{ + if (req->file || !io_op_defs[req->opcode].needs_file) + return true; + + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->work.fd, issue_flags); + else + req->file = io_file_get_normal(req, req->work.fd); + if (req->file) + return true; + + req_set_fail(req); + req->result = -EBADF; + return false; +} + static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct cred *creds = NULL; @@ -7250,6 +7267,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!io_op_defs[req->opcode].audit_skip) audit_uring_entry(req->opcode); + if (unlikely(!io_assign_file(req, issue_flags))) + return -EBADF; switch (req->opcode) { case IORING_OP_NOP: @@ -7394,10 +7413,11 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + const struct io_op_def *def = &io_op_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED; bool needs_poll = false; struct io_kiocb *timeout; - int ret = 0; + int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) @@ -7409,14 +7429,18 @@ static void io_wq_submit_work(struct io_wq_work *work) if (timeout) io_queue_linked_timeout(timeout); + if (!io_assign_file(req, issue_flags)) { + err = -EBADF; + work->flags |= IO_WQ_WORK_CANCEL; + } + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { - io_req_task_queue_fail(req, -ECANCELED); + io_req_task_queue_fail(req, err); return; } if (req->flags & REQ_F_FORCE_ASYNC) { - const struct io_op_def *def = &io_op_defs[req->opcode]; bool opcode_poll = def->pollin || def->pollout; if (opcode_poll && file_can_poll(req->file)) { @@ -7749,6 +7773,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; + req->work.fd = READ_ONCE(sqe->fd); + /* * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. @@ -7758,13 +7784,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); } - - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, READ_ONCE(sqe->fd), 0); - else - req->file = io_file_get_normal(req, READ_ONCE(sqe->fd)); - if (unlikely(!req->file)) - return -EBADF; } personality = READ_ONCE(sqe->personality);

3 years, 3 months

1
0
0 0

FAILED: patch "[PATCH] io_uring: defer file assignment" failed to apply to 5.15-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 5.15-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ From 6bf9c47a398911e0ab920e362115153596c80432 Mon Sep 17 00:00:00 2001 From: Jens Axboe <axboe(a)kernel.dk> Date: Tue, 29 Mar 2022 10:10:08 -0600 Subject: [PATCH] io_uring: defer file assignment If an application uses direct open or accept, it knows in advance what direct descriptor value it will get as it picks it itself. This allows combined requests such as: sqe = io_uring_get_sqe(ring); io_uring_prep_openat_direct(sqe, ..., file_slot); sqe->flags |= IOSQE_IO_LINK | IOSQE_CQE_SKIP_SUCCESS; sqe = io_uring_get_sqe(ring); io_uring_prep_read(sqe,file_slot, buf, buf_size, 0); sqe->flags |= IOSQE_FIXED_FILE; io_uring_submit(ring); where we prepare both a file open and read, and only get a completion event for the read when both have completed successfully. Currently links are fully prepared before the head is issued, but that fails if the dependent link needs a file assigned that isn't valid until the head has completed. Conversely, if the same chain is performed but the fixed file slot is already valid, then we would be unexpectedly returning data from the old file slot rather than the newly opened one. Make sure we're consistent here. Allow deferral of file setup, which makes this documented case work. Cc: stable(a)vger.kernel.org # v5.15+ Signed-off-by: Jens Axboe <axboe(a)kernel.dk> diff --git a/fs/io-wq.h b/fs/io-wq.h index dbecd27656c7..04d374e65e54 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) struct io_wq_work { struct io_wq_work_node list; unsigned flags; + int fd; }; static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) diff --git a/fs/io_uring.c b/fs/io_uring.c index 398128db9728..bdc090fec29c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7240,6 +7240,23 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } +static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) +{ + if (req->file || !io_op_defs[req->opcode].needs_file) + return true; + + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, req->work.fd, issue_flags); + else + req->file = io_file_get_normal(req, req->work.fd); + if (req->file) + return true; + + req_set_fail(req); + req->result = -EBADF; + return false; +} + static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct cred *creds = NULL; @@ -7250,6 +7267,8 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!io_op_defs[req->opcode].audit_skip) audit_uring_entry(req->opcode); + if (unlikely(!io_assign_file(req, issue_flags))) + return -EBADF; switch (req->opcode) { case IORING_OP_NOP: @@ -7394,10 +7413,11 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + const struct io_op_def *def = &io_op_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED; bool needs_poll = false; struct io_kiocb *timeout; - int ret = 0; + int ret = 0, err = -ECANCELED; /* one will be dropped by ->io_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) @@ -7409,14 +7429,18 @@ static void io_wq_submit_work(struct io_wq_work *work) if (timeout) io_queue_linked_timeout(timeout); + if (!io_assign_file(req, issue_flags)) { + err = -EBADF; + work->flags |= IO_WQ_WORK_CANCEL; + } + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { - io_req_task_queue_fail(req, -ECANCELED); + io_req_task_queue_fail(req, err); return; } if (req->flags & REQ_F_FORCE_ASYNC) { - const struct io_op_def *def = &io_op_defs[req->opcode]; bool opcode_poll = def->pollin || def->pollout; if (opcode_poll && file_can_poll(req->file)) { @@ -7749,6 +7773,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (io_op_defs[opcode].needs_file) { struct io_submit_state *state = &ctx->submit_state; + req->work.fd = READ_ONCE(sqe->fd); + /* * Plug now if we have more than 2 IO left after this, and the * target is potentially a read/write to block based storage. @@ -7758,13 +7784,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, state->need_plug = false; blk_start_plug_nr_ios(&state->plug, state->submit_nr); } - - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, READ_ONCE(sqe->fd), 0); - else - req->file = io_file_get_normal(req, READ_ONCE(sqe->fd)); - if (unlikely(!req->file)) - return -EBADF; } personality = READ_ONCE(sqe->personality);

3 years, 3 months

1
0
0 0

[PATCH 1/4] btrfs: avoid double clean up when submit_one_bio() failed

by Qu Wenruo

[BUG] When running generic/475 with 64K page size and 4K sector size, it has a very high chance (almost 100%) to hang, with mostly data page locked but no one is going to unlock it. [CAUSE] With commit 1784b7d502a9 ("btrfs: handle csum lookup errors properly on reads"), if we failed to lookup checksum due to metadata IO error, we will return error for btrfs_submit_data_bio(). This will cause the page to be unlocked twice in btrfs_do_readpage(): btrfs_do_readpage() |- submit_extent_page() | |- submit_one_bio() | |- btrfs_submit_data_bio() | |- if (ret) { | |- bio->bi_status = ret; | |- bio_endio(bio); } | In the endio function, we will call end_page_read() | and unlock_extent() to cleanup the subpage range. | |- if (ret) { |- unlock_extent(); end_page_read() } Here we unlock the extent and cleanup the subpage range again. For unlock_extent(), it's mostly double unlock safe. But for end_page_read(), it's not, especially for subpage case, as for subpage case we will call btrfs_subpage_end_reader() to reduce the reader number, and use that to number to determine if we need to unlock the full page. If double accounted, it can underflow the number and leave the page locked without anyone to unlock it. [FIX] The commit 1784b7d502a9 ("btrfs: handle csum lookup errors properly on reads") itself is completely fine, it's our existing code not properly handling the error from bio submission hook properly. Since one and only one of the endio function or the submit_extent_page() caller should do the cleanup, let's just ignore the return value from bio submission hook. Before the hook, it's callers' responsibility to do cleanup, after the hook (including inside the hook), it's the endio doing the cleanup. This patch is makes submit_one_bio() always return 0, but still keep the old int return value to make minimal change for backport. CC: stable(a)vger.kernel.org # 5.18+ Signed-off-by: Qu Wenruo <wqu(a)suse.com> --- fs/btrfs/extent_io.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 53b59944013f..34073b0ed6ca 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -165,24 +165,28 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, return ret; } -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +int submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags) { - blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; bio->bi_private = NULL; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); + if (is_data_inode(tree->private_data)) - ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + btrfs_submit_data_bio(tree->private_data, bio, mirror_num, bio_flags); else - ret = btrfs_submit_metadata_bio(tree->private_data, bio, + btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num, bio_flags); - - return blk_status_to_errno(ret); + /* + * Above submission hooks will handle the error by ending the bio, + * which will do the cleanup properly. + * So here we should not return any error, or the caller of + * submit_extent_page() will do cleanup again, causing problems. + */ + return 0; } /* Cleanup unsubmitted bios */ -- 2.35.1

3 years, 3 months

3
4
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror April 2022