From: Filipe Manana <fdmanana(a)suse.com>
[ Upstream commit 9b378f6ad48cfa195ed868db9123c09ee7ec5ea2 ]
The readdir implementation currently processes always up to the last index
it finds. This however can result in an infinite loop if the directory has
a large number of entries such that they won't all fit in the given buffer
passed to the readdir callback, that is, dir_emit() returns a non-zero
value. Because in that case readdir() will be called again and if in the
meanwhile new directory entries were added and we still can't put all the
remaining entries in the buffer, we keep repeating this over and over.
The following C program and test script reproduce the problem:
$ cat /mnt/readdir_prog.c
#include <sys/types.h>
#include <dirent.h>
#include <stdio.h>
int main(int argc, char *argv[])
{
DIR *dir = opendir(".");
struct dirent *dd;
while ((dd = readdir(dir))) {
printf("%s\n", dd->d_name);
rename(dd->d_name, "TEMPFILE");
rename("TEMPFILE", dd->d_name);
}
closedir(dir);
}
$ gcc -o /mnt/readdir_prog /mnt/readdir_prog.c
$ cat test.sh
#!/bin/bash
DEV=/dev/sdi
MNT=/mnt/sdi
mkfs.btrfs -f $DEV &> /dev/null
#mkfs.xfs -f $DEV &> /dev/null
#mkfs.ext4 -F $DEV &> /dev/null
mount $DEV $MNT
mkdir $MNT/testdir
for ((i = 1; i <= 2000; i++)); do
echo -n > $MNT/testdir/file_$i
done
cd $MNT/testdir
/mnt/readdir_prog
cd /mnt
umount $MNT
This behaviour is surprising to applications and it's unlike ext4, xfs,
tmpfs, vfat and other filesystems, which always finish. In this case where
new entries were added due to renames, some file names may be reported
more than once, but this varies according to each filesystem - for example
ext4 never reported the same file more than once while xfs reports the
first 13 file names twice.
So change our readdir implementation to track the last index number when
opendir() is called and then make readdir() never process beyond that
index number. This gives the same behaviour as ext4.
Reported-by: Rob Landley <rob(a)landley.net>
Link: https://lore.kernel.org/linux-btrfs/2c8c55ec-04c6-e0dc-9c5c-8c7924778c35@la…
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217681
CC: stable(a)vger.kernel.org # 5.15
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
[ Resolve a conflict due to member changes in 96d89923fa94 ]
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
---
fs/btrfs/ctree.h | 1 +
fs/btrfs/delayed-inode.c | 5 +-
fs/btrfs/delayed-inode.h | 1 +
fs/btrfs/inode.c | 131 +++++++++++++++++++++++----------------
4 files changed, 84 insertions(+), 54 deletions(-)
---
Changelog:
v2:
- Fix the upstream commit hash
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1467bf439cb4..7905f178efa3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1361,6 +1361,7 @@ struct btrfs_drop_extents_args {
struct btrfs_file_private {
void *filldir_buf;
+ u64 last_index;
};
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index fd951aeaeac5..5a98c5da1225 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1513,6 +1513,7 @@ int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode)
}
bool btrfs_readdir_get_delayed_items(struct inode *inode,
+ u64 last_index,
struct list_head *ins_list,
struct list_head *del_list)
{
@@ -1532,14 +1533,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
- while (item) {
+ while (item && item->key.offset <= last_index) {
refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, ins_list);
item = __btrfs_next_delayed_item(item);
}
item = __btrfs_first_delayed_deletion_item(delayed_node);
- while (item) {
+ while (item && item->key.offset <= last_index) {
refcount_inc(&item->refs);
list_add_tail(&item->readdir_list, del_list);
item = __btrfs_next_delayed_item(item);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index b2412160c5bc..a9cfce856d2e 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -123,6 +123,7 @@ void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info);
/* Used for readdir() */
bool btrfs_readdir_get_delayed_items(struct inode *inode,
+ u64 last_index,
struct list_head *ins_list,
struct list_head *del_list);
void btrfs_readdir_put_delayed_items(struct inode *inode,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 95af29634e55..1df374ce829b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6121,6 +6121,74 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
return d_splice_alias(inode, dentry);
}
+/*
+ * Find the highest existing sequence number in a directory and then set the
+ * in-memory index_cnt variable to the first free sequence number.
+ */
+static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_key key, found_key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = (u64)-1;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ /* FIXME: we should be able to handle this */
+ if (ret == 0)
+ goto out;
+ ret = 0;
+
+ if (path->slots[0] == 0) {
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
+ goto out;
+ }
+
+ path->slots[0]--;
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid != btrfs_ino(inode) ||
+ found_key.type != BTRFS_DIR_INDEX_KEY) {
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
+ goto out;
+ }
+
+ inode->index_cnt = found_key.offset + 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
+{
+ if (dir->index_cnt == (u64)-1) {
+ int ret;
+
+ ret = btrfs_inode_delayed_dir_index_count(dir);
+ if (ret) {
+ ret = btrfs_set_inode_index_count(dir);
+ if (ret)
+ return ret;
+ }
+ }
+
+ *index = dir->index_cnt;
+
+ return 0;
+}
+
/*
* All this infrastructure exists because dir_emit can fault, and we are holding
* the tree lock when doing readdir. For now just allocate a buffer and copy
@@ -6133,10 +6201,17 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
static int btrfs_opendir(struct inode *inode, struct file *file)
{
struct btrfs_file_private *private;
+ u64 last_index;
+ int ret;
+
+ ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
+ if (ret)
+ return ret;
private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
if (!private)
return -ENOMEM;
+ private->last_index = last_index;
private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
if (!private->filldir_buf) {
kfree(private);
@@ -6205,7 +6280,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
INIT_LIST_HEAD(&ins_list);
INIT_LIST_HEAD(&del_list);
- put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
+ put = btrfs_readdir_get_delayed_items(inode, private->last_index,
+ &ins_list, &del_list);
again:
key.type = BTRFS_DIR_INDEX_KEY;
@@ -6238,6 +6314,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
break;
if (found_key.offset < ctx->pos)
goto next;
+ if (found_key.offset > private->last_index)
+ break;
if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
goto next;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
@@ -6371,57 +6449,6 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
return dirty ? btrfs_dirty_inode(inode) : 0;
}
-/*
- * find the highest existing sequence number in a directory
- * and then set the in-memory index_cnt variable to reflect
- * free sequence numbers
- */
-static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_key key, found_key;
- struct btrfs_path *path;
- struct extent_buffer *leaf;
- int ret;
-
- key.objectid = btrfs_ino(inode);
- key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = (u64)-1;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- /* FIXME: we should be able to handle this */
- if (ret == 0)
- goto out;
- ret = 0;
-
- if (path->slots[0] == 0) {
- inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
- }
-
- path->slots[0]--;
-
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
- if (found_key.objectid != btrfs_ino(inode) ||
- found_key.type != BTRFS_DIR_INDEX_KEY) {
- inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
- }
-
- inode->index_cnt = found_key.offset + 1;
-out:
- btrfs_free_path(path);
- return ret;
-}
-
/*
* helper to find a free sequence number in a given directory. This current
* code is very simple, later versions will do smarter things in the btree
--
2.43.0
In zswap_writeback_entry(), after we get a folio from
__read_swap_cache_async(), we grab the tree lock again to check that the
swap entry was not invalidated and recycled. If it was, we delete the
folio we just added to the swap cache and exit.
However, __read_swap_cache_async() returns the folio locked when it is
newly allocated, which is always true for this path, and the folio is
ref'd. Make sure to unlock and put the folio before returning.
This was discovered by code inspection, probably because this path
handles a race condition that should not happen often, and the bug would
not crash the system, it will only strand the folio indefinitely.
Fixes: 04fc7816089c ("mm: fix zswap writeback race condition")
Cc: stable(a)vger.kernel.org
Signed-off-by: Yosry Ahmed <yosryahmed(a)google.com>
---
mm/zswap.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mm/zswap.c b/mm/zswap.c
index 8f4a7efc2bdae..00e90b9b5417d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1448,6 +1448,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return -ENOMEM;
}
spin_unlock(&tree->lock);
--
2.43.0.429.g432eaa2c6b-goog
From: "Maciej S. Szmigiero" <maciej.szmigiero(a)oracle.com>
The stable kernel version backport of the patch disabling XSAVES on AMD
Zen family 0x17 applied this change to the wrong function (init_amd_k6()),
one which isn't called for Zen CPUs.
Move the erratum to the init_amd_zn() function instead.
Add an explicit family 0x17 check to the erratum so nothing will break if
someone naively makes this kernel version call init_amd_zn() also for
family 0x19 in the future (as the current upstream code does).
Fixes: e40c1e9da1ec ("x86/CPU/AMD: Disable XSAVES on AMD family 0x17")
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero(a)oracle.com>
---
arch/x86/kernel/cpu/amd.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index b2cdf1c07e56..4e84203fc067 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -277,15 +277,6 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
return;
}
#endif
- /*
- * Work around Erratum 1386. The XSAVES instruction malfunctions in
- * certain circumstances on Zen1/2 uarch, and not all parts have had
- * updated microcode at the time of writing (March 2023).
- *
- * Affected parts all have no supervisor XSAVE states, meaning that
- * the XSAVEC instruction (which works fine) is equivalent.
- */
- clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
static void init_amd_k7(struct cpuinfo_x86 *c)
@@ -989,6 +980,17 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO))
set_cpu_cap(c, X86_FEATURE_BTC_NO);
}
+
+ /*
+ * Work around Erratum 1386. The XSAVES instruction malfunctions in
+ * certain circumstances on Zen1/2 uarch, and not all parts have had
+ * updated microcode at the time of writing (March 2023).
+ *
+ * Affected parts all have no supervisor XSAVE states, meaning that
+ * the XSAVEC instruction (which works fine) is equivalent.
+ */
+ if (c->x86 == 0x17)
+ clear_cpu_cap(c, X86_FEATURE_XSAVES);
}
static bool cpu_has_zenbleed_microcode(void)
This reverts commit bed9e27baf52a09b7ba2a3714f1e24e17ced386d.
The original set [1][2] was expected to undo a suboptimal fix in [2], and
replace it with a better fix [1]. However, as reported by Dan Moulding [2]
causes an issue with raid5 with journal device.
Revert [2] for now to close the issue. We will follow up on another issue
reported by Juxiao Bi, as [2] is expected to fix it. We believe this is a
good trade-off, because the latter issue happens less freqently.
In the meanwhile, we will NOT revert [1], as it contains the right logic.
Reported-by: Dan Moulding <dan(a)danm.net>
Closes: https://lore.kernel.org/linux-raid/20240123005700.9302-1-dan@danm.net/
Fixes: bed9e27baf52 ("Revert "md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d"")
Cc: stable(a)vger.kernel.org # v5.19+
Cc: Junxiao Bi <junxiao.bi(a)oracle.com>
Cc: Yu Kuai <yukuai3(a)huawei.com>
Signed-off-by: Song Liu <song(a)kernel.org>
[1] commit d6e035aad6c0 ("md: bypass block throttle for superblock update")
[2] commit bed9e27baf52 ("Revert "md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d"")
---
drivers/md/raid5.c | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8497880135ee..2b2f03705990 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -36,6 +36,7 @@
*/
#include <linux/blkdev.h>
+#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
@@ -6773,7 +6774,18 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
+
+ /*
+ * Waiting on MD_SB_CHANGE_PENDING below may deadlock
+ * seeing md_check_recovery() is needed to clear
+ * the flag when using mdmon.
+ */
+ continue;
}
+
+ wait_event_lock_irq(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+ conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
--
2.34.1