Commit 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in
shrink_delalloc") cleaned up how we do delalloc shrinking by utilizing
some infrastructure we have in place to flush inodes that we use for
device replace and snapshot. However this introduced a pretty serious
performance regression. The root cause is because before we would
generally use the normal writeback path to reclaim delalloc space, and
for this we would provide it with the number of pages we wanted to
flush. The referenced commit changed this to flush that many inodes,
which drastically increased the amount of space we were flushing in
certain cases, which severely affected performance.
We cannot revert this patch unfortunately, because Filipe has another
fix that requires the ability to skip flushing inodes that are being
cloned in certain scenarios, which means we need to keep using our
flushing infrastructure or risk re-introducing the deadlock.
Instead to fix this problem we can go back to providing
btrfs_start_delalloc_roots with a number of pages to flush, and then set
up a writeback_control and utilize sync_inode() to handle the flushing
for us. This gives us the same behavior we had prior to the fix, while
still allowing us to avoid the deadlock that was fixed by Filipe. The
user reported reproducer was a simple untarring of a large tarball of
the source code for Firefox. The results are as follows
5.9 0m54.258s
5.10 1m26.212s
Patch 0m38.800s
We are significantly faster because of the work I did around improving
ENOSPC flushing in 5.10 and 5.11, so reverting to the previous write out
behavior gave us a pretty big boost.
CC: stable(a)vger.kernel.org # 5.10
Reported-by: René Rebe <rene(a)exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
---
fs/btrfs/inode.c | 60 +++++++++++++++++++++++++++++++------------
fs/btrfs/space-info.c | 4 ++-
2 files changed, 46 insertions(+), 18 deletions(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 070716650df8..a8e0a6b038d3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9390,7 +9390,8 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot,
+static int start_delalloc_inodes(struct btrfs_root *root,
+ struct writeback_control *wbc, bool snapshot,
bool in_reclaim_context)
{
struct btrfs_inode *binode;
@@ -9399,6 +9400,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
struct list_head works;
struct list_head splice;
int ret = 0;
+ bool full_flush = wbc->nr_to_write == LONG_MAX;
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
@@ -9427,18 +9429,24 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
if (snapshot)
set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
&binode->runtime_flags);
- work = btrfs_alloc_delalloc_work(inode);
- if (!work) {
- iput(inode);
- ret = -ENOMEM;
- goto out;
- }
- list_add_tail(&work->list, &works);
- btrfs_queue_work(root->fs_info->flush_workers,
- &work->work);
- if (*nr != U64_MAX) {
- (*nr)--;
- if (*nr == 0)
+ if (full_flush) {
+ work = btrfs_alloc_delalloc_work(inode);
+ if (!work) {
+ iput(inode);
+ ret = -ENOMEM;
+ goto out;
+ }
+ list_add_tail(&work->list, &works);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ } else {
+ ret = sync_inode(inode, wbc);
+ if (!ret &&
+ test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = sync_inode(inode, wbc);
+ btrfs_add_delayed_iput(inode);
+ if (ret || wbc->nr_to_write <= 0)
goto out;
}
cond_resched();
@@ -9464,18 +9472,29 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
+ struct writeback_control wbc = {
+ .nr_to_write = LONG_MAX,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
struct btrfs_fs_info *fs_info = root->fs_info;
- u64 nr = U64_MAX;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- return start_delalloc_inodes(root, &nr, true, false);
+ return start_delalloc_inodes(root, &wbc, true, false);
}
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
bool in_reclaim_context)
{
+ struct writeback_control wbc = {
+ .nr_to_write = (nr == U64_MAX) ? LONG_MAX : (unsigned long)nr,
+ .sync_mode = WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
struct btrfs_root *root;
struct list_head splice;
int ret;
@@ -9489,6 +9508,13 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
while (!list_empty(&splice) && nr) {
+ /*
+ * Reset nr_to_write here so we know that we're doing a full
+ * flush.
+ */
+ if (nr == U64_MAX)
+ wbc.nr_to_write = LONG_MAX;
+
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_root(root);
@@ -9497,9 +9523,9 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, &nr, false, in_reclaim_context);
+ ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
btrfs_put_root(root);
- if (ret < 0)
+ if (ret < 0 || wbc.nr_to_write <= 0)
goto out;
spin_lock(&fs_info->delalloc_root_lock);
}
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 67e55c5479b8..e8347461c8dd 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -532,7 +532,9 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
loops = 0;
while ((delalloc_bytes || dio_bytes) && loops < 3) {
- btrfs_start_delalloc_roots(fs_info, items, true);
+ u64 nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+ btrfs_start_delalloc_roots(fs_info, nr_pages, true);
loops++;
if (wait_ordered && !trans) {
--
2.26.2
There is one GSWIP_MII_CFG register for each switch-port except the CPU
port. The register offset for the first port is 0x0, 0x02 for the
second, 0x04 for the third and so on.
Update the driver to not only restrict the GSWIP_MII_CFG registers to
ports 0, 1 and 5. Handle ports 0..5 instead but skip the CPU port. This
means we are not overwriting the configuration for the third port (port
two since we start counting from zero) with the settings for the sixth
port (with number five) anymore.
The GSWIP_MII_PCDU(p) registers are not updated because there's really
only three (one for each of the following ports: 0, 1, 5).
Fixes: 14fceff4771e51 ("net: dsa: Add Lantiq / Intel DSA driver for vrx200")
Cc: stable(a)vger.kernel.org
Signed-off-by: Martin Blumenstingl <martin.blumenstingl(a)googlemail.com>
---
drivers/net/dsa/lantiq_gswip.c | 23 ++++++-----------------
1 file changed, 6 insertions(+), 17 deletions(-)
diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c
index 5d378c8026f0..4b36d89bec06 100644
--- a/drivers/net/dsa/lantiq_gswip.c
+++ b/drivers/net/dsa/lantiq_gswip.c
@@ -92,9 +92,7 @@
GSWIP_MDIO_PHY_FDUP_MASK)
/* GSWIP MII Registers */
-#define GSWIP_MII_CFG0 0x00
-#define GSWIP_MII_CFG1 0x02
-#define GSWIP_MII_CFG5 0x04
+#define GSWIP_MII_CFGp(p) (0x2 * (p))
#define GSWIP_MII_CFG_EN BIT(14)
#define GSWIP_MII_CFG_LDCLKDIS BIT(12)
#define GSWIP_MII_CFG_MODE_MIIP 0x0
@@ -392,17 +390,9 @@ static void gswip_mii_mask(struct gswip_priv *priv, u32 clear, u32 set,
static void gswip_mii_mask_cfg(struct gswip_priv *priv, u32 clear, u32 set,
int port)
{
- switch (port) {
- case 0:
- gswip_mii_mask(priv, clear, set, GSWIP_MII_CFG0);
- break;
- case 1:
- gswip_mii_mask(priv, clear, set, GSWIP_MII_CFG1);
- break;
- case 5:
- gswip_mii_mask(priv, clear, set, GSWIP_MII_CFG5);
- break;
- }
+ /* There's no MII_CFG register for the CPU port */
+ if (!dsa_is_cpu_port(priv->ds, port))
+ gswip_mii_mask(priv, clear, set, GSWIP_MII_CFGp(port));
}
static void gswip_mii_mask_pcdu(struct gswip_priv *priv, u32 clear, u32 set,
@@ -822,9 +812,8 @@ static int gswip_setup(struct dsa_switch *ds)
gswip_mdio_mask(priv, 0xff, 0x09, GSWIP_MDIO_MDC_CFG1);
/* Disable the xMII link */
- gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, 0);
- gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, 1);
- gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, 5);
+ for (i = 0; i < priv->hw_info->max_ports; i++)
+ gswip_mii_mask_cfg(priv, GSWIP_MII_CFG_EN, 0, i);
/* enable special tag insertion on cpu port */
gswip_switch_mask(priv, 0, GSWIP_FDMA_PCTRL_STEN,
--
2.30.0
This is a revert for 38d715f494f2 ("btrfs: use
btrfs_start_delalloc_roots in shrink_delalloc"). A user reported a
problem where performance was significantly worse with this patch
applied. The problem needs to be fixed with proper pre-flushing, and
changes to how we deal with the work queues for the inodes. However
that work is much more complicated than is acceptable for stable, and
simply reverting this patch fixes the problem. The original patch was
a cleanup of the code, so it's fine to revert it. My numbers for the
original reported test, which was untarring a copy of the firefox
sources, are as follows
5.9 0m54.258s
5.10 1m26.212s
Fix 0m35.038s
cc: stable(a)vger.kernel.org # 5.10
Reported-by: René Rebe <rene(a)exactcode.de>
Fixes: 38d715f494f2 ("btrfs: use btrfs_start_delalloc_roots in shrink_delalloc")
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
---
Dave, this is ontop of linus's branch, because we've changed the arguments for
btrfs_start_delalloc_roots in misc-next, and this needs to go back to 5.10 ASAP.
I can send a misc-next version if you want to have it there as well while we're
waiting for it to go into linus's tree, just let me know.
fs/btrfs/space-info.c | 54 ++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 53 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 64099565ab8f..a2b322275b8d 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -465,6 +465,28 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
up_read(&info->groups_sem);
}
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
+ unsigned long nr_pages, u64 nr_items)
+{
+ struct super_block *sb = fs_info->sb;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+ up_read(&sb->s_umount);
+ } else {
+ /*
+ * We needn't worry the filesystem going from r/w to r/o though
+ * we don't acquire ->s_umount mutex, because the filesystem
+ * should guarantee the delalloc inodes list be empty after
+ * the filesystem is readonly(all dirty pages are written to
+ * the disk).
+ */
+ btrfs_start_delalloc_roots(fs_info, nr_items);
+ if (!current->journal_info)
+ btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
+ }
+}
+
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
@@ -490,8 +512,10 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 dio_bytes;
+ u64 async_pages;
u64 items;
long time_left;
+ unsigned long nr_pages;
int loops;
/* Calc the number of the pages we need flush for space reservation */
@@ -532,8 +556,36 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
loops = 0;
while ((delalloc_bytes || dio_bytes) && loops < 3) {
- btrfs_start_delalloc_roots(fs_info, items);
+ nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
+
+ /*
+ * Triggers inode writeback for up to nr_pages. This will invoke
+ * ->writepages callback and trigger delalloc filling
+ * (btrfs_run_delalloc_range()).
+ */
+ btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+ /*
+ * We need to wait for the compressed pages to start before
+ * we continue.
+ */
+ async_pages = atomic_read(&fs_info->async_delalloc_pages);
+ if (!async_pages)
+ goto skip_async;
+
+ /*
+ * Calculate how many compressed pages we want to be written
+ * before we continue. I.e if there are more async pages than we
+ * require wait_event will wait until nr_pages are written.
+ */
+ if (async_pages <= nr_pages)
+ async_pages = 0;
+ else
+ async_pages -= nr_pages;
+ wait_event(fs_info->async_submit_wait,
+ atomic_read(&fs_info->async_delalloc_pages) <=
+ (int)async_pages);
+skip_async:
loops++;
if (wait_ordered && !trans) {
btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
--
2.26.2