Confirmed the patches worked for mainline (6.17-rc7). But it's still flaky (1 failure in 13 runs) if I simply apply the patches to v6.12.46.
I think there are some intervening commits between the two versions that I should apply as well.
Here is the diff between the two kernel versions after patches are applied:
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fad8ddfa622bb..62d85c5086ba1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -65,7 +65,7 @@ struct wb_writeback_work { * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. */ -static unsigned int dirtytime_expire_interval = 12 * 60 * 60; +unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { @@ -290,6 +290,7 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } +EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list @@ -770,9 +771,8 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */ -static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock) +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); @@ -802,24 +802,7 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } - -/** - * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite - * @wbc: writeback_control of interest - * @inode: target inode - * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. 
- */ -void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, - struct inode *inode) -{ - spin_lock(&inode->i_lock); - inode_attach_wb(inode, NULL); - wbc_attach_and_unlock_inode(wbc, inode); -} -EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode); +EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -1282,13 +1265,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, } } -static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock) -{ - spin_unlock(&inode->i_lock); -} - #endif /* CONFIG_CGROUP_WRITEBACK */ /* @@ -2475,7 +2451,14 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); } -static int dirtytime_interval_handler(const struct ctl_table *table, int write, +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + return 0; +} +__initcall(start_dirtytime_writeback); + +int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2486,25 +2469,6 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write, return ret; } -static const struct ctl_table vm_fs_writeback_table[] = { - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = SYSCTL_ZERO, - }, -}; - -static int __init start_dirtytime_writeback(void) -{ - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); - register_sysctl_init("vm", vm_fs_writeback_table); - return 0; -} -__initcall(start_dirtytime_writeback); - /** * __mark_inode_dirty - internal function to mark an inode dirty *
On Wed, Sep 24, 2025 at 5:52 PM Tejun Heo &lt;tj@kernel.org&gt; wrote:
On Wed, Sep 24, 2025 at 05:24:15PM -0700, Chenglong Tang wrote:
The kernel v6.1 is good. The hang is reliably triggered (over 80% chance) on kernels v6.6 and v6.12, and intermittently on mainline (6.17-rc7), with the following steps:
*Environment:* A machine with a fast SSD and a high core count (e.g., Google Cloud's N2-standard-128).
*Workload:* Concurrently generate a large number of files (e.g., 2 million) using multiple services managed by systemd-run. This creates significant I/O and cgroup churn.
*Trigger:* After the file generation completes, terminate the systemd-run services.
*Result:* Shortly after the services are killed, the system's CPU load spikes, leading to a massive number of kworker/+inode_switch_wbs threads and a system-wide hang/livelock where the machine becomes unresponsive (20s - 300s).
Sounds like:
http://lkml.kernel.org/r/20250912103522.2935-1-jack@suse.cz
Can you see whether those patches resolve the problem?
Thanks.
-- tejun