Confirmed the patches worked for mainline (6.17-rc7). But it's still flaky (1 failure in 13 runs) if I simply apply the patches to v6.12.46.
I think there are some intervening commits between the two versions that I should apply as well.
Here is the diff between the two kernel versions after patches are applied:
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fad8ddfa622bb..62d85c5086ba1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -65,7 +65,7 @@ struct wb_writeback_work { * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. */ -static unsigned int dirtytime_expire_interval = 12 * 60 * 60; +unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { @@ -290,6 +290,7 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } +EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list @@ -770,9 +771,8 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */ -static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock) +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); @@ -802,24 +802,7 @@ static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } - -/** - * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite - * @wbc: writeback_control of interest - * @inode: target inode - * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. 
- */ -void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, - struct inode *inode) -{ - spin_lock(&inode->i_lock); - inode_attach_wb(inode, NULL); - wbc_attach_and_unlock_inode(wbc, inode); -} -EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode); +EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -1282,13 +1265,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, } } -static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock) -{ - spin_unlock(&inode->i_lock); -} - #endif /* CONFIG_CGROUP_WRITEBACK */ /* @@ -2475,7 +2451,14 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); } -static int dirtytime_interval_handler(const struct ctl_table *table, int write, +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + return 0; +} +__initcall(start_dirtytime_writeback); + +int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2486,25 +2469,6 @@ static int dirtytime_interval_handler(const struct ctl_table *table, int write, return ret; } -static const struct ctl_table vm_fs_writeback_table[] = { - { - .procname = "dirtytime_expire_seconds", - .data = &dirtytime_expire_interval, - .maxlen = sizeof(dirtytime_expire_interval), - .mode = 0644, - .proc_handler = dirtytime_interval_handler, - .extra1 = SYSCTL_ZERO, - }, -}; - -static int __init start_dirtytime_writeback(void) -{ - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); - register_sysctl_init("vm", vm_fs_writeback_table); - return 0; -} -__initcall(start_dirtytime_writeback); - /** * __mark_inode_dirty - internal function to mark an inode dirty *
On Wed, Sep 24, 2025 at 5:52 PM Tejun Heo &lt;tj@kernel.org&gt; wrote:
On Wed, Sep 24, 2025 at 05:24:15PM -0700, Chenglong Tang wrote:
The kernel v6.1 is good. The hang is reliably triggered (over 80% chance) on kernels v6.6 and v6.12, and intermittently on mainline (6.17-rc7), with the following steps:
*Environment:* A machine with a fast SSD and a high core count (e.g., Google Cloud's N2-standard-128).
*Workload:* Concurrently generate a large number of files (e.g., 2 million) using multiple services managed by systemd-run. This creates significant I/O and cgroup churn.
*Trigger:* After the file generation completes, terminate the systemd-run services.
*Result:* Shortly after the services are killed, the system's CPU load spikes, leading to a massive number of kworker/+inode_switch_wbs threads and a system-wide hang/livelock where the machine becomes unresponsive (20s - 300s).
Sounds like:
http://lkml.kernel.org/r/20250912103522.2935-1-jack@suse.cz
Can you see whether those patches resolve the problem?
Thanks.
-- tejun