From: Minchan Kim minchan@kernel.org
[ Upstream commit 393c3714081a53795bbff0e985d24146def6f57f ]
The kernfs implementation has big lock granularity(kernfs_rwsem) so every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the lock. It makes trouble for some cases to wait the global lock for a long time even though they are totally independent contexts each other.
A general example is process A goes under direct reclaim with holding the lock when it accessed the file in sysfs and process B is waiting the lock with exclusive mode and then process C is waiting the lock until process B could finish the job after it gets the lock from process A.
This patch switches the global kernfs_rwsem to per-fs lock, which put the rwsem into kernfs_root.
Suggested-by: Tejun Heo tj@kernel.org Acked-by: Tejun Heo tj@kernel.org Signed-off-by: Minchan Kim minchan@kernel.org Link: https://lore.kernel.org/r/20211118230008.2679780-1-minchan@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Jeremi Piotrowski jpiotrowski@linux.microsoft.com --- Hi Stable Maintainers,
This upstream commit fixes a kernel hang due to severe lock contention on kernfs_rwsem that occurs when container workloads perform a lot of cgroupfs accesses. Could you please apply to 5.15.y? I cherry-pick the upstream commit to v5.15.173 and then performed `git format-patch`.
Thanks, Jeremi
fs/kernfs/dir.c | 110 ++++++++++++++++++++++++----------------- fs/kernfs/file.c | 6 ++- fs/kernfs/inode.c | 22 ++++++--- fs/kernfs/mount.c | 15 +++--- fs/kernfs/symlink.c | 5 +- include/linux/kernfs.h | 2 + 6 files changed, 97 insertions(+), 63 deletions(-)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 36430bdf9381..ebe4ab0765ee 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,6 @@
#include "kernfs-internal.h"
-DECLARE_RWSEM(kernfs_rwsem); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to @@ -34,7 +33,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
static bool kernfs_active(struct kernfs_node *kn) { - lockdep_assert_held(&kernfs_rwsem); + lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem); return atomic_read(&kn->active) >= 0; }
@@ -465,14 +464,15 @@ void kernfs_put_active(struct kernfs_node *kn) * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) - __releases(&kernfs_rwsem) __acquires(&kernfs_rwsem) + __releases(&kernfs_root(kn)->kernfs_rwsem) + __acquires(&kernfs_root(kn)->kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn);
- lockdep_assert_held_write(&kernfs_rwsem); + lockdep_assert_held_write(&root->kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn));
- up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem);
if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); @@ -491,7 +491,7 @@ static void kernfs_drain(struct kernfs_node *kn)
kernfs_drain_open_files(kn);
- down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); }
/** @@ -740,11 +740,12 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_node *parent = kn->parent; + struct kernfs_root *root = kernfs_root(parent); struct kernfs_iattrs *ps_iattr; bool has_ns; int ret;
- down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem);
ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); @@ -775,7 +776,7 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr->ia_mtime = ps_iattr->ia_ctime; }
- up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem);
/* * Activate the new node unless CREATE_DEACTIVATED is requested. @@ -789,7 +790,7 @@ int kernfs_add_one(struct kernfs_node *kn) return 0;
out_unlock: - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return ret; }
@@ -810,7 +811,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, bool has_ns = kernfs_ns_enabled(parent); unsigned int hash;
- lockdep_assert_held(&kernfs_rwsem); + lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem);
if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", @@ -842,7 +843,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, size_t len; char *p, *name;
- lockdep_assert_held_read(&kernfs_rwsem); + lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
spin_lock_irq(&kernfs_pr_cont_lock);
@@ -880,11 +881,12 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent);
- down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem);
return kn; } @@ -904,11 +906,12 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root = kernfs_root(parent);
- down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem);
return kn; } @@ -933,6 +936,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, return ERR_PTR(-ENOMEM);
idr_init(&root->ino_idr); + init_rwsem(&root->kernfs_rwsem); INIT_LIST_HEAD(&root->supers);
/* @@ -1056,6 +1060,7 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; + struct kernfs_root *root;
if (flags & LOOKUP_RCU) return -ECHILD; @@ -1067,18 +1072,19 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) /* If the kernfs parent node has changed discard and * proceed to ->lookup. */ - down_read(&kernfs_rwsem); spin_lock(&dentry->d_lock); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { + spin_unlock(&dentry->d_lock); + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_dir_changed(parent, dentry)) { - spin_unlock(&dentry->d_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 0; } - } - spin_unlock(&dentry->d_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); + } else + spin_unlock(&dentry->d_lock);
/* The kernfs parent node hasn't changed, leave the * dentry negative and return success. @@ -1087,7 +1093,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) }
kn = kernfs_dentry_node(dentry); - down_read(&kernfs_rwsem); + root = kernfs_root(kn); + down_read(&root->kernfs_rwsem);
/* The kernfs node has been deactivated */ if (!kernfs_active(kn)) @@ -1106,10 +1113,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad;
- up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 1; out_bad: - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return 0; }
@@ -1123,10 +1130,12 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; + struct kernfs_root *root; struct inode *inode = NULL; const void *ns = NULL;
- down_read(&kernfs_rwsem); + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns;
@@ -1137,7 +1146,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, * create a negative. */ if (!kernfs_active(kn)) { - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); @@ -1152,7 +1161,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, */ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem);
/* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); @@ -1275,7 +1284,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, { struct rb_node *rbn;
- lockdep_assert_held_write(&kernfs_rwsem); + lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem);
/* if first iteration, visit leftmost descendant which may be root */ if (!pos) @@ -1310,8 +1319,9 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; + struct kernfs_root *root = kernfs_root(kn);
- down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem);
pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { @@ -1325,14 +1335,14 @@ void kernfs_activate(struct kernfs_node *kn) pos->flags |= KERNFS_ACTIVATED; }
- up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); }
static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos;
- lockdep_assert_held_write(&kernfs_rwsem); + lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
/* * Short-circuit if non-root @kn has already finished removal. @@ -1402,9 +1412,11 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - down_write(&kernfs_rwsem); + struct kernfs_root *root = kernfs_root(kn); + + down_write(&root->kernfs_rwsem); __kernfs_remove(kn); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); }
/** @@ -1490,8 +1502,9 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn) bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; + struct kernfs_root *root = kernfs_root(kn);
- down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); kernfs_break_active_protection(kn);
/* @@ -1519,9 +1532,9 @@ bool kernfs_remove_self(struct kernfs_node *kn) atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break;
- up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); schedule(); - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); @@ -1534,7 +1547,7 @@ bool kernfs_remove_self(struct kernfs_node *kn) */ kernfs_unbreak_active_protection(kn);
- up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return ret; }
@@ -1551,6 +1564,7 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; + struct kernfs_root *root;
if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", @@ -1558,7 +1572,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; }
- down_write(&kernfs_rwsem); + root = kernfs_root(parent); + down_write(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns); if (kn) { @@ -1567,7 +1582,7 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, kernfs_put(kn); }
- up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem);
if (kn) return 0; @@ -1586,6 +1601,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; + struct kernfs_root *root; const char *old_name = NULL; int error;
@@ -1593,7 +1609,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, if (!kn->parent) return -EINVAL;
- down_write(&kernfs_rwsem); + root = kernfs_root(kn); + down_write(&root->kernfs_rwsem);
error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || @@ -1647,7 +1664,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
error = 0; out: - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return error; }
@@ -1718,11 +1735,14 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; + struct kernfs_root *root; const void *ns = NULL;
if (!dir_emit_dots(file, ctx)) return 0; - down_read(&kernfs_rwsem); + + root = kernfs_root(parent); + down_read(&root->kernfs_rwsem);
if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; @@ -1739,12 +1759,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) file->private_data = pos; kernfs_get(pos);
- up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; - down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); } - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 60e2a86c535e..9414a7a60a9f 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -847,6 +847,7 @@ static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; + struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); @@ -859,8 +860,9 @@ static void kernfs_notify_workfn(struct work_struct *work) kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock);
+ root = kernfs_root(kn); /* kick fsnotify */ - down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem);
list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; @@ -898,7 +900,7 @@ static void kernfs_notify_workfn(struct work_struct *work) iput(inode); }
- up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); kernfs_put(kn); goto repeat; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index c0eae1725435..3d783d80f5da 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -99,10 +99,11 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; + struct kernfs_root *root = kernfs_root(kn);
- down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); ret = __kernfs_setattr(kn, iattr); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return ret; }
@@ -111,12 +112,14 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; + struct kernfs_root *root; int error;
if (!kn) return -EINVAL;
- down_write(&kernfs_rwsem); + root = kernfs_root(kn); + down_write(&root->kernfs_rwsem); error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) goto out; @@ -129,7 +132,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(&init_user_ns, inode, iattr);
out: - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); return error; }
@@ -184,13 +187,14 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, { struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; + struct kernfs_root *root = kernfs_root(kn);
- down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); generic_fillattr(&init_user_ns, inode, stat); spin_unlock(&inode->i_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem);
return 0; } @@ -274,19 +278,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct kernfs_node *kn; + struct kernfs_root *root; int ret;
if (mask & MAY_NOT_BLOCK) return -ECHILD;
kn = inode->i_private; + root = kernfs_root(kn);
- down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); ret = generic_permission(&init_user_ns, inode, mask); spin_unlock(&inode->i_lock); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem);
return ret; } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f2f909d09f52..cfa79715fc1a 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -236,6 +236,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *kf_root = kfc->root; struct inode *inode; struct dentry *root;
@@ -255,9 +256,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_shrink.seeks = 0;
/* get root inode, initialize and unlock it */ - down_read(&kernfs_rwsem); + down_read(&kf_root->kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); - up_read(&kernfs_rwsem); + up_read(&kf_root->kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; @@ -334,6 +335,7 @@ int kernfs_get_tree(struct fs_context *fc)
if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *root = kfc->root;
kfc->new_sb_created = true;
@@ -344,9 +346,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE;
- down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); list_add(&info->node, &info->root->supers); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); }
fc->root = dget(sb->s_root); @@ -371,10 +373,11 @@ void kernfs_free_fs_context(struct fs_context *fc) void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); + struct kernfs_root *root = info->root;
- down_write(&kernfs_rwsem); + down_write(&root->kernfs_rwsem); list_del(&info->node); - up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem);
/* * Remove the superblock from fs_supers/s_instances diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index c8f8e41b8411..efb0b9ca9057 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -114,11 +114,12 @@ static int kernfs_getlink(struct inode *inode, char *path) struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; + struct kernfs_root *root = kernfs_root(parent); int error;
- down_read(&kernfs_rwsem); + down_read(&root->kernfs_rwsem); error = kernfs_get_target_path(parent, target, path); - up_read(&kernfs_rwsem); + up_read(&root->kernfs_rwsem);
return error; } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 1093abf7c28c..e7078cde3522 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -16,6 +16,7 @@ #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/wait.h> +#include <linux/rwsem.h>
struct file; struct dentry; @@ -197,6 +198,7 @@ struct kernfs_root { struct list_head supers;
wait_queue_head_t deactivate_waitq; + struct rw_semaphore kernfs_rwsem; };
struct kernfs_open_file {
On Fri, Nov 29, 2024 at 12:32:36PM +0100, Jeremi Piotrowski wrote:
From: Minchan Kim minchan@kernel.org
[ Upstream commit 393c3714081a53795bbff0e985d24146def6f57f ]
The kernfs implementation has big lock granularity(kernfs_rwsem) so every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the lock. It makes trouble for some cases to wait the global lock for a long time even though they are totally independent contexts each other.
A general example is process A goes under direct reclaim with holding the lock when it accessed the file in sysfs and process B is waiting the lock with exclusive mode and then process C is waiting the lock until process B could finish the job after it gets the lock from process A.
This patch switches the global kernfs_rwsem to per-fs lock, which put the rwsem into kernfs_root.
Suggested-by: Tejun Heo tj@kernel.org Acked-by: Tejun Heo tj@kernel.org Signed-off-by: Minchan Kim minchan@kernel.org Link: https://lore.kernel.org/r/20211118230008.2679780-1-minchan@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Jeremi Piotrowski jpiotrowski@linux.microsoft.com
Hi Stable Maintainers,
This upstream commit fixes a kernel hang due to severe lock contention on kernfs_rwsem that occurs when container workloads perform a lot of cgroupfs accesses. Could you please apply to 5.15.y? I cherry-pick the upstream commit to v5.15.173 and then performed `git format-patch`.
This should not hang, but rather just reduce contention, right? Do you have real performance numbers that show this is needed? What workloads are overloading cgroupfs? And why not just switch them to 6.1.y kernels or newer?
thanks,
greg k-h
On 29/11/2024 13:12, Greg Kroah-Hartman wrote:
On Fri, Nov 29, 2024 at 12:32:36PM +0100, Jeremi Piotrowski wrote:
From: Minchan Kim minchan@kernel.org
[ Upstream commit 393c3714081a53795bbff0e985d24146def6f57f ]
The kernfs implementation has big lock granularity(kernfs_rwsem) so every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the lock. It makes trouble for some cases to wait the global lock for a long time even though they are totally independent contexts each other.
A general example is process A goes under direct reclaim with holding the lock when it accessed the file in sysfs and process B is waiting the lock with exclusive mode and then process C is waiting the lock until process B could finish the job after it gets the lock from process A.
This patch switches the global kernfs_rwsem to per-fs lock, which put the rwsem into kernfs_root.
Suggested-by: Tejun Heo tj@kernel.org Acked-by: Tejun Heo tj@kernel.org Signed-off-by: Minchan Kim minchan@kernel.org Link: https://lore.kernel.org/r/20211118230008.2679780-1-minchan@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Jeremi Piotrowski jpiotrowski@linux.microsoft.com
Hi Stable Maintainers,
This upstream commit fixes a kernel hang due to severe lock contention on kernfs_rwsem that occurs when container workloads perform a lot of cgroupfs accesses. Could you please apply to 5.15.y? I cherry-pick the upstream commit to v5.15.173 and then performed `git format-patch`.
This should not hang, but rather just reduce contention, right? Do you have real performance numbers that show this is needed? What workloads are overloading cgroupfs?
System hang due to the contention might be a more accurate description. On a kubernetes node there is always a stream of processes (systemd, kubelet, containerd, cadvisor) periodically opening/stating/reading cgroupfs files. Java apps also love reading cgroup files. Other operations such as creation of short-lived containers take a write lock on the rwsem when creating cgroups and when creating veth netdevs. The veth netdev creation takes the rwsem when creating sysfs files. Systemd service startup also contends for the same write lock.
It's not so much a particular workload as it is a matter of scale, the cgroupfs read accesses scale with the number of containers on a host. With enough readers and the right mix of writers, write operations can take minutes.
Here are some real performance number: I have a representative reproducer with 50 cgroupfs readers in a loop and a container batch job every minute. `systemctl status` times out after 1m30s, container creation takes over 4m causing the operations to pile up, making the situation even worse. With this patch included, under the same load the operations finish in ~10s, preventing the system from becoming unresponsive.
This patch stops sysfs and cgroupfs modifications from contending for the same rwsem, as well as lowering contention between different cgroup subsystems.
And why not just switch them to 6.1.y kernels or newer?
I wish we could just do that. Right now all our users are on 5.15 and a lot of their workloads are sensitive to changes to any part of the container stack including kernel version. So they will gradually migrate to kernel 6.1.y and newer as part of upgrading their clusters to a new kubernetes release after they validate their workloads on it. This is a slow process and in the meantime they are hitting the issue that the patch addresses. I'm sure there are other similar users of 5.15 out there.
thanks,
greg k-h
Thanks, Jeremi
On Fri, Nov 29, 2024 at 10:20:48PM +0100, Jeremi Piotrowski wrote:
And why not just switch them to 6.1.y kernels or newer?
I wish we could just do that. Right now all our users are on 5.15 and a lot of their workloads are sensitive to changes to any part of the container stack including kernel version. So they will gradually migrate to kernel 6.1.y and newer as part of upgrading their clusters to a new kubernetes release after they validate their workloads on it. This is a slow process and in the meantime they are hitting the issue that the patch addresses. I'm sure there are other similar users of 5.15 out there.
If they are "hitting this issue" then that is the perfect reason for them to migrate off of that kernel version. Don't think that modifying a core kernel functionality like this is somehow any more "safe" than moving to a newer kernel (especially given all of the unfixed issues in the 5.15.y branch compared to 6.1.y).
Saying "your workload is causing problems on this kernel version, please move to a newer one to resolve the issue." is a valid thing to tell customers.
Try that first please.
thanks,
greg k-h
[ Sasha's backport helper bot ]
Hi,
The upstream commit SHA1 provided is correct: 393c3714081a53795bbff0e985d24146def6f57f
WARNING: Author mismatch between patch and upstream commit: Backport author: Jeremi Piotrowski jpiotrowski@linux.microsoft.com Commit author: Minchan Kim minchan@kernel.org
Status in newer kernel trees: 6.12.y | Present (exact SHA1) 6.11.y | Present (exact SHA1) 6.6.y | Present (exact SHA1) 6.1.y | Present (exact SHA1) 5.15.y | Not found
Note: The patch differs from the upstream commit: --- 1: 393c3714081a5 ! 1: b3f5272b13165 kernfs: switch global kernfs_rwsem lock to per-fs lock @@ Metadata ## Commit message ## kernfs: switch global kernfs_rwsem lock to per-fs lock
+ [ Upstream commit 393c3714081a53795bbff0e985d24146def6f57f ] + The kernfs implementation has big lock granularity(kernfs_rwsem) so every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the lock. It makes trouble for some cases to wait the global lock @@ Commit message Signed-off-by: Minchan Kim minchan@kernel.org Link: https://lore.kernel.org/r/20211118230008.2679780-1-minchan@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org + Signed-off-by: Jeremi Piotrowski jpiotrowski@linux.microsoft.com
## fs/kernfs/dir.c ## @@ @@ fs/kernfs/dir.c
-DECLARE_RWSEM(kernfs_rwsem); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ - static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ - static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ + /* + * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to @@ fs/kernfs/dir.c: static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
static bool kernfs_active(struct kernfs_node *kn) @@ fs/kernfs/dir.c: static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *p - lockdep_assert_held_read(&kernfs_rwsem); + lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
- /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ - spin_lock_irq(&kernfs_rename_lock); + spin_lock_irq(&kernfs_pr_cont_lock); + @@ fs/kernfs/dir.c: struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { @@ fs/kernfs/dir.c: int kernfs_remove_by_name_ns(struct kernfs_node *parent, const + down_write(&root->kernfs_rwsem);
kn = kernfs_find_ns(parent, name, ns); - if (kn) - __kernfs_remove(kn); + if (kn) { +@@ fs/kernfs/dir.c: int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, + kernfs_put(kn); + }
- up_write(&kernfs_rwsem); + up_write(&root->kernfs_rwsem); ---
Results of testing on various branches:
| Branch | Patch Apply | Build Test | |---------------------------|-------------|------------| | stable/linux-5.15.y | Success | Success |
linux-stable-mirror@lists.linaro.org