Syzbot found a corrupted-list bug that can be triggered from
cgroup_subtree_control_write(cgrp). The reproducer writes to the
cgroup.subtree_control file, which invokes:
cgroup_apply_control_enable()->css_create()->css_populate_dir(), which
then fails with a fault-injected -ENOMEM.
In this scenario css_killed_work_fn() will be enqueued via
cgroup_apply_control_disable(cgrp)->kill_css(css), and the code bails
out to cgroup_kn_unlock(). cgroup_kn_unlock() then calls:
cgroup_put(cgrp)->css_put(&cgrp->self), which tries to enqueue
css_release_work_fn() for the same css instance, causing a list_add
corruption bug, as can be seen in the syzkaller report [1].
Fix this by serializing the css ref_kill and css_release jobs.
css_release() checks whether css_killed_work_fn() has already been
scheduled for the css and only enqueues css_release_work_fn() if it
hasn't. Otherwise css_release() sets the CSS_REL_LATER flag for that
css, which causes css_release_work_fn() to be executed after
css_killed_work_fn() has finished.
Two css flags have been introduced to implement this serialization
mechanism:
* CSS_KILL_ENQED, which is set when css_killed_work_fn() is enqueued, and
* CSS_REL_LATER, which, if set, causes css_release_work_fn() to be
  scheduled after css_killed_work_fn() has finished.
A new per-css lock is also introduced to protect the integrity of the
css flags.
[1] https://syzkaller.appspot.com/bug?id=e26e54d6eac9d9fb50b221ec3e4627b327465d…
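For reference, below is a condensed user-space model of the intended
serialization (illustrative only, not the kernel code in the diff; a
pthread mutex stands in for the new per-css lock, direct function calls
stand in for queue_work(), and names such as css_model, killed_work and
release are made up for the example):

#include <pthread.h>
#include <stdio.h>

#define CSS_KILL_ENQED	(1 << 5)
#define CSS_REL_LATER	(1 << 6)

struct css_model {
	pthread_mutex_t lock;
	unsigned int flags;
};

/* must run exactly once per css */
static void release_work(struct css_model *css)
{
	printf("release work\n");
}

/* models css_killed_work_fn(): the kill work itself */
static void killed_work(struct css_model *css)
{
	pthread_mutex_lock(&css->lock);
	css->flags &= ~CSS_KILL_ENQED;
	if (css->flags & CSS_REL_LATER) {
		/* a release arrived while the kill work was pending */
		css->flags &= ~CSS_REL_LATER;
		pthread_mutex_unlock(&css->lock);
		release_work(css);
		return;
	}
	pthread_mutex_unlock(&css->lock);
}

/* models css_release(): never enqueue while the kill work is pending */
static void release(struct css_model *css)
{
	pthread_mutex_lock(&css->lock);
	if (css->flags & CSS_KILL_ENQED)
		css->flags |= CSS_REL_LATER;
	else
		release_work(css);
	pthread_mutex_unlock(&css->lock);
}

int main(void)
{
	struct css_model css = { PTHREAD_MUTEX_INITIALIZER, CSS_KILL_ENQED };

	release(&css);     /* kill pending: release is deferred */
	killed_work(&css); /* kill finishes: deferred release runs once */
	return 0;
}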
Cc: Tejun Heo <tj@kernel.org>
Cc: Michal Koutny <mkoutny@suse.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Song Liu <songliubraving@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: <cgroups@vger.kernel.org>
Cc: <netdev@vger.kernel.org>
Cc: <bpf@vger.kernel.org>
Cc: <stable@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>
Reported-and-tested-by: syzbot+e42ae441c3b10acf9e9d@syzkaller.appspotmail.com
Fixes: 8f36aaec9c92 ("cgroup: Use rcu_work instead of explicit rcu and work item")
Signed-off-by: Tadeusz Struk <tadeusz.struk@linaro.org>
---
include/linux/cgroup-defs.h | 4 ++++
kernel/cgroup/cgroup.c | 35 ++++++++++++++++++++++++++++++++---
2 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1bfcfb1af352..8dc8b4edb242 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -53,6 +53,8 @@ enum {
CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */
CSS_VISIBLE = (1 << 3), /* css is visible to userland */
CSS_DYING = (1 << 4), /* css is dying */
+ CSS_KILL_ENQED = (1 << 5), /* kill work enqueued for the css */
+ CSS_REL_LATER = (1 << 6), /* release needs to be done after kill */
};
/* bits in struct cgroup flags field */
@@ -162,6 +164,8 @@ struct cgroup_subsys_state {
*/
int id;
+ /* lock to protect flags */
+ spinlock_t lock;
unsigned int flags;
/*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1779ccddb734..a0ceead4b390 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5210,8 +5210,23 @@ static void css_release(struct percpu_ref *ref)
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
- INIT_WORK(&css->destroy_work, css_release_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ spin_lock_bh(&css->lock);
+
+ /*
+ * Check if the css_killed_work_fn work has been scheduled for this
+ * css and enqueue css_release_work_fn only if it wasn't.
+ * Otherwise set the CSS_REL_LATER flag, which will cause
+ * release to be enqueued after css_killed_work_fn is finished.
+ * This is to prevent list corruption by enqueuing two instances
+ * of the same work struct on the same workqueue, namely cgroup_destroy_wq.
+ */
+ if (!(css->flags & CSS_KILL_ENQED)) {
+ INIT_WORK(&css->destroy_work, css_release_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+ } else {
+ css->flags |= CSS_REL_LATER;
+ }
+ spin_unlock_bh(&css->lock);
}
static void init_and_link_css(struct cgroup_subsys_state *css,
@@ -5230,6 +5245,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
INIT_LIST_HEAD(&css->rstat_css_node);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
+ spin_lock_init(&css->lock);
if (cgroup_parent(cgrp)) {
css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -5545,10 +5561,12 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
*/
static void css_killed_work_fn(struct work_struct *work)
{
- struct cgroup_subsys_state *css =
+ struct cgroup_subsys_state *css_killed, *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
mutex_lock(&cgroup_mutex);
+ css_killed = css;
+ css_killed->flags &= ~CSS_KILL_ENQED;
do {
offline_css(css);
@@ -5557,6 +5575,14 @@ static void css_killed_work_fn(struct work_struct *work)
css = css->parent;
} while (css && atomic_dec_and_test(&css->online_cnt));
+ spin_lock_bh(&css_killed->lock);
+ if (css_killed->flags & CSS_REL_LATER) {
+ /* If css_release work was delayed for the css enqueue it now. */
+ INIT_WORK(&css_killed->destroy_work, css_release_work_fn);
+ queue_work(cgroup_destroy_wq, &css_killed->destroy_work);
+ css_killed->flags &= ~CSS_REL_LATER;
+ }
+ spin_unlock_bh(&css_killed->lock);
mutex_unlock(&cgroup_mutex);
}
@@ -5566,10 +5592,13 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
+ spin_lock_bh(&css->lock);
if (atomic_dec_and_test(&css->online_cnt)) {
+ css->flags |= CSS_KILL_ENQED;
INIT_WORK(&css->destroy_work, css_killed_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
}
+ spin_unlock_bh(&css->lock);
}
/**
--
2.36.1
Commit 8e7102273f59 ("bcache: make bch_btree_check() to be
multithreaded") makes bch_btree_check() much faster when checking all
btree nodes during cache device registration. But it isn't in ideal
shape yet and can still be improved.
This patch does the following things to improve the current parallel
btree node check by multiple threads in bch_btree_check():
- Add a read lock on the root node while checking all the btree nodes
  with multiple threads. Although it is not strictly required at the
  moment, it is good to have the read lock in the code logic.
- Remove the local variable 'char name[32]' and generate the kernel
  thread name string directly when calling kthread_run().
- Allocate the local variable "struct btree_check_state check_state" on
  the stack and avoid unnecessary dynamic memory allocation for it.
- Increase check_state->started to count a created kernel thread only
  after the thread is successfully created.
- When waiting for all checking kernel threads to finish, use
  wait_event() instead of wait_event_interruptible().
With this change, the code is clearer and some potential error
conditions are avoided.
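The last two points can be illustrated with a small user-space analogue
(illustrative only; pthreads and a condition variable stand in for
kthreads and the wait queue, and all names are made up for the example):
the started counter is bumped only after a thread was successfully
created, and the waiter blocks uninterruptibly until it drops back to
zero.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int started;

static void *worker(void *arg)
{
	/* ... per-thread checking work would go here ... */
	pthread_mutex_lock(&lock);
	if (--started == 0)
		pthread_cond_signal(&done);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid[4];
	int i, created = 0;

	for (i = 0; i < 4; i++) {
		/* hold the lock across creation so a fast worker cannot
		 * decrement started before it has been incremented */
		pthread_mutex_lock(&lock);
		if (pthread_create(&tid[i], NULL, worker, NULL)) {
			pthread_mutex_unlock(&lock);
			fprintf(stderr, "failed to create worker %d\n", i);
			break;
		}
		/* count the thread only after it was successfully created */
		started++;
		created++;
		pthread_mutex_unlock(&lock);
	}

	/* wait (uninterruptibly) until every started worker has finished */
	pthread_mutex_lock(&lock);
	while (started != 0)
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);

	for (i = 0; i < created; i++)
		pthread_join(tid[i], NULL);
	return 0;
}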
Fixes: 8e7102273f59 ("bcache: make bch_btree_check() to be multithreaded")
Signed-off-by: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org
---
drivers/md/bcache/btree.c | 58 ++++++++++++++++++---------------------
1 file changed, 26 insertions(+), 32 deletions(-)
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ad9f16689419..2362bb8ef6d1 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -2006,8 +2006,7 @@ int bch_btree_check(struct cache_set *c)
int i;
struct bkey *k = NULL;
struct btree_iter iter;
- struct btree_check_state *check_state;
- char name[32];
+ struct btree_check_state check_state;
/* check and mark root node keys */
for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
@@ -2018,63 +2017,58 @@ int bch_btree_check(struct cache_set *c)
if (c->root->level == 0)
return 0;
- check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL);
- if (!check_state)
- return -ENOMEM;
-
- check_state->c = c;
- check_state->total_threads = bch_btree_chkthread_nr();
- check_state->key_idx = 0;
- spin_lock_init(&check_state->idx_lock);
- atomic_set(&check_state->started, 0);
- atomic_set(&check_state->enough, 0);
- init_waitqueue_head(&check_state->wait);
+ check_state.c = c;
+ check_state.total_threads = bch_btree_chkthread_nr();
+ check_state.key_idx = 0;
+ spin_lock_init(&check_state.idx_lock);
+ atomic_set(&check_state.started, 0);
+ atomic_set(&check_state.enough, 0);
+ init_waitqueue_head(&check_state.wait);
+ rw_lock(0, c->root, c->root->level);
/*
* Run multiple threads to check btree nodes in parallel,
- * if check_state->enough is non-zero, it means current
+ * if check_state.enough is non-zero, it means current
* running check threads are enough, unncessary to create
* more.
*/
- for (i = 0; i < check_state->total_threads; i++) {
- /* fetch latest check_state->enough earlier */
+ for (i = 0; i < check_state.total_threads; i++) {
+ /* fetch latest check_state.enough earlier */
smp_mb__before_atomic();
- if (atomic_read(&check_state->enough))
+ if (atomic_read(&check_state.enough))
break;
- check_state->infos[i].result = 0;
- check_state->infos[i].state = check_state;
- snprintf(name, sizeof(name), "bch_btrchk[%u]", i);
- atomic_inc(&check_state->started);
+ check_state.infos[i].result = 0;
+ check_state.infos[i].state = &check_state;
- check_state->infos[i].thread =
+ check_state.infos[i].thread =
kthread_run(bch_btree_check_thread,
- &check_state->infos[i],
- name);
- if (IS_ERR(check_state->infos[i].thread)) {
+ &check_state.infos[i],
+ "bch_btrchk[%d]", i);
+ if (IS_ERR(check_state.infos[i].thread)) {
pr_err("fails to run thread bch_btrchk[%d]\n", i);
for (--i; i >= 0; i--)
- kthread_stop(check_state->infos[i].thread);
+ kthread_stop(check_state.infos[i].thread);
ret = -ENOMEM;
goto out;
}
+ atomic_inc(&check_state.started);
}
/*
* Must wait for all threads to stop.
*/
- wait_event_interruptible(check_state->wait,
- atomic_read(&check_state->started) == 0);
+ wait_event(check_state.wait, atomic_read(&check_state.started) == 0);
- for (i = 0; i < check_state->total_threads; i++) {
- if (check_state->infos[i].result) {
- ret = check_state->infos[i].result;
+ for (i = 0; i < check_state.total_threads; i++) {
+ if (check_state.infos[i].result) {
+ ret = check_state.infos[i].result;
goto out;
}
}
out:
- kfree(check_state);
+ rw_unlock(0, c->root);
return ret;
}
--
2.35.3
From: Fabio Estevam <festevam@denx.de>
Since commit 358ba762d9f1 ("crypto: caam - enable prediction resistance
in HRWNG") the following CAAM errors can be seen on i.MX6SX:
caam_jr 2101000.jr: 20003c5b: CCB: desc idx 60: RNG: Hardware error
hwrng: no data available
This error is due to an incorrect entropy delay for i.MX6SX.
Fix it by increasing the minimum entropy delay for i.MX6SX
as done in U-Boot:
https://patchwork.ozlabs.org/project/uboot/patch/20220415111049.2565744-1-g…
As explained in the U-Boot patch:
"RNG self tests are run to determine the correct entropy delay.
Such tests are executed with different voltages and temperatures to identify
the worst case value for the entropy delay. For i.MX6SX, it was determined
that after adding a margin value of 1000 the minimum entropy delay should be
at least 12000."
Cc: <stable@vger.kernel.org>
Fixes: 358ba762d9f1 ("crypto: caam - enable prediction resistance in HRWNG")
Signed-off-by: Fabio Estevam <festevam@denx.de>
Reviewed-by: Horia Geantă <horia.geanta@nxp.com>
---
Changes since v4:
- Change the function name to needs_entropy_delay_adjustment() - Vabhav
- Improve the commit log by adding the explanation from the U-Boot
patch - Vabhav
drivers/crypto/caam/ctrl.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index ca0361b2dbb0..f87aa2169e5f 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -609,6 +609,13 @@ static bool check_version(struct fsl_mc_version *mc_version, u32 major,
}
#endif
+static bool needs_entropy_delay_adjustment(void)
+{
+ if (of_machine_is_compatible("fsl,imx6sx"))
+ return true;
+ return false;
+}
+
/* Probe routine for CAAM top (controller) level */
static int caam_probe(struct platform_device *pdev)
{
@@ -855,6 +862,8 @@ static int caam_probe(struct platform_device *pdev)
* Also, if a handle was instantiated, do not change
* the TRNG parameters.
*/
+ if (needs_entropy_delay_adjustment())
+ ent_delay = 12000;
if (!(ctrlpriv->rng4_sh_init || inst_handles)) {
dev_info(dev,
"Entropy delay = %u\n",
@@ -871,6 +880,15 @@ static int caam_probe(struct platform_device *pdev)
*/
ret = instantiate_rng(dev, inst_handles,
gen_sk);
+ /*
+ * Entropy delay is determined via TRNG characterization.
+ * TRNG characterization is run across different voltages
+ * and temperatures.
+ * If worst case value for ent_dly is identified,
+ * the loop can be skipped for that platform.
+ */
+ if (needs_entropy_delay_adjustment())
+ break;
if (ret == -EAGAIN)
/*
* if here, the loop will rerun,
--
2.25.1