The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 1dd6c84f1c544e552848a8968599220bd464e338
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025122909-remover-casino-d84a@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1dd6c84f1c544e552848a8968599220bd464e338 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang(a)linux.dev>
Date: Mon, 1 Dec 2025 19:25:40 +0800
Subject: [PATCH] sched_ext: Fix incorrect sched_class settings for per-cpu
migration tasks
When loading the eBPF scheduler, the tasks in the scx_tasks list are
traversed and __setscheduler_class() is invoked to pick each task's new
sched_class. However, this also incorrectly sets the per-CPU migration
tasks' ->sched_class to rt_sched_class, and even after the scheduler is
unloaded, the migration tasks' ->sched_class remains rt_sched_class.
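To make the failure mode concrete, here is a small standalone sketch
(illustrative only; the simplified types and helpers below are stand-ins,
not the kernel's): the migration/N threads carry an RT priority but run
under the stop class, so a class chosen purely from policy/prio comes back
as the RT class unless the stop-class override is checked first.

#include <stdio.h>

enum sched_class_id { CLASS_FAIR, CLASS_RT, CLASS_STOP };

struct task {
	int rt_prio;			/* nonzero for RT-range priorities  */
	enum sched_class_id class;	/* current class, may be CLASS_STOP */
};

/* Mimics choosing a class from policy/prio alone. */
static enum sched_class_id class_from_prio(const struct task *p)
{
	return p->rt_prio ? CLASS_RT : CLASS_FAIR;
}

/* Mimics the fix: keep stop-class tasks on their class. */
static enum sched_class_id class_keeping_stop(const struct task *p)
{
	return p->class == CLASS_STOP ? CLASS_STOP : class_from_prio(p);
}

int main(void)
{
	struct task migration = { .rt_prio = 99, .class = CLASS_STOP };

	printf("prio-only lookup: %d (RT=%d)\n", class_from_prio(&migration), CLASS_RT);
	printf("with stop check:  %d (STOP=%d)\n", class_keeping_stop(&migration), CLASS_STOP);
	return 0;
}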
The log for this issue is as follows:
./scx_rustland --stats 1
[ 199.245639][ T630] sched_ext: "rustland" does not implement cgroup cpu.weight
[ 199.269213][ T630] sched_ext: BPF scheduler "rustland" enabled
04:25:09 [INFO] RustLand scheduler attached
bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/
{ printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }'
Attaching 1 probe...
migration/0:24->rt_sched_class+0x0/0xe0
migration/1:27->rt_sched_class+0x0/0xe0
migration/2:33->rt_sched_class+0x0/0xe0
migration/3:39->rt_sched_class+0x0/0xe0
migration/4:45->rt_sched_class+0x0/0xe0
migration/5:52->rt_sched_class+0x0/0xe0
migration/6:58->rt_sched_class+0x0/0xe0
migration/7:64->rt_sched_class+0x0/0xe0
sched_ext: BPF scheduler "rustland" disabled (unregistered from user space)
EXIT: unregistered from user space
04:25:21 [INFO] Unregister RustLand scheduler
bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/
{ printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }'
Attaching 1 probe...
migration/0:24->rt_sched_class+0x0/0xe0
migration/1:27->rt_sched_class+0x0/0xe0
migration/2:33->rt_sched_class+0x0/0xe0
migration/3:39->rt_sched_class+0x0/0xe0
migration/4:45->rt_sched_class+0x0/0xe0
migration/5:52->rt_sched_class+0x0/0xe0
migration/6:58->rt_sched_class+0x0/0xe0
migration/7:64->rt_sched_class+0x0/0xe0
This commit therefore introduces a new scx_setscheduler_class() helper,
which adds a check for stop_sched_class, and uses it in place of
__setscheduler_class().
Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class")
Cc: stable(a)vger.kernel.org # v6.12+
Signed-off-by: Zqiang <qiang.zhang(a)linux.dev>
Reviewed-by: Andrea Righi <arighi(a)nvidia.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index dca9ca0c1854..b563b8c3fd24 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -248,6 +248,14 @@ static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}
+static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class)
+ return &stop_sched_class;
+
+ return __setscheduler_class(p->policy, p->prio);
+}
+
/*
* scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
* ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -4241,8 +4249,7 @@ static void scx_disable_workfn(struct kthread_work *work)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
update_rq_clock(task_rq(p));
@@ -5042,8 +5049,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
if (scx_get_task_state(p) != SCX_TASK_READY)
continue;
The patch below does not apply to the 6.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.18.y
git checkout FETCH_HEAD
git cherry-pick -x 1dd6c84f1c544e552848a8968599220bd464e338
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025122908-magician-unlaced-bef4@gregkh' --subject-prefix 'PATCH 6.18.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1dd6c84f1c544e552848a8968599220bd464e338 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang(a)linux.dev>
Date: Mon, 1 Dec 2025 19:25:40 +0800
Subject: [PATCH] sched_ext: Fix incorrect sched_class settings for per-cpu
migration tasks
When loading the eBPF scheduler, the tasks in the scx_tasks list are
traversed and __setscheduler_class() is invoked to pick each task's new
sched_class. However, this also incorrectly sets the per-CPU migration
tasks' ->sched_class to rt_sched_class, and even after the scheduler is
unloaded, the migration tasks' ->sched_class remains rt_sched_class.
The log for this issue is as follows:
./scx_rustland --stats 1
[ 199.245639][ T630] sched_ext: "rustland" does not implement cgroup cpu.weight
[ 199.269213][ T630] sched_ext: BPF scheduler "rustland" enabled
04:25:09 [INFO] RustLand scheduler attached
bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/
{ printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }'
Attaching 1 probe...
migration/0:24->rt_sched_class+0x0/0xe0
migration/1:27->rt_sched_class+0x0/0xe0
migration/2:33->rt_sched_class+0x0/0xe0
migration/3:39->rt_sched_class+0x0/0xe0
migration/4:45->rt_sched_class+0x0/0xe0
migration/5:52->rt_sched_class+0x0/0xe0
migration/6:58->rt_sched_class+0x0/0xe0
migration/7:64->rt_sched_class+0x0/0xe0
sched_ext: BPF scheduler "rustland" disabled (unregistered from user space)
EXIT: unregistered from user space
04:25:21 [INFO] Unregister RustLand scheduler
bpftrace -e 'iter:task /strcontains(ctx->task->comm, "migration")/
{ printf("%s:%d->%pS\n", ctx->task->comm, ctx->task->pid, ctx->task->sched_class); }'
Attaching 1 probe...
migration/0:24->rt_sched_class+0x0/0xe0
migration/1:27->rt_sched_class+0x0/0xe0
migration/2:33->rt_sched_class+0x0/0xe0
migration/3:39->rt_sched_class+0x0/0xe0
migration/4:45->rt_sched_class+0x0/0xe0
migration/5:52->rt_sched_class+0x0/0xe0
migration/6:58->rt_sched_class+0x0/0xe0
migration/7:64->rt_sched_class+0x0/0xe0
This commit therefore introduces a new scx_setscheduler_class() helper,
which adds a check for stop_sched_class, and uses it in place of
__setscheduler_class().
Fixes: f0e1a0643a59 ("sched_ext: Implement BPF extensible scheduler class")
Cc: stable(a)vger.kernel.org # v6.12+
Signed-off-by: Zqiang <qiang.zhang(a)linux.dev>
Reviewed-by: Andrea Righi <arighi(a)nvidia.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index dca9ca0c1854..b563b8c3fd24 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -248,6 +248,14 @@ static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
}
+static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class)
+ return &stop_sched_class;
+
+ return __setscheduler_class(p->policy, p->prio);
+}
+
/*
* scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
* ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -4241,8 +4249,7 @@ static void scx_disable_workfn(struct kthread_work *work)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
update_rq_clock(task_rq(p));
@@ -5042,8 +5049,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
while ((p = scx_task_iter_next_locked(&sti))) {
unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
const struct sched_class *old_class = p->sched_class;
- const struct sched_class *new_class =
- __setscheduler_class(p->policy, p->prio);
+ const struct sched_class *new_class = scx_setscheduler_class(p);
if (scx_get_task_state(p) != SCX_TASK_READY)
continue;
The patch below does not apply to the 6.18-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.18.y
git checkout FETCH_HEAD
git cherry-pick -x 79f3f9bedd149ea438aaeb0fb6a083637affe205
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025122900-pug-tartly-0d84@gregkh' --subject-prefix 'PATCH 6.18.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 79f3f9bedd149ea438aaeb0fb6a083637affe205 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Wed, 2 Apr 2025 20:07:34 +0200
Subject: [PATCH] sched/eevdf: Fix min_vruntime vs avg_vruntime
Basically, from the constraint that the sum of lag is zero, you can
infer that the 0-lag point is the weighted average of the individual
vruntime, which is what we're trying to compute:
      \Sum w_i * v_i
avg = --------------
         \Sum w_i
Now, since vruntime takes the whole u64 (worse, it wraps), this
multiplication term in the numerator is not something we can compute;
instead we do the min_vruntime (v0 henceforth) thing like:
v_i = (v_i - v0) + v0
This does two things:
- it keeps the key: (v_i - v0) 'small';
- it creates a relative 0-point in the modular space.
If you do that substitution and work it all out, you end up with:
      \Sum w_i * (v_i - v0)
avg = --------------------- + v0
            \Sum w_i
Since you cannot very well track a ratio like that (and not suffer
terrible numerical problems) we simply track the numerator and
denominator individually and only perform the division when strictly
needed.
Notably, the numerator lives in cfs_rq->avg_vruntime and the denominator
lives in cfs_rq->avg_load.
The one extra 'funny' is that these numbers track the entities in the
tree, and current is typically outside of the tree, so avg_vruntime()
adds current when needed before doing the division.
(vruntime_eligible() elides the division by cross-wise multiplication)
Anyway, as mentioned above, we currently use the CFS-era min_vruntime
for this purpose. However, min_vruntime can only move forward, while the
above avg can in fact move backward (when a non-eligible task leaves,
the average becomes smaller). This can cause trouble when, through
happenstance (or construction), these values drift far enough apart to
wreck the game.
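For concreteness, a small standalone numerical sketch (plain C, not kernel
code; the vruntime and weight values are made up) of the bookkeeping
described above: track the numerator and denominator separately, recover
avg = v0 + num/load, and observe the average moving backward when a
non-eligible entity leaves.

#include <stdio.h>

int main(void)
{
	long long v0 = 1000;			/* zero_vruntime reference */
	long long v[] = { 1010, 1004, 990 };	/* per-entity vruntime     */
	long long w[] = { 1024, 2048, 1024 };	/* per-entity load weight  */
	long long num = 0, load = 0;		/* avg_vruntime / avg_load */

	for (int i = 0; i < 3; i++) {
		num  += (v[i] - v0) * w[i];
		load += w[i];
	}
	printf("avg with all entities:     %lld\n", v0 + num / load);	/* 1002 */

	/* Entity 0 (vruntime 1010 > 1002) is not eligible; when it leaves,
	 * the weighted average decreases -- something the old monotonic
	 * min_vruntime could never follow. */
	num  -= (v[0] - v0) * w[0];
	load -= w[0];
	printf("avg after entity 0 leaves: %lld\n", v0 + num / load);	/* 1000 */
	return 0;
}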
Replace cfs_rq::min_vruntime with cfs_rq::zero_vruntime which is kept
near/at avg_vruntime, following its motion.
The down-side is that this requires computing the avg more often.
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Reported-by: Zicheng Qu <quzicheng(a)huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://patch.msgid.link/20251106111741.GC4068168@noisy.programming.kicks-a…
Cc: stable(a)vger.kernel.org
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 02e16b70a790..41caa22e0680 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
last = __pick_last_entity(cfs_rq);
if (last)
right_vruntime = last->vruntime;
- min_vruntime = cfs_rq->min_vruntime;
+ zero_vruntime = cfs_rq->zero_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
SPLIT_NS(left_deadline));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
SPLIT_NS(left_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
- SPLIT_NS(min_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
+ SPLIT_NS(zero_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avg_vruntime(cfs_rq)));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a11a832d63e..8d971d48669f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a,
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->min_vruntime);
+ return (s64)(se->vruntime - cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* Which we track using:
*
- * v0 := cfs_rq->min_vruntime
+ * v0 := cfs_rq->zero_vruntime
* \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
* \Sum w_i := cfs_rq->avg_load
*
- * Since min_vruntime is a monotonic increasing variable that closely tracks
- * the per-task service, these deltas: (v_i - v), will be in the order of the
- * maximal (virtual) lag induced in the system due to quantisation.
+ * Since zero_vruntime closely tracks the per-task service, these
+ * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
*
@@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
avg = div_s64(avg, load);
}
- return cfs_rq->min_vruntime + avg;
+ return cfs_rq->zero_vruntime + avg;
}
/*
@@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+ return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
- u64 min_vruntime = cfs_rq->min_vruntime;
- /*
- * open coded max_vruntime() to allow updating avg_vruntime
- */
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta > 0) {
- avg_vruntime_update(cfs_rq, delta);
- min_vruntime = vruntime;
- }
- return min_vruntime;
-}
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
-{
- struct sched_entity *se = __pick_root_entity(cfs_rq);
- struct sched_entity *curr = cfs_rq->curr;
- u64 vruntime = cfs_rq->min_vruntime;
+ avg_vruntime_update(cfs_rq, delta);
- if (curr) {
- if (curr->on_rq)
- vruntime = curr->vruntime;
- else
- curr = NULL;
- }
-
- if (se) {
- if (!curr)
- vruntime = se->min_vruntime;
- else
- vruntime = min_vruntime(vruntime, se->min_vruntime);
- }
-
- /* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+ cfs_rq->zero_vruntime = vruntime;
}
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
@@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
avg_vruntime_sub(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -1226,7 +1200,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->vruntime += calc_delta_fair(delta_exec, curr);
resched = update_deadline(cfs_rq, curr);
- update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
/*
@@ -3808,15 +3781,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
if (!curr)
__enqueue_entity(cfs_rq, se);
cfs_rq->nr_queued++;
-
- /*
- * The entity's vruntime has been adjusted, so let's check
- * whether the rq-wide min_vruntime needs updated too. Since
- * the calculations above require stable min_vruntime rather
- * than up-to-date one, we do the update at the end of the
- * reweight process.
- */
- update_min_vruntime(cfs_rq);
}
}
@@ -5429,15 +5393,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_cfs_group(se);
- /*
- * Now advance min_vruntime if @se was the entity holding it back,
- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
- * put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- i.e. we'll be penalized.
- */
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
- update_min_vruntime(cfs_rq);
-
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
@@ -9015,7 +8970,6 @@ static void yield_task_fair(struct rq *rq)
if (entity_eligible(cfs_rq, se)) {
se->vruntime = se->deadline;
se->deadline += calc_delta_fair(se->slice, se);
- update_min_vruntime(cfs_rq);
}
}
@@ -13078,23 +13032,6 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* Which shows that S and s_i transform alike (which makes perfect sense
* given that S is basically the (weighted) average of s_i).
*
- * Then:
- *
- * x -> s_min := min{s_i} (8)
- *
- * to obtain:
- *
- * \Sum_i w_i (s_i - s_min)
- * S = s_min + ------------------------ (9)
- * \Sum_i w_i
- *
- * Which already looks familiar, and is the basis for our current
- * approximation:
- *
- * S ~= s_min (10)
- *
- * Now, obviously, (10) is absolute crap :-), but it sorta works.
- *
* So the thing to remember is that the above is strictly UP. It is
* possible to generalize to multiple runqueues -- however it gets really
* yuck when you have to add affinity support, as illustrated by our very
@@ -13116,23 +13053,23 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* Let, for our runqueue 'k':
*
* T_k = \Sum_i w_i s_i
- * W_k = \Sum_i w_i ; for all i of k (11)
+ * W_k = \Sum_i w_i ; for all i of k (8)
*
* Then we can write (6) like:
*
* T_k
- * S_k = --- (12)
+ * S_k = --- (9)
* W_k
*
* From which immediately follows that:
*
* T_k + T_l
- * S_k+l = --------- (13)
+ * S_k+l = --------- (10)
* W_k + W_l
*
* On which we can define a combined lag:
*
- * lag_k+l(i) := S_k+l - s_i (14)
+ * lag_k+l(i) := S_k+l - s_i (11)
*
* And that gives us the tools to compare tasks across a combined runqueue.
*
@@ -13143,7 +13080,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* using (7); this only requires storing single 'time'-stamps.
*
* b) when comparing tasks between 2 runqueues of which one is forced-idle,
- * compare the combined lag, per (14).
+ * compare the combined lag, per (11).
*
* Now, of course cgroups (I so hate them) make this more interesting in
* that a) seems to suggest we need to iterate all cgroup on a CPU at such
@@ -13191,12 +13128,11 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* every tick. This limits the observed divergence due to the work
* conservancy.
*
- * On top of that, we can improve upon things by moving away from our
- * horrible (10) hack and moving to (9) and employing (13) here.
+ * On top of that, we can improve upon things by employing (10) here.
*/
/*
- * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
*/
static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
bool forceidle)
@@ -13210,7 +13146,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
cfs_rq->forceidle_seq = fi_seq;
}
- cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
}
}
@@ -13263,11 +13199,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
- * min_vruntime_fi, which would have been updated in prior calls
+ * zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+ (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13513,7 +13449,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+ cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
raw_spin_lock_init(&cfs_rq->removed.lock);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 82e74e8ca2ea..5a3cf81c27be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -681,10 +681,10 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
- u64 min_vruntime;
+ u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
- u64 min_vruntime_fi;
+ u64 zero_vruntime_fi;
#endif
struct rb_root_cached tasks_timeline;
The issue occurs when gfs2_freeze_lock_shared() fails in
gfs2_fill_super(). If !sb_rdonly(sb), the quotad and logd threads have
already been started; however, in the error path for
gfs2_freeze_lock_shared(), the threads are not stopped by
gfs2_destroy_threads() before jumping to fail_per_node.
This patch introduces a fail_threads label that stops the threads if
they were started.
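As a standalone illustration of the unwind ordering (the helpers below are
hypothetical stand-ins, not the gfs2 functions): anything started before a
failing step needs a label that tears it down again, in reverse order of
setup, which is what the fail_threads label provides.

#include <stdbool.h>
#include <stdio.h>

static int start_threads(void)    { puts("threads started");      return 0;  }
static void stop_threads(void)    { puts("threads stopped");                 }
static int take_freeze_lock(void) { puts("freeze lock: failing"); return -1; }

static int fill_super(bool rdonly)
{
	int err;

	if (!rdonly) {
		err = start_threads();
		if (err)
			goto fail;
	}
	err = take_freeze_lock();
	if (err)
		goto fail_threads;	/* threads may be running: stop them */
	return 0;

fail_threads:
	if (!rdonly)
		stop_threads();
fail:
	return err;
}

int main(void)
{
	return fill_super(false) ? 1 : 0;
}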
Reported-by: syzbot+4cb0d0336db6bc6930e9(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=4cb0d0336db6bc6930e9
Fixes: a28dc123fa66 ("gfs2: init system threads before freeze lock")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ryota Sakamoto <sakamo.ryota(a)gmail.com>
---
fs/gfs2/ops_fstype.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e7a88b717991ae3647c1da039636daef7005a7f0..4b5ac1a7050f1fd34e10be4100a2bc381f49c83d 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1269,21 +1269,23 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
error = gfs2_freeze_lock_shared(sdp);
if (error)
- goto fail_per_node;
+ goto fail_threads;
if (!sb_rdonly(sb))
error = gfs2_make_fs_rw(sdp);
if (error) {
gfs2_freeze_unlock(sdp);
- gfs2_destroy_threads(sdp);
fs_err(sdp, "can't make FS RW: %d\n", error);
- goto fail_per_node;
+ goto fail_threads;
}
gfs2_glock_dq_uninit(&mount_gh);
gfs2_online_uevent(sdp);
return 0;
+fail_threads:
+ if (!sb_rdonly(sb))
+ gfs2_destroy_threads(sdp);
fail_per_node:
init_per_node(sdp, UNDO);
fail_inodes:
---
base-commit: 7839932417dd53bb09eb5a585a7a92781dfd7cb2
change-id: 20251230-fix-use-after-free-gfs2-66cfbe23baa8
Best regards,
--
Ryota Sakamoto <sakamo.ryota(a)gmail.com>
Fix a bug where an empty FDA (fd array) object with 0 fds would cause an
out-of-bounds error. The previous implementation used `skip == 0` to
mean "this is a pointer fixup", but 0 is also the correct skip length
for an empty FDA. If the FDA is at the end of the buffer, then this
results in an attempt to write 8 bytes out of bounds. This is caught and
results in an EINVAL error being returned to userspace.
The pattern of using `skip == 0` as a special value originates from the
C implementation of Binder. As part of fixing this bug, this pattern is
replaced with a Rust enum.
I considered the alternate option of not pushing a fixup when the length
is zero, but I think it's cleaner to just get rid of the zero-is-special
stuff.
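The ambiguity can be shown with a tiny standalone sketch, written in C for
illustration only (the offsets are made up; the actual driver code is the
Rust diff below): with a sentinel, skip == 0 cannot distinguish a pointer
fixup from a legitimately empty fd array, so an explicit tag is needed.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum fixup_kind { FIXUP_POINTER, FIXUP_SKIP };

struct fixup {
	enum fixup_kind kind;		/* replaces the skip==0 sentinel  */
	size_t target_offset;
	union {
		uint64_t pointer_value;	/* FIXUP_POINTER                  */
		size_t skip;		/* FIXUP_SKIP; 0 is a valid value */
	};
};

static size_t fixup_len(const struct fixup *f)
{
	return f->kind == FIXUP_POINTER ? sizeof(uint64_t) : f->skip;
}

int main(void)
{
	/* Empty FDA at the very end of the buffer: under the sentinel scheme
	 * its skip of 0 would be misread as a pointer fixup, implying an
	 * 8-byte write past the end of the buffer. */
	struct fixup empty_fda = {
		.kind = FIXUP_SKIP,
		.target_offset = 316,
		.skip = 0,
	};

	printf("fixup length = %zu\n", fixup_len(&empty_fda));	/* 0, not 8 */
	return 0;
}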
The root cause of this bug was diagnosed by Gemini CLI on first try. I
used the following prompt:
> There appears to be a bug in @drivers/android/binder/thread.rs where
> the Fixups oob bug is triggered with 316 304 316 324. This implies
> that we somehow ended up with a fixup where buffer A has a pointer to
> buffer B, but the pointer is located at an index in buffer A that is
> out of bounds. Please investigate the code to find the bug. You may
> compare with @drivers/android/binder.c that implements this correctly.
Cc: stable(a)vger.kernel.org
Reported-by: DeepChirp <DeepChirp(a)outlook.com>
Closes: https://github.com/waydroid/waydroid/issues/2157
Fixes: eafedbc7c050 ("rust_binder: add Rust Binder driver")
Tested-by: DeepChirp <DeepChirp(a)outlook.com>
Signed-off-by: Alice Ryhl <aliceryhl(a)google.com>
---
drivers/android/binder/thread.rs | 59 +++++++++++++++++++++++-----------------
1 file changed, 34 insertions(+), 25 deletions(-)
diff --git a/drivers/android/binder/thread.rs b/drivers/android/binder/thread.rs
index 1a8e6fdc0dc42369ee078e720aa02b2554fb7332..dcd47e10aeb8c748d04320fbbe15ad35201684b9 100644
--- a/drivers/android/binder/thread.rs
+++ b/drivers/android/binder/thread.rs
@@ -69,17 +69,24 @@ struct ScatterGatherEntry {
}
/// This entry specifies that a fixup should happen at `target_offset` of the
-/// buffer. If `skip` is nonzero, then the fixup is a `binder_fd_array_object`
-/// and is applied later. Otherwise if `skip` is zero, then the size of the
-/// fixup is `sizeof::<u64>()` and `pointer_value` is written to the buffer.
-struct PointerFixupEntry {
- /// The number of bytes to skip, or zero for a `binder_buffer_object` fixup.
- skip: usize,
- /// The translated pointer to write when `skip` is zero.
- pointer_value: u64,
- /// The offset at which the value should be written. The offset is relative
- /// to the original buffer.
- target_offset: usize,
+/// buffer.
+enum PointerFixupEntry {
+ /// A fixup for a `binder_buffer_object`.
+ Fixup {
+ /// The translated pointer to write.
+ pointer_value: u64,
+ /// The offset at which the value should be written. The offset is relative
+ /// to the original buffer.
+ target_offset: usize,
+ },
+ /// A skip for a `binder_fd_array_object`.
+ Skip {
+ /// The number of bytes to skip.
+ skip: usize,
+ /// The offset at which the skip should happen. The offset is relative
+ /// to the original buffer.
+ target_offset: usize,
+ },
}
/// Return type of `apply_and_validate_fixup_in_parent`.
@@ -762,8 +769,7 @@ fn translate_object(
parent_entry.fixup_min_offset = info.new_min_offset;
parent_entry.pointer_fixups.push(
- PointerFixupEntry {
- skip: 0,
+ PointerFixupEntry::Fixup {
pointer_value: buffer_ptr_in_user_space,
target_offset: info.target_offset,
},
@@ -807,9 +813,8 @@ fn translate_object(
parent_entry
.pointer_fixups
.push(
- PointerFixupEntry {
+ PointerFixupEntry::Skip {
skip: fds_len,
- pointer_value: 0,
target_offset: info.target_offset,
},
GFP_KERNEL,
@@ -871,17 +876,21 @@ fn apply_sg(&self, alloc: &mut Allocation, sg_state: &mut ScatterGatherState) ->
let mut reader =
UserSlice::new(UserPtr::from_addr(sg_entry.sender_uaddr), sg_entry.length).reader();
for fixup in &mut sg_entry.pointer_fixups {
- let fixup_len = if fixup.skip == 0 {
- size_of::<u64>()
- } else {
- fixup.skip
+ let (fixup_len, fixup_offset) = match fixup {
+ PointerFixupEntry::Fixup { target_offset, .. } => {
+ (size_of::<u64>(), *target_offset)
+ }
+ PointerFixupEntry::Skip {
+ skip,
+ target_offset,
+ } => (*skip, *target_offset),
};
- let target_offset_end = fixup.target_offset.checked_add(fixup_len).ok_or(EINVAL)?;
- if fixup.target_offset < end_of_previous_fixup || offset_end < target_offset_end {
+ let target_offset_end = fixup_offset.checked_add(fixup_len).ok_or(EINVAL)?;
+ if fixup_offset < end_of_previous_fixup || offset_end < target_offset_end {
pr_warn!(
"Fixups oob {} {} {} {}",
- fixup.target_offset,
+ fixup_offset,
end_of_previous_fixup,
offset_end,
target_offset_end
@@ -890,13 +899,13 @@ fn apply_sg(&self, alloc: &mut Allocation, sg_state: &mut ScatterGatherState) ->
}
let copy_off = end_of_previous_fixup;
- let copy_len = fixup.target_offset - end_of_previous_fixup;
+ let copy_len = fixup_offset - end_of_previous_fixup;
if let Err(err) = alloc.copy_into(&mut reader, copy_off, copy_len) {
pr_warn!("Failed copying into alloc: {:?}", err);
return Err(err.into());
}
- if fixup.skip == 0 {
- let res = alloc.write::<u64>(fixup.target_offset, &fixup.pointer_value);
+ if let PointerFixupEntry::Fixup { pointer_value, .. } = fixup {
+ let res = alloc.write::<u64>(fixup_offset, pointer_value);
if let Err(err) = res {
pr_warn!("Failed copying ptr into alloc: {:?}", err);
return Err(err.into());
---
base-commit: 8f0b4cce4481fb22653697cced8d0d04027cb1e8
change-id: 20251229-fda-zero-4e46e56be58d
Best regards,
--
Alice Ryhl <aliceryhl(a)google.com>