intel_th_output_open() calls bus_find_device_by_devt(), which
internally takes a device reference via get_device(), but this
reference is not released on several error paths. When the device
driver is unavailable, the file operations cannot be obtained, or the
driver's open method fails, the function returns without calling
put_device(), permanently leaking a device reference. This prevents
the device from ever being released and can exhaust resources over
time.
Found by code review.
Cc: stable(a)vger.kernel.org
Fixes: 39f4034693b7 ("intel_th: Add driver infrastructure for Intel(R) Trace Hub devices")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
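A minimal sketch of the lookup/refcount pattern the fix enforces;
example_open() and its parameters are hypothetical, while
bus_find_device_by_devt() and put_device() are the real APIs:

	#include <linux/device.h>

	static int example_open(const struct bus_type *bus, dev_t devt)
	{
		struct device *dev;
		int err = -ENODEV;

		/* On a match, the device comes back with its refcount raised. */
		dev = bus_find_device_by_devt(bus, devt);
		if (!dev)
			return -ENODEV;	/* no reference taken, nothing to drop */

		if (!dev->driver)
			goto out_put;	/* reference held, so it must be dropped */

		/* ... success: keep the reference, drop it in release() ... */
		return 0;

	out_put:
		put_device(dev);	/* balance the lookup reference */
		return err;
	}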
drivers/hwtracing/intel_th/core.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)
diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c
index 47d9e6c3bac0..ecc4b4ff5cf6 100644
--- a/drivers/hwtracing/intel_th/core.c
+++ b/drivers/hwtracing/intel_th/core.c
@@ -811,12 +811,15 @@ static int intel_th_output_open(struct inode *inode, struct file *file)
dev = bus_find_device_by_devt(&intel_th_bus, inode->i_rdev);
- if (!dev || !dev->driver)
- return -ENODEV;
+ err = -ENODEV;
+ if (!dev)
+ goto out_no_device;
+ if (!dev->driver)
+ goto out_put_device;
thdrv = to_intel_th_driver(dev->driver);
fops = fops_get(thdrv->fops);
if (!fops)
- return -ENODEV;
+ goto out_put_device;
replace_fops(file, fops);
@@ -824,10 +827,16 @@ static int intel_th_output_open(struct inode *inode, struct file *file)
if (file->f_op->open) {
err = file->f_op->open(inode, file);
- return err;
+ if (err)
+ goto out_put_device;
}
return 0;
+
+out_put_device:
+ put_device(dev);
+out_no_device:
+ return err;
}
static const struct file_operations intel_th_output_fops = {
--
2.17.1
From: Chuck Lever <chuck.lever(a)oracle.com>
Mike noted that when NFSD responds to an NFS_FILE_SYNC WRITE, it
does not also persist file time stamps. To wit, Section 18.32.3
of RFC 8881 mandates:
> The client specifies with the stable parameter the method of how
> the data is to be processed by the server. If stable is
> FILE_SYNC4, the server MUST commit the data written plus all file
> system metadata to stable storage before returning results. This
> corresponds to the NFSv2 protocol semantics. Any other behavior
> constitutes a protocol violation. If stable is DATA_SYNC4, then
> the server MUST commit all of the data to stable storage and
> enough of the metadata to retrieve the data before returning.
Commit 3f3503adb332 ("NFSD: Use vfs_iocb_iter_write()") replaced:
- flags |= RWF_SYNC;
with:
+ kiocb.ki_flags |= IOCB_DSYNC;
which appears to be correct given:
if (flags & RWF_SYNC)
kiocb_flags |= IOCB_DSYNC;
in kiocb_set_rw_flags(). However, the author of that commit did not
appreciate that the previous line in kiocb_set_rw_flags() results
in IOCB_SYNC also being set:
kiocb_flags |= (__force int) (flags & RWF_SUPPORTED);
RWF_SUPPORTED contains RWF_SYNC, and RWF_SYNC is the same bit as
IOCB_SYNC. Reviewers at the time did not catch the omission.
Reported-by: Mike Snitzer <snitzer(a)kernel.org>
Closes: https://lore.kernel.org/linux-nfs/20251018005431.3403-1-cel@kernel.org/T/#t
Fixes: 3f3503adb332 ("NFSD: Use vfs_iocb_iter_write()")
Cc: stable(a)vger.kernel.org
Reviewed-by: Jeff Layton <jlayton(a)kernel.org>
Reviewed-by: NeilBrown <neil(a)brown.name>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Signed-off-by: Chuck Lever <chuck.lever(a)oracle.com>
---
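To see where the two flags diverge downstream, this is the relevant
branch of generic_write_sync(), lightly paraphrased from
include/linux/fs.h (check your tree for the exact form):

	if (iocb_is_dsync(iocb)) {
		/*
		 * The last argument is 'datasync': with IOCB_SYNC set
		 * it is 0, a full fsync that also persists inode
		 * metadata such as the timestamps; with IOCB_DSYNC
		 * alone it is 1, an fdatasync that persists the data
		 * plus only the metadata needed to retrieve it.
		 */
		ret = vfs_fsync_range(iocb->ki_filp,
				iocb->ki_pos - count, iocb->ki_pos - 1,
				(iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
	}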
fs/nfsd/vfs.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f537a7b4ee01..5333d49910d9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1314,8 +1314,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
stable = NFS_UNSTABLE;
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = offset;
- if (stable && !fhp->fh_use_wgather)
- kiocb.ki_flags |= IOCB_DSYNC;
+ if (likely(!fhp->fh_use_wgather)) {
+ switch (stable) {
+ case NFS_FILE_SYNC:
+ /* persist data and timestamps */
+ kiocb.ki_flags |= IOCB_DSYNC | IOCB_SYNC;
+ break;
+ case NFS_DATA_SYNC:
+ /* persist data only */
+ kiocb.ki_flags |= IOCB_DSYNC;
+ break;
+ }
+ }
nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
--
2.51.0
The function 'ep0_rx_state()' computed '&mreq->request' before
verifying that 'mreq' was valid. If 'next_ep0_request()' returned
NULL, the subsequent accesses through 'req' could lead to a NULL
pointer dereference. The return value of 'next_ep0_request()' is
checked in every other code path; this is the only place it is not.
It appears that an intended 'if (mreq)' check was mistakenly written
as 'if (req)': since 'req' is merely the address of a member of
'mreq', it is non-NULL even when 'mreq' is NULL, so the check could
never fail.
Initialize 'mreq' and 'req' to NULL by default, and test 'mreq' for
non-NULL instead of 'req', to prevent the invalid memory access.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Fixes: df2069acb005 ("usb: Add MediaTek USB3 DRD driver")
Signed-off-by: Pavel Zhigulin <Pavel.Zhigulin(a)kaspersky.com>
---
v2: Add <stable(a)vger.kernel.org> to CC list
v1: https://lore.kernel.org/all/20251027193152.3906497-1-Pavel.Zhigulin@kaspers…
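A standalone illustration (hypothetical types, not driver code) of why
the old 'if (req)' guard could never fail:

	#include <stdio.h>

	struct inner { int x; };
	struct outer { int pad; struct inner member; };

	int main(void)
	{
		struct outer *o = NULL;		/* lookup failed */
		struct inner *p = &o->member;	/* no memory load, just
						 * base + offset; undefined
						 * behaviour on a NULL base */

		if (p)	/* non-NULL for any non-zero offset: tests nothing */
			printf("guard passed despite the NULL base\n");
		return 0;
	}

Even when the member sits at offset 0, relying on the resulting
pointer arithmetic is undefined behaviour, so checking 'mreq' itself
is the only reliable test.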
drivers/usb/mtu3/mtu3_gadget_ep0.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/usb/mtu3/mtu3_gadget_ep0.c b/drivers/usb/mtu3/mtu3_gadget_ep0.c
index e4fd1bb14a55..ee7466ca4d99 100644
--- a/drivers/usb/mtu3/mtu3_gadget_ep0.c
+++ b/drivers/usb/mtu3/mtu3_gadget_ep0.c
@@ -508,8 +508,8 @@ static int handle_standard_request(struct mtu3 *mtu,
/* receive an data packet (OUT) */
static void ep0_rx_state(struct mtu3 *mtu)
{
- struct mtu3_request *mreq;
- struct usb_request *req;
+ struct mtu3_request *mreq = NULL;
+ struct usb_request *req = NULL;
void __iomem *mbase = mtu->mac_base;
u32 maxp;
u32 csr;
@@ -519,10 +519,11 @@ static void ep0_rx_state(struct mtu3 *mtu)
csr = mtu3_readl(mbase, U3D_EP0CSR) & EP0_W1C_BITS;
mreq = next_ep0_request(mtu);
- req = &mreq->request;
/* read packet and ack; or stall because of gadget driver bug */
- if (req) {
+ if (mreq) {
+ req = &mreq->request;
+
void *buf = req->buf + req->actual;
unsigned int len = req->length - req->actual;
--
2.43.0
From: NeilBrown <neil(a)brown.name>
A recent change to clamp_t() in 6.1.y caused fs/nfsd/nfs4state.c to fail
to compile with gcc-9.
The code was written with the assumption that when "max < min",
clamp(val, min, max)
would return max. This assumption is not documented as an API
promise, and the change caused a compile failure wherever it could be
statically determined that "max < min".
The relevant code was no longer present upstream when the clamp() change
landed there, so there is no upstream change to backport.
As there is no clear indication that the code was functioning
incorrectly, this patch restores exactly the behaviour from before
the clamp change, and matches what compilers other than gcc-9
produce.
clamp_t(type,v,min,max) is replaced with
__clamp((type)v, (type)min, (type)max)
Some of those type casts are unnecessary but they are included to make
the code obviously correct.
(__clamp() is the same as clamp(), but without the static API usage
test).
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220745#c0
Fixes: 1519fbc8832b ("minmax.h: use BUILD_BUG_ON_MSG() for the lo < hi test in clamp()")
Signed-off-by: NeilBrown <neil(a)brown.name>
---
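For reference, __clamp() is defined in current include/linux/minmax.h
roughly as follows, which is where the "return max" behaviour comes
from (the usage values below are made up):

	#define __clamp(val, lo, hi) \
		((val) >= (hi) ? (hi) : ((val) <= (lo) ? (lo) : (val)))

	/*
	 * With hi < lo (the "max < min" case) and val >= hi, the first
	 * arm fires and the result is hi -- the behaviour nfs4state.c
	 * has always relied on.
	 */
	unsigned long avail = 100, slotsize = 64, share = 32;
	unsigned long r = __clamp(avail, slotsize, share);	/* r == 32 */

clamp_t(), by contrast, now wraps this in a BUILD_BUG_ON_MSG() that
fires whenever the compiler can prove lo > hi, which is the gcc-9
failure this patch works around.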
fs/nfsd/nfs4state.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 08bfc2b29b65..d485a140d36d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1822,8 +1822,9 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn
*/
scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads);
- avail = clamp_t(unsigned long, avail, slotsize,
- total_avail/scale_factor);
+ avail = __clamp((unsigned long)avail,
+ (unsigned long)slotsize,
+ (unsigned long)(total_avail/scale_factor));
num = min_t(int, num, avail / slotsize);
num = max_t(int, num, 1);
nfsd_drc_mem_used += num * slotsize;
--
2.50.0.107.gf914562f5916.dirty
From: Niravkumar L Rabara <niravkumarlaxmidas.rabara(a)altera.com>
The OCRAM ECC is always enabled either by the BootROM or by the Secure
Device Manager (SDM) during a power-on reset on SoCFPGA.
However, during a warm reset the OCRAM contents are retained, while
the control and status registers are reset to their default values. As
a result, ECC must be explicitly re-enabled after a warm reset.
Fixes: 17e47dc6db4f ("EDAC/altera: Add Stratix10 OCRAM ECC support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Niravkumar L Rabara <niravkumarlaxmidas.rabara(a)altera.com>
Acked-by: Dinh Nguyen <dinguyen(a)kernel.org>
---
v2 changes:
- Add Fixes and Cc tags
- Retains Acked-by from v1 patch
v1 link:
https://lore.kernel.org/all/20251103140920.1060643-1-niravkumarlaxmidas.rab…
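The resulting init check reduces to a small decision table
(paraphrasing the hunk below; register bit names as in the driver):

	INITCOMPLETEA  ECC_EN  meaning                       action
	set            -       power-on init completed       proceed
	clear          clear   warm reset cleared the ctrl   set ECC_EN,
	                       register, data still valid    proceed
	clear          set     enabled but init never        return -ENODEV
	                       completed: invalid state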
drivers/edac/altera_edac.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
index 103b2c2eba2a..a776d61027f2 100644
--- a/drivers/edac/altera_edac.c
+++ b/drivers/edac/altera_edac.c
@@ -1184,10 +1184,22 @@ altr_check_ocram_deps_init(struct altr_edac_device_dev *device)
if (ret)
return ret;
- /* Verify OCRAM has been initialized */
+ /*
+ * Verify that OCRAM has been initialized.
+ * During a warm reset, OCRAM contents are retained, but the control
+ * and status registers are reset to their default values. Therefore,
+ * ECC must be explicitly re-enabled in the control register.
+ * Error condition: if INITCOMPLETEA is clear and ECC_EN is already set.
+ */
if (!ecc_test_bits(ALTR_A10_ECC_INITCOMPLETEA,
- (base + ALTR_A10_ECC_INITSTAT_OFST)))
- return -ENODEV;
+ (base + ALTR_A10_ECC_INITSTAT_OFST))) {
+ if (!ecc_test_bits(ALTR_A10_ECC_EN,
+ (base + ALTR_A10_ECC_CTRL_OFST)))
+ ecc_set_bits(ALTR_A10_ECC_EN,
+ (base + ALTR_A10_ECC_CTRL_OFST));
+ else
+ return -ENODEV;
+ }
/* Enable IRQ on Single Bit Error */
writel(ALTR_A10_ECC_SERRINTEN, (base + ALTR_A10_ECC_ERRINTENS_OFST));
--
2.25.1
Protect access to fore200e->available_cell_rate with the rate_mtx
lock to prevent a potential data race.
Because the update is a read-modify-write (it depends on a prior
read), a concurrent access could leave fore200e.available_cell_rate
with a wrong value.
Every other read and write of fore200e.available_cell_rate is
consistently protected by fore200e.rate_mtx; this site and
initialization are the only exceptions.
This potential bug was detected by our experimental static analysis tool,
which analyzes locking APIs and paired functions to identify data races
and atomicity violations.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Cc: stable(a)vger.kernel.org
Signed-off-by: Gui-Dong Han <2045gemini(a)gmail.com>
---
v2:
* Added a description of the data race hazard in fore200e_open(), as
suggested by Jakub Kicinski and Simon Horman.
---
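A sketch of the lost-update hazard on an unlocked read-modify-write
(hypothetical interleaving and values; 'rate' stands in for
fore200e->available_cell_rate):

	CPU0                                CPU1
	tmp = rate;        /* reads 100 */
	                                    tmp = rate;        /* reads 100 */
	rate = tmp + 10;   /* writes 110 */
	                                    rate = tmp - 30;   /* writes 70 */

The final value is 70 and CPU0's +10 is lost. Taking rate_mtx across
the whole '+=' serializes the two updates (100 -> 110 -> 80).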
drivers/atm/fore200e.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 4fea1149e003..f62e38571440 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -1374,7 +1374,9 @@ fore200e_open(struct atm_vcc *vcc)
vcc->dev_data = NULL;
+ mutex_lock(&fore200e->rate_mtx);
fore200e->available_cell_rate += vcc->qos.txtp.max_pcr;
+ mutex_unlock(&fore200e->rate_mtx);
kfree(fore200e_vcc);
return -EINVAL;
--
2.25.1
This patch series fixes delayed hw_error handling during SSR
(subsystem restart).
Patch 1 adds a wakeup to ensure hw_error is processed promptly after coredump collection.
Patch 2 corrects the timeout unit from jiffies to ms.
Changes in v2:
- Split the timeout conversion into a separate patch.
- Clarified commit messages and added a test case description.
- Link to v1: https://lore.kernel.org/all/20251104112601.2670019-1-quic_shuaz@quicinc.com/
Shuai Zhang (2):
Bluetooth: qca: Fix delayed hw_error handling due to missing wakeup
during SSR
Bluetooth: hci_qca: Convert timeout from jiffies to ms
drivers/bluetooth/hci_qca.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
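The unit bug addressed by patch 2 is the usual jiffies/milliseconds
mismatch: kernel wait primitives take their timeout in jiffies, so a
millisecond constant must be converted at the call site. A generic
sketch (the constant and wait queue are hypothetical;
msecs_to_jiffies() is the real helper):

	#define HW_ERROR_TIMEOUT_MS	5000	/* hypothetical */

	/* Wrong: a millisecond count interpreted as jiffies; the real
	 * duration then depends on CONFIG_HZ (50 s at HZ=100, 5 s at
	 * HZ=1000). */
	wait_event_timeout(wq, done, HW_ERROR_TIMEOUT_MS);

	/* Right: convert explicitly. */
	wait_event_timeout(wq, done, msecs_to_jiffies(HW_ERROR_TIMEOUT_MS));

Whichever direction the mismatch runs in hci_qca.c, the fix has the
same shape: convert with msecs_to_jiffies()/jiffies_to_msecs() rather
than passing raw counts through.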
--
2.34.1
The following commit has been merged into the sched/core branch of tip:
Commit-ID: 79f3f9bedd149ea438aaeb0fb6a083637affe205
Gitweb: https://git.kernel.org/tip/79f3f9bedd149ea438aaeb0fb6a083637affe205
Author: Peter Zijlstra <peterz(a)infradead.org>
AuthorDate: Wed, 02 Apr 2025 20:07:34 +02:00
Committer: Peter Zijlstra <peterz(a)infradead.org>
CommitterDate: Tue, 11 Nov 2025 12:33:38 +01:00
sched/eevdf: Fix min_vruntime vs avg_vruntime
Basically, from the constraint that the sum of lag is zero, you can
infer that the 0-lag point is the weighted average of the individual
vruntime, which is what we're trying to compute:
         \Sum w_i * v_i
   avg = --------------
             \Sum w_i
Now, since vruntime takes the whole u64 (worse, it wraps), this
multiplication term in the numerator is not something we can compute;
instead we do the min_vruntime (v0 henceforth) thing like:
v_i = (v_i - v0) + v0
This does two things:
- it keeps the key: (v_i - v0) 'small';
- it creates a relative 0-point in the modular space.
If you do that substitution and work it all out, you end up with:
         \Sum w_i * (v_i - v0)
   avg = --------------------- + v0
               \Sum w_i
Since you cannot very well track a ratio like that (and not suffer
terrible numerical problems) we simply track the numerator and
denominator individually and only perform the division when strictly
needed.
Notably, the numerator lives in cfs_rq->avg_vruntime and the denominator
lives in cfs_rq->avg_load.
The one extra 'funny' is that these numbers track the entities in the
tree, and current is typically outside of the tree, so avg_vruntime()
adds current when needed before doing the division.
(vruntime_eligible() elides the division by cross-wise multiplication)
Anyway, as mentioned above, we currently use the CFS-era min_vruntime
for this purpose. However, min_vruntime can only move forward, while
the above avg can in fact move backward (when a non-eligible task
leaves, the average becomes smaller); this can cause trouble when,
through happenstance (or construction), the two values drift far
enough apart to wreck the game.
Replace cfs_rq::min_vruntime with cfs_rq::zero_vruntime, which is kept
near/at avg_vruntime, following its motion.
The downside is that this requires computing the avg more often.
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy")
Reported-by: Zicheng Qu <quzicheng(a)huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Link: https://patch.msgid.link/20251106111741.GC4068168@noisy.programming.kicks-a…
Cc: stable(a)vger.kernel.org
---
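The bookkeeping described above stays O(1) per enqueue/dequeue. A
paraphrase of the existing helpers in kernel/sched/fair.c (the
sketch_* name is illustrative):

	/*
	 * Track the average's numerator and denominator separately:
	 *
	 *   cfs_rq->avg_vruntime = \Sum w_i * (v_i - v0)   (numerator)
	 *   cfs_rq->avg_load     = \Sum w_i                (denominator)
	 */
	static void sketch_avg_vruntime_add(struct cfs_rq *cfs_rq,
					    struct sched_entity *se)
	{
		unsigned long w = scale_load_down(se->load.weight);

		cfs_rq->avg_vruntime += entity_key(cfs_rq, se) * w;
		cfs_rq->avg_load += w;
	}

When the reference point v0 itself moves by delta, every key
(v_i - v0) shrinks by delta, so the numerator is rebased as in
avg_vruntime_update():

	cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta;

which is what the new update_zero_vruntime() below relies on to keep
zero_vruntime glued to the average.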
kernel/sched/debug.c | 8 +--
kernel/sched/fair.c | 114 +++++++++---------------------------------
kernel/sched/sched.h | 4 +-
3 files changed, 31 insertions(+), 95 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 02e16b7..41caa22 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
- s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+ s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
struct sched_entity *last, *first, *root;
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
last = __pick_last_entity(cfs_rq);
if (last)
right_vruntime = last->vruntime;
- min_vruntime = cfs_rq->min_vruntime;
+ zero_vruntime = cfs_rq->zero_vruntime;
raw_spin_rq_unlock_irqrestore(rq, flags);
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline",
SPLIT_NS(left_deadline));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime",
SPLIT_NS(left_vruntime));
- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
- SPLIT_NS(min_vruntime));
+ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime",
+ SPLIT_NS(zero_vruntime));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime",
SPLIT_NS(avg_vruntime(cfs_rq)));
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4a11a83..8d971d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a,
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return (s64)(se->vruntime - cfs_rq->min_vruntime);
+ return (s64)(se->vruntime - cfs_rq->zero_vruntime);
}
#define __node_2_se(node) \
@@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
*
* Which we track using:
*
- * v0 := cfs_rq->min_vruntime
+ * v0 := cfs_rq->zero_vruntime
* \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime
* \Sum w_i := cfs_rq->avg_load
*
- * Since min_vruntime is a monotonic increasing variable that closely tracks
- * the per-task service, these deltas: (v_i - v), will be in the order of the
- * maximal (virtual) lag induced in the system due to quantisation.
+ * Since zero_vruntime closely tracks the per-task service, these
+ * deltas: (v_i - v), will be in the order of the maximal (virtual) lag
+ * induced in the system due to quantisation.
*
* Also, we use scale_load_down() to reduce the size.
*
@@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
avg = div_s64(avg, load);
}
- return cfs_rq->min_vruntime + avg;
+ return cfs_rq->zero_vruntime + avg;
}
/*
@@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
load += weight;
}
- return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load;
+ return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load;
}
int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime)
+static void update_zero_vruntime(struct cfs_rq *cfs_rq)
{
- u64 min_vruntime = cfs_rq->min_vruntime;
- /*
- * open coded max_vruntime() to allow updating avg_vruntime
- */
- s64 delta = (s64)(vruntime - min_vruntime);
- if (delta > 0) {
- avg_vruntime_update(cfs_rq, delta);
- min_vruntime = vruntime;
- }
- return min_vruntime;
-}
+ u64 vruntime = avg_vruntime(cfs_rq);
+ s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime);
-static void update_min_vruntime(struct cfs_rq *cfs_rq)
-{
- struct sched_entity *se = __pick_root_entity(cfs_rq);
- struct sched_entity *curr = cfs_rq->curr;
- u64 vruntime = cfs_rq->min_vruntime;
-
- if (curr) {
- if (curr->on_rq)
- vruntime = curr->vruntime;
- else
- curr = NULL;
- }
+ avg_vruntime_update(cfs_rq, delta);
- if (se) {
- if (!curr)
- vruntime = se->min_vruntime;
- else
- vruntime = min_vruntime(vruntime, se->min_vruntime);
- }
-
- /* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime);
+ cfs_rq->zero_vruntime = vruntime;
}
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
@@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
avg_vruntime_add(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
avg_vruntime_sub(cfs_rq, se);
+ update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -1226,7 +1200,6 @@ static void update_curr(struct cfs_rq *cfs_rq)
curr->vruntime += calc_delta_fair(delta_exec, curr);
resched = update_deadline(cfs_rq, curr);
- update_min_vruntime(cfs_rq);
if (entity_is_task(curr)) {
/*
@@ -3808,15 +3781,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
if (!curr)
__enqueue_entity(cfs_rq, se);
cfs_rq->nr_queued++;
-
- /*
- * The entity's vruntime has been adjusted, so let's check
- * whether the rq-wide min_vruntime needs updated too. Since
- * the calculations above require stable min_vruntime rather
- * than up-to-date one, we do the update at the end of the
- * reweight process.
- */
- update_min_vruntime(cfs_rq);
}
}
@@ -5429,15 +5393,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_cfs_group(se);
- /*
- * Now advance min_vruntime if @se was the entity holding it back,
- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
- * put back on, and if we advance min_vruntime, we'll be placed back
- * further than we started -- i.e. we'll be penalized.
- */
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
- update_min_vruntime(cfs_rq);
-
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
@@ -9015,7 +8970,6 @@ static void yield_task_fair(struct rq *rq)
if (entity_eligible(cfs_rq, se)) {
se->vruntime = se->deadline;
se->deadline += calc_delta_fair(se->slice, se);
- update_min_vruntime(cfs_rq);
}
}
@@ -13078,23 +13032,6 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* Which shows that S and s_i transform alike (which makes perfect sense
* given that S is basically the (weighted) average of s_i).
*
- * Then:
- *
- * x -> s_min := min{s_i} (8)
- *
- * to obtain:
- *
- * \Sum_i w_i (s_i - s_min)
- * S = s_min + ------------------------ (9)
- * \Sum_i w_i
- *
- * Which already looks familiar, and is the basis for our current
- * approximation:
- *
- * S ~= s_min (10)
- *
- * Now, obviously, (10) is absolute crap :-), but it sorta works.
- *
* So the thing to remember is that the above is strictly UP. It is
* possible to generalize to multiple runqueues -- however it gets really
* yuck when you have to add affinity support, as illustrated by our very
@@ -13116,23 +13053,23 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* Let, for our runqueue 'k':
*
* T_k = \Sum_i w_i s_i
- * W_k = \Sum_i w_i ; for all i of k (11)
+ * W_k = \Sum_i w_i ; for all i of k (8)
*
* Then we can write (6) like:
*
* T_k
- * S_k = --- (12)
+ * S_k = --- (9)
* W_k
*
* From which immediately follows that:
*
* T_k + T_l
- * S_k+l = --------- (13)
+ * S_k+l = --------- (10)
* W_k + W_l
*
* On which we can define a combined lag:
*
- * lag_k+l(i) := S_k+l - s_i (14)
+ * lag_k+l(i) := S_k+l - s_i (11)
*
* And that gives us the tools to compare tasks across a combined runqueue.
*
@@ -13143,7 +13080,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* using (7); this only requires storing single 'time'-stamps.
*
* b) when comparing tasks between 2 runqueues of which one is forced-idle,
- * compare the combined lag, per (14).
+ * compare the combined lag, per (11).
*
* Now, of course cgroups (I so hate them) make this more interesting in
* that a) seems to suggest we need to iterate all cgroup on a CPU at such
@@ -13191,12 +13128,11 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* every tick. This limits the observed divergence due to the work
* conservancy.
*
- * On top of that, we can improve upon things by moving away from our
- * horrible (10) hack and moving to (9) and employing (13) here.
+ * On top of that, we can improve upon things by employing (10) here.
*/
/*
- * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
+ * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed.
*/
static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
bool forceidle)
@@ -13210,7 +13146,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq,
cfs_rq->forceidle_seq = fi_seq;
}
- cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
+ cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime;
}
}
@@ -13263,11 +13199,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
/*
* Find delta after normalizing se's vruntime with its cfs_rq's
- * min_vruntime_fi, which would have been updated in prior calls
+ * zero_vruntime_fi, which would have been updated in prior calls
* to se_fi_update().
*/
delta = (s64)(sea->vruntime - seb->vruntime) +
- (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
+ (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi);
return delta > 0;
}
@@ -13513,7 +13449,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+ cfs_rq->zero_vruntime = (u64)(-(1LL << 20));
raw_spin_lock_init(&cfs_rq->removed.lock);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 82e74e8..5a3cf81 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -681,10 +681,10 @@ struct cfs_rq {
s64 avg_vruntime;
u64 avg_load;
- u64 min_vruntime;
+ u64 zero_vruntime;
#ifdef CONFIG_SCHED_CORE
unsigned int forceidle_seq;
- u64 min_vruntime_fi;
+ u64 zero_vruntime_fi;
#endif
struct rb_root_cached tasks_timeline;