This patch fixes an issue that was originally fixed in commit
df7b59ba9245 ("dm verity: fix FEC for RS roots unaligned to block size")
but was broken again by commit
8ca7cab82bda ("dm verity fec: fix misaligned RS roots IO").
If the Reed-Solomon roots setting spans multiple blocks, the code does not
use the proper parity bytes and randomly fails to repair even trivial errors.
This bug cannot happen if the sector size is a multiple of the RS roots
setting (the Android case with roots = 2).
The previous solution was to find a dm-bufio block size that is a multiple
of both the device sector size and the roots size. Unfortunately, the
optimization in commit 8ca7cab82bda ("dm verity fec: fix misaligned RS roots IO")
is incorrect and uses the data block size for some roots settings (for
example, it uses a 4096-byte block size for roots = 20).
This patch uses a different approach:
- It always uses the configured data block size for dm-bufio to avoid
possible misaligned IOs.
- It caches the parity bytes already read, so they can be joined with the
remainder when the parity spans two blocks.
As the RS calculation is called only if an error is detected and
the process is computationally intensive, copying a few more bytes
should not introduce performance issues.
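As a worked illustration (assumed numbers, not part of the original
report): with a 4096-byte data block, and therefore io_size = 4096, and
roots = 20, the parity bytes for RS block i start at byte i * 20 of the
FEC area, i.e. at offset (i * 20) mod 4096 inside dm-bufio block
(i * 20) / 4096. For i = 204 that offset is 4080, so only 16 of the 20
parity bytes fit into the current buffer and the remaining 4 must come
from the start of the next one; the change below stashes those 16 bytes
in par_buf and completes them after reading the following block.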
The issue was reported to cryptsetup with a trivial reproducer:
https://gitlab.com/cryptsetup/cryptsetup/-/issues/923
Reproducer (with roots=20):
# create verity device with RS FEC
dd if=/dev/urandom of=data.img bs=4096 count=8 status=none
veritysetup format data.img hash.img --fec-device=fec.img --fec-roots=20 | \
awk '/^Root hash/{ print $3 }' >roothash
# create an erasure that should always be repairable with this roots setting
dd if=/dev/zero of=data.img conv=notrunc bs=1 count=4 seek=4 status=none
# try to read it through dm-verity
veritysetup open data.img test hash.img --fec-device=fec.img --fec-roots=20 $(cat roothash)
dd if=/dev/mapper/test of=/dev/null bs=4096 status=noxfer
Even in this trivial case, the log says it cannot be repaired:
: verity-fec: 7:1: FEC 0: failed to correct: -74
: device-mapper: verity: 7:1: data block 0 is corrupted
...
With this fix, the errors are properly repaired:
: verity-fec: 7:1: FEC 0: corrected 4 errors
Signed-off-by: Milan Broz <gmazyland(a)gmail.com>
Fixes: 8ca7cab82bda ("dm verity fec: fix misaligned RS roots IO")
Cc: stable(a)vger.kernel.org
---
drivers/md/dm-verity-fec.c | 40 +++++++++++++++++++++++++-------------
1 file changed, 26 insertions(+), 14 deletions(-)
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 62b1a44b8dd2..6bd9848518d4 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -60,15 +60,19 @@ static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio,
* to the data block. Caller is responsible for releasing buf.
*/
static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index,
- unsigned int *offset, struct dm_buffer **buf,
- unsigned short ioprio)
+ unsigned int *offset, unsigned int par_buf_offset,
+ struct dm_buffer **buf, unsigned short ioprio)
{
u64 position, block, rem;
u8 *res;
+ /* We have already part of parity bytes read, skip to the next block */
+ if (par_buf_offset)
+ index++;
+
position = (index + rsb) * v->fec->roots;
block = div64_u64_rem(position, v->fec->io_size, &rem);
- *offset = (unsigned int)rem;
+ *offset = par_buf_offset ? 0 : (unsigned int)rem;
res = dm_bufio_read_with_ioprio(v->fec->bufio, block, buf, ioprio);
if (IS_ERR(res)) {
@@ -128,11 +132,12 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
{
int r, corrected = 0, res;
struct dm_buffer *buf;
- unsigned int n, i, offset;
- u8 *par, *block;
+ unsigned int n, i, offset, par_buf_offset = 0;
+ u8 *par, *block, par_buf[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
- par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio));
+ par = fec_read_parity(v, rsb, block_offset, &offset,
+ par_buf_offset, &buf, bio_prio(bio));
if (IS_ERR(par))
return PTR_ERR(par);
@@ -142,7 +147,8 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
*/
fec_for_each_buffer_rs_block(fio, n, i) {
block = fec_buffer_rs_block(v, fio, n, i);
- res = fec_decode_rs8(v, fio, block, &par[offset], neras);
+ memcpy(&par_buf[par_buf_offset], &par[offset], v->fec->roots - par_buf_offset);
+ res = fec_decode_rs8(v, fio, block, par_buf, neras);
if (res < 0) {
r = res;
goto error;
@@ -155,12 +161,21 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io,
if (block_offset >= 1 << v->data_dev_block_bits)
goto done;
- /* read the next block when we run out of parity bytes */
- offset += v->fec->roots;
+ /* Read the next block when we run out of parity bytes */
+ offset += (v->fec->roots - par_buf_offset);
+ /* Check if parity bytes are split between blocks */
+ if (offset < v->fec->io_size && (offset + v->fec->roots) > v->fec->io_size) {
+ par_buf_offset = v->fec->io_size - offset;
+ memcpy(par_buf, &par[offset], par_buf_offset);
+ offset += par_buf_offset;
+ } else
+ par_buf_offset = 0;
+
if (offset >= v->fec->io_size) {
dm_bufio_release(buf);
- par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio));
+ par = fec_read_parity(v, rsb, block_offset, &offset,
+ par_buf_offset, &buf, bio_prio(bio));
if (IS_ERR(par))
return PTR_ERR(par);
}
@@ -724,10 +739,7 @@ int verity_fec_ctr(struct dm_verity *v)
return -E2BIG;
}
- if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1))
- f->io_size = 1 << v->data_dev_block_bits;
- else
- f->io_size = v->fec->roots << SECTOR_SHIFT;
+ f->io_size = 1 << v->data_dev_block_bits;
f->bufio = dm_bufio_client_create(f->dev->bdev,
f->io_size,
--
2.45.2
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x bcc80dec91ee745b3d66f3e48f0ec2efdea97149
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024122324-twitter-clustered-891d@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From bcc80dec91ee745b3d66f3e48f0ec2efdea97149 Mon Sep 17 00:00:00 2001
From: Naman Jain <namjain(a)linux.microsoft.com>
Date: Tue, 17 Sep 2024 11:09:17 +0530
Subject: [PATCH] x86/hyperv: Fix hv tsc page based sched_clock for hibernation
read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is
bigger than the variable hv_sched_clock_offset, which is cached during
early boot, but depending on the timing this assumption may be false
when a hibernated VM starts again (the clock counter starts from 0
again) and is resuming back (Note: hv_init_tsc_clocksource() is not
called during hibernation/resume); consequently,
read_hv_sched_clock_tsc() may return a negative integer (which is
interpreted as a huge positive integer since the return type is u64)
and new kernel messages are prefixed with huge timestamps before
read_hv_sched_clock_tsc() grows big enough (which typically takes
several seconds).
Fix the issue by saving the Hyper-V clock counter just before the
suspend, and using it to correct the hv_sched_clock_offset in
resume. This makes hv tsc page based sched_clock continuous and ensures
that post resume, it starts from where it left off during suspend.
Override x86_platform.save_sched_clock_state and
x86_platform.restore_sched_clock_state routines to correct this as soon
as possible.
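As a rough sketch of the arithmetic involved (illustrative names and a
simplified formula, not the exact kernel code), sched_clock is essentially
the current reference counter minus an offset cached near boot, so a
counter that restarts at zero after hibernation makes the unsigned
subtraction wrap:

    /* Illustrative only: why timestamps jump after hibernation and how
     * adjusting the cached offset at resume restores continuity.
     * All names here are hypothetical. */
    typedef unsigned long long u64;

    static u64 offset;  /* cached near boot */

    static u64 sched_clock_sketch(u64 counter_now)
    {
            /* after hibernation counter_now may be smaller than offset,
             * so this wraps around to a huge u64 value */
            return counter_now - offset;
    }

    static void resume_fixup_sketch(u64 counter_at_suspend, u64 counter_now)
    {
            /* offset -= (counter_at_suspend - counter_now); afterwards
             * sched_clock_sketch(counter_now) returns roughly the value
             * read at suspend time, so the clock stays continuous */
            offset -= counter_at_suspend - counter_now;
    }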
Note: if Invariant TSC is available, the issue doesn't happen because
1) we don't register read_hv_sched_clock_tsc() for sched clock:
See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework
clocksource and sched clock setup");
2) the common x86 code adjusts TSC similarly: see
__restore_processor_state() -> tsc_verify_tsc_adjust(true) and
x86_platform.restore_sched_clock_state().
Cc: stable(a)vger.kernel.org
Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation")
Co-developed-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
Signed-off-by: Naman Jain <namjain(a)linux.microsoft.com>
Reviewed-by: Michael Kelley <mhklinux(a)outlook.com>
Link: https://lore.kernel.org/r/20240917053917.76787-1-namjain@linux.microsoft.com
Signed-off-by: Wei Liu <wei.liu(a)kernel.org>
Message-ID: <20240917053917.76787-1-namjain(a)linux.microsoft.com>
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index d18078834ded..dc12fe5ef3ca 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -223,6 +223,63 @@ static void hv_machine_crash_shutdown(struct pt_regs *regs)
hyperv_cleanup();
}
#endif /* CONFIG_CRASH_DUMP */
+
+static u64 hv_ref_counter_at_suspend;
+static void (*old_save_sched_clock_state)(void);
+static void (*old_restore_sched_clock_state)(void);
+
+/*
+ * Hyper-V clock counter resets during hibernation. Save and restore clock
+ * offset during suspend/resume, while also considering the time passed
+ * before suspend. This is to make sure that sched_clock using hv tsc page
+ * based clocksource, proceeds from where it left off during suspend and
+ * it shows correct time for the timestamps of kernel messages after resume.
+ */
+static void save_hv_clock_tsc_state(void)
+{
+ hv_ref_counter_at_suspend = hv_read_reference_counter();
+}
+
+static void restore_hv_clock_tsc_state(void)
+{
+ /*
+ * Adjust the offsets used by hv tsc clocksource to
+ * account for the time spent before hibernation.
+ * adjusted value = reference counter (time) at suspend
+ * - reference counter (time) now.
+ */
+ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
+}
+
+/*
+ * Functions to override save_sched_clock_state and restore_sched_clock_state
+ * functions of x86_platform. The Hyper-V clock counter is reset during
+ * suspend-resume and the offset used to measure time needs to be
+ * corrected, post resume.
+ */
+static void hv_save_sched_clock_state(void)
+{
+ old_save_sched_clock_state();
+ save_hv_clock_tsc_state();
+}
+
+static void hv_restore_sched_clock_state(void)
+{
+ restore_hv_clock_tsc_state();
+ old_restore_sched_clock_state();
+}
+
+static void __init x86_setup_ops_for_tsc_pg_clock(void)
+{
+ if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
+ return;
+
+ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
+ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
+
+ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
+ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
+}
#endif /* CONFIG_HYPERV */
static uint32_t __init ms_hyperv_platform(void)
@@ -579,6 +636,7 @@ static void __init ms_hyperv_init_platform(void)
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
+ x86_setup_ops_for_tsc_pg_clock();
hv_vtl_init_platform();
#endif
/*
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c
index 99177835cade..b39dee7b93af 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -27,7 +27,8 @@
#include <asm/mshyperv.h>
static struct clock_event_device __percpu *hv_clock_event;
-static u64 hv_sched_clock_offset __ro_after_init;
+/* Note: offset can hold negative values after hibernation. */
+static u64 hv_sched_clock_offset __read_mostly;
/*
* If false, we're using the old mechanism for stimer0 interrupts
@@ -470,6 +471,17 @@ static void resume_hv_clock_tsc(struct clocksource *arg)
hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
}
+/*
+ * Called during resume from hibernation, from overridden
+ * x86_platform.restore_sched_clock_state routine. This is to adjust offsets
+ * used to calculate time for hv tsc page based sched_clock, to account for
+ * time spent before hibernation.
+ */
+void hv_adj_sched_clock_offset(u64 offset)
+{
+ hv_sched_clock_offset -= offset;
+}
+
#ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
static int hv_cs_enable(struct clocksource *cs)
{
diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h
index 6cdc873ac907..aa5233b1eba9 100644
--- a/include/clocksource/hyperv_timer.h
+++ b/include/clocksource/hyperv_timer.h
@@ -38,6 +38,8 @@ extern void hv_remap_tsc_clocksource(void);
extern unsigned long hv_get_tsc_pfn(void);
extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
+extern void hv_adj_sched_clock_offset(u64 offset);
+
static __always_inline bool
hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
u64 *cur_tsc, u64 *time)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x bcc80dec91ee745b3d66f3e48f0ec2efdea97149
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024122327-imaginary-gizzard-8e8b@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x bcc80dec91ee745b3d66f3e48f0ec2efdea97149
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024122325-flavoring-mute-cb7e@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
From: Kaixin Wang <kxwang23(a)m.fudan.edu.cn>
[ Upstream commit 609366e7a06d035990df78f1562291c3bf0d4a12 ]
In the cdns_i3c_master_probe function, &master->hj_work is bound to
cdns_i3c_master_hj, and cdns_i3c_master_interrupt can call the
cdns_i3c_master_demux_ibis function to start that work.
If we remove the module, cdns_i3c_master_remove is called to perform the
cleanup; it frees master->base through i3c_master_unregister while the
work mentioned above may still be using it. The sequence of operations
that may lead to a UAF bug is as follows:
CPU0                                      CPU1
                                        | cdns_i3c_master_hj
cdns_i3c_master_remove                  |
  i3c_master_unregister(&master->base)  |
    device_unregister(&master->dev)     |
      device_release                    |
      //free master->base               |
                                        | i3c_master_do_daa(&master->base)
                                        | //use master->base
Fix it by ensuring that the work is canceled before proceeding with
the cleanup in cdns_i3c_master_remove.
Signed-off-by: Kaixin Wang <kxwang23(a)m.fudan.edu.cn>
Link: https://lore.kernel.org/r/20240911153544.848398-1-kxwang23@m.fudan.edu.cn
Signed-off-by: Alexandre Belloni <alexandre.belloni(a)bootlin.com>
Signed-off-by: Jianqi Ren <jianqi.ren.cn(a)windriver.com>
---
drivers/i3c/master/i3c-master-cdns.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/i3c/master/i3c-master-cdns.c b/drivers/i3c/master/i3c-master-cdns.c
index b9cfda6ae9ae..4473c0b1ae2e 100644
--- a/drivers/i3c/master/i3c-master-cdns.c
+++ b/drivers/i3c/master/i3c-master-cdns.c
@@ -1668,6 +1668,7 @@ static int cdns_i3c_master_remove(struct platform_device *pdev)
struct cdns_i3c_master *master = platform_get_drvdata(pdev);
int ret;
+ cancel_work_sync(&master->hj_work);
ret = i3c_master_unregister(&master->base);
if (ret)
return ret;
--
2.25.1
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x bcc80dec91ee745b3d66f3e48f0ec2efdea97149
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024122326-eraser-perennial-6b59@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
From: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
[ Upstream commit 05b36b04d74a517d6675bf2f90829ff1ac7e28dc ]
Shinichiro reported the following use-after-free that sometimes happens in
our CI system when running fstests' btrfs/284 on a TCMU runner device:
BUG: KASAN: slab-use-after-free in lock_release+0x708/0x780
Read of size 8 at addr ffff888106a83f18 by task kworker/u80:6/219
CPU: 8 UID: 0 PID: 219 Comm: kworker/u80:6 Not tainted 6.12.0-rc6-kts+ #15
Hardware name: Supermicro Super Server/X11SPi-TF, BIOS 3.3 02/21/2020
Workqueue: btrfs-endio btrfs_end_bio_work [btrfs]
Call Trace:
<TASK>
dump_stack_lvl+0x6e/0xa0
? lock_release+0x708/0x780
print_report+0x174/0x505
? lock_release+0x708/0x780
? __virt_addr_valid+0x224/0x410
? lock_release+0x708/0x780
kasan_report+0xda/0x1b0
? lock_release+0x708/0x780
? __wake_up+0x44/0x60
lock_release+0x708/0x780
? __pfx_lock_release+0x10/0x10
? __pfx_do_raw_spin_lock+0x10/0x10
? lock_is_held_type+0x9a/0x110
_raw_spin_unlock_irqrestore+0x1f/0x60
__wake_up+0x44/0x60
btrfs_encoded_read_endio+0x14b/0x190 [btrfs]
btrfs_check_read_bio+0x8d9/0x1360 [btrfs]
? lock_release+0x1b0/0x780
? trace_lock_acquire+0x12f/0x1a0
? __pfx_btrfs_check_read_bio+0x10/0x10 [btrfs]
? process_one_work+0x7e3/0x1460
? lock_acquire+0x31/0xc0
? process_one_work+0x7e3/0x1460
process_one_work+0x85c/0x1460
? __pfx_process_one_work+0x10/0x10
? assign_work+0x16c/0x240
worker_thread+0x5e6/0xfc0
? __pfx_worker_thread+0x10/0x10
kthread+0x2c3/0x3a0
? __pfx_kthread+0x10/0x10
ret_from_fork+0x31/0x70
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>
Allocated by task 3661:
kasan_save_stack+0x30/0x50
kasan_save_track+0x14/0x30
__kasan_kmalloc+0xaa/0xb0
btrfs_encoded_read_regular_fill_pages+0x16c/0x6d0 [btrfs]
send_extent_data+0xf0f/0x24a0 [btrfs]
process_extent+0x48a/0x1830 [btrfs]
changed_cb+0x178b/0x2ea0 [btrfs]
btrfs_ioctl_send+0x3bf9/0x5c20 [btrfs]
_btrfs_ioctl_send+0x117/0x330 [btrfs]
btrfs_ioctl+0x184a/0x60a0 [btrfs]
__x64_sys_ioctl+0x12e/0x1a0
do_syscall_64+0x95/0x180
entry_SYSCALL_64_after_hwframe+0x76/0x7e
Freed by task 3661:
kasan_save_stack+0x30/0x50
kasan_save_track+0x14/0x30
kasan_save_free_info+0x3b/0x70
__kasan_slab_free+0x4f/0x70
kfree+0x143/0x490
btrfs_encoded_read_regular_fill_pages+0x531/0x6d0 [btrfs]
send_extent_data+0xf0f/0x24a0 [btrfs]
process_extent+0x48a/0x1830 [btrfs]
changed_cb+0x178b/0x2ea0 [btrfs]
btrfs_ioctl_send+0x3bf9/0x5c20 [btrfs]
_btrfs_ioctl_send+0x117/0x330 [btrfs]
btrfs_ioctl+0x184a/0x60a0 [btrfs]
__x64_sys_ioctl+0x12e/0x1a0
do_syscall_64+0x95/0x180
entry_SYSCALL_64_after_hwframe+0x76/0x7e
The buggy address belongs to the object at ffff888106a83f00
which belongs to the cache kmalloc-rnd-07-96 of size 96
The buggy address is located 24 bytes inside of
freed 96-byte region [ffff888106a83f00, ffff888106a83f60)
The buggy address belongs to the physical page:
page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888106a83800 pfn:0x106a83
flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff)
page_type: f5(slab)
raw: 0017ffffc0000000 ffff888100053680 ffffea0004917200 0000000000000004
raw: ffff888106a83800 0000000080200019 00000001f5000000 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff888106a83e00: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
ffff888106a83e80: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
>ffff888106a83f00: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
^
ffff888106a83f80: fa fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
ffff888106a84000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================
Further analysis of the trace and the crash dump's vmcore file shows that
the wake_up() call in btrfs_encoded_read_endio() operates on the wait_queue
that is in the private data passed to the end_io handler.
Commit 4ff47df40447 ("btrfs: move priv off stack in
btrfs_encoded_read_regular_fill_pages()") moved 'struct
btrfs_encoded_read_private' off the stack.
Before that commit one can see a corruption of the private data when
analyzing the vmcore after a crash:
*(struct btrfs_encoded_read_private *)0xffff88815626eec8 = {
.wait = (wait_queue_head_t){
.lock = (spinlock_t){
.rlock = (struct raw_spinlock){
.raw_lock = (arch_spinlock_t){
.val = (atomic_t){
.counter = (int)-2005885696,
},
.locked = (u8)0,
.pending = (u8)157,
.locked_pending = (u16)40192,
.tail = (u16)34928,
},
.magic = (unsigned int)536325682,
.owner_cpu = (unsigned int)29,
.owner = (void *)__SCT__tp_func_btrfs_transaction_commit+0x0 = 0x0,
.dep_map = (struct lockdep_map){
.key = (struct lock_class_key *)0xffff8881575a3b6c,
.class_cache = (struct lock_class *[2]){ 0xffff8882a71985c0, 0xffffea00066f5d40 },
.name = (const char *)0xffff88815626f100 = "",
.wait_type_outer = (u8)37,
.wait_type_inner = (u8)178,
.lock_type = (u8)154,
},
},
.__padding = (u8 [24]){ 0, 157, 112, 136, 50, 174, 247, 31, 29 },
.dep_map = (struct lockdep_map){
.key = (struct lock_class_key *)0xffff8881575a3b6c,
.class_cache = (struct lock_class *[2]){ 0xffff8882a71985c0, 0xffffea00066f5d40 },
.name = (const char *)0xffff88815626f100 = "",
.wait_type_outer = (u8)37,
.wait_type_inner = (u8)178,
.lock_type = (u8)154,
},
},
.head = (struct list_head){
.next = (struct list_head *)0x112cca,
.prev = (struct list_head *)0x47,
},
},
.pending = (atomic_t){
.counter = (int)-1491499288,
},
.status = (blk_status_t)130,
}
Here we can see several indicators of in-memory data corruption, e.g. the
large negative atomic values of ->pending or
->wait->lock->rlock->raw_lock->val, as well as the bogus spinlock magic
0x1ff7ae32 (decimal 536325682 above) instead of 0xdead4ead or the bogus
pointer values for ->wait->head.
To fix this, change atomic_dec_return() to atomic_dec_and_test(), as
atomic_dec_return() is defined as two instructions on x86_64, whereas
atomic_dec_and_test() is defined as a single atomic operation. The
two-instruction variant can lead to a situation where the counter value is
already decremented but the if statement in btrfs_encoded_read_endio() has
not been completely processed, i.e. the test for 0 has not completed. If
another thread continues executing btrfs_encoded_read_regular_fill_pages(),
the atomic_dec_return() there can see an already updated ->pending counter
and continue by freeing the private data. Back in the endio handler, the
test for 0 then succeeds and the wait_queue is woken up, resulting in a
use-after-free.
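As a minimal userspace sketch of the pattern (illustrative names only, not
the btrfs code): the completing side must decrement the reference count and
test for zero in a single atomic operation, because the moment the count
can be observed as zero the waiting side is free to wake up and release the
shared private data.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct priv {
            atomic_int pending;     /* outstanding completions */
            /* ... wait queue, status, pages ... */
    };

    /*
     * Completion side: decrement and learn whether we reached zero in ONE
     * atomic step.  atomic_fetch_sub() returns the previous value, so
     * "== 1" means the counter just became 0 and the waiter may be woken.
     * 'p' must not be touched afterwards: once pending reads as 0, the
     * waiting side may already have freed it.  A split decrement followed
     * by a separate test re-opens exactly that window.
     */
    static bool complete_one(struct priv *p)
    {
            return atomic_fetch_sub(&p->pending, 1) == 1;
    }

    int main(void)
    {
            struct priv p = { .pending = 2 };

            printf("%d\n", complete_one(&p));  /* 0: still one pending */
            printf("%d\n", complete_one(&p));  /* 1: last one, wake the waiter */
            return 0;
    }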
Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki(a)wdc.com>
Suggested-by: Damien Le Moal <Damien.LeMoal(a)wdc.com>
Fixes: 1881fba89bd5 ("btrfs: add BTRFS_IOC_ENCODED_READ ioctl")
CC: stable(a)vger.kernel.org # 6.1+
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: Alva Lan <alvalan9(a)foxmail.com>
---
fs/btrfs/inode.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d6767f728c07..eb9319d856f2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9972,7 +9972,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (!atomic_dec_return(&priv->pending))
+ if (atomic_dec_and_test(&priv->pending))
wake_up(&priv->wait);
bio_put(&bbio->bio);
}
--
2.43.0
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f64f610ec6ab59dd0391b03842cea3a4cd8ee34f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025010307-papaya-expenses-89a5@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f64f610ec6ab59dd0391b03842cea3a4cd8ee34f Mon Sep 17 00:00:00 2001
From: Lucas Stach <l.stach(a)pengutronix.de>
Date: Wed, 18 Dec 2024 19:44:33 +0100
Subject: [PATCH] pmdomain: core: add dummy release function to genpd device
The genpd device, which is really only used as a handle to look up OPP and
is not even registered with the device core otherwise (its lifetime is thus
tied to the genpd struct it is contained in), is missing a release
function. After b8f7bbd1f4ec ("pmdomain: core: Add missing put_device()")
the device will be cleaned up through the driver core device_release()
function, which warns when no release callback is present for the device.
Add a dummy release function to silence the warning.
Signed-off-by: Lucas Stach <l.stach(a)pengutronix.de>
Tested-by: Luca Ceresoli <luca.ceresoli(a)bootlin.com>
Fixes: b8f7bbd1f4ec ("pmdomain: core: Add missing put_device()")
Cc: stable(a)vger.kernel.org
Message-ID: <20241218184433.1930532-1-l.stach(a)pengutronix.de>
Signed-off-by: Ulf Hansson <ulf.hansson(a)linaro.org>
diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index bb11f467dc78..20a9efebbcb7 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -2142,6 +2142,11 @@ static int genpd_set_default_power_state(struct generic_pm_domain *genpd)
return 0;
}
+static void genpd_provider_release(struct device *dev)
+{
+ /* nothing to be done here */
+}
+
static int genpd_alloc_data(struct generic_pm_domain *genpd)
{
struct genpd_governor_data *gd = NULL;
@@ -2173,6 +2178,7 @@ static int genpd_alloc_data(struct generic_pm_domain *genpd)
genpd->gd = gd;
device_initialize(&genpd->dev);
+ genpd->dev.release = genpd_provider_release;
if (!genpd_is_dev_name_fw(genpd)) {
dev_set_name(&genpd->dev, "%s", genpd->name);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x f64f610ec6ab59dd0391b03842cea3a4cd8ee34f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025010306-chooser-varmint-6746@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h