When using the in-kernel pd-mapper on x1e80100, client drivers often
fail to communicate with the firmware during boot, which specifically
breaks battery and USB-C altmode notifications. This has been observed
to happen on almost every second boot (41%), but likely depends on probe
order:
pmic_glink_altmode.pmic_glink_altmode pmic_glink.altmode.0: failed to send altmode request: 0x10 (-125)
pmic_glink_altmode.pmic_glink_altmode pmic_glink.altmode.0: failed to request altmode notifications: -125
ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI read request: -125
qcom_battmgr.pmic_glink_power_supply pmic_glink.power-supply.0: failed to request power notifications
In the same setup, audio also fails to probe, albeit much more rarely:
PDR: avs/audio get domain list txn wait failed: -110
PDR: service lookup for avs/audio failed: -110
Chris Lew has provided an analysis and is working on a fix for the
ECANCELED (125) errors, but it is not yet clear whether this will also
address the audio regression.
Even though this was first observed on x1e80100, there is currently no
reason to believe that these issues are specific to that platform.
Disable the in-kernel pd-mapper for now, and make sure to backport this
to stable to prevent users and distros from migrating away from the
user-space service.
Fixes: 1ebcde047c54 ("soc: qcom: add pd-mapper implementation")
Cc: stable@vger.kernel.org # 6.11
Link: https://lore.kernel.org/lkml/Zqet8iInnDhnxkT9@hovoldconsulting.com/
Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
---
It's now been over two months since I reported this regression, and even
though we seem to be making some progress on at least some of these issues,
I think we need to disable the pd-mapper temporarily until the fixes are in
place (e.g. to prevent distros from dropping the user-space service).
Johan
#regzbot introduced: 1ebcde047c54
drivers/soc/qcom/Kconfig | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig
index 74b9121240f8..35ddab9338d4 100644
--- a/drivers/soc/qcom/Kconfig
+++ b/drivers/soc/qcom/Kconfig
@@ -78,6 +78,7 @@ config QCOM_PD_MAPPER
select QCOM_PDR_MSG
select AUXILIARY_BUS
depends on NET && QRTR && (ARCH_QCOM || COMPILE_TEST)
+ depends on BROKEN
default QCOM_RPROC_COMMON
help
The Protection Domain Mapper maps registered services to the domains
--
2.45.2
From: Bard Liao <yung-chuan.liao@linux.intel.com>
[ Upstream commit 569922b82ca660f8b24e705f6cf674e6b1f99cc7 ]
Each CPU DAI should be associated with a widget. However, the topology might
not create the right number of DAI widgets for aggregated amps, which will
cause a NULL pointer dereference.
Check that the DAI widget associated with the CPU DAI is valid to prevent a
NULL pointer dereference due to missing DAI widgets in topologies with
aggregated amps.
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Liam Girdwood <liam.r.girdwood@intel.com>
Link: https://patch.msgid.link/20241203104853.56956-1-yung-chuan.liao@linux.intel…
Signed-off-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
sound/soc/sof/intel/hda-dai.c | 12 ++++++++++++
sound/soc/sof/intel/hda.c | 5 +++++
2 files changed, 17 insertions(+)
diff --git a/sound/soc/sof/intel/hda-dai.c b/sound/soc/sof/intel/hda-dai.c
index 0db2a3e554fb2..da12aabc1bb85 100644
--- a/sound/soc/sof/intel/hda-dai.c
+++ b/sound/soc/sof/intel/hda-dai.c
@@ -503,6 +503,12 @@ int sdw_hda_dai_hw_params(struct snd_pcm_substream *substream,
int ret;
int i;
+ if (!w) {
+ dev_err(cpu_dai->dev, "%s widget not found, check amp link num in the topology\n",
+ cpu_dai->name);
+ return -EINVAL;
+ }
+
ops = hda_dai_get_ops(substream, cpu_dai);
if (!ops) {
dev_err(cpu_dai->dev, "DAI widget ops not set\n");
@@ -582,6 +588,12 @@ int sdw_hda_dai_hw_params(struct snd_pcm_substream *substream,
*/
for_each_rtd_cpu_dais(rtd, i, dai) {
w = snd_soc_dai_get_widget(dai, substream->stream);
+ if (!w) {
+ dev_err(cpu_dai->dev,
+ "%s widget not found, check amp link num in the topology\n",
+ dai->name);
+ return -EINVAL;
+ }
ipc4_copier = widget_to_copier(w);
memcpy(&ipc4_copier->dma_config_tlv[cpu_dai_id], dma_config_tlv,
sizeof(*dma_config_tlv));
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index f991785f727e9..be689f6e10c81 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -63,6 +63,11 @@ static int sdw_params_stream(struct device *dev,
struct snd_soc_dapm_widget *w = snd_soc_dai_get_widget(d, params_data->substream->stream);
struct snd_sof_dai_config_data data = { 0 };
+ if (!w) {
+ dev_err(dev, "%s widget not found, check amp link num in the topology\n",
+ d->name);
+ return -EINVAL;
+ }
data.dai_index = (params_data->link_id << 8) | d->id;
data.dai_data = params_data->alh_stream_id;
data.dai_node_id = data.dai_data;
--
2.39.5
Fail I/Os instead of retrying them to prevent user space processes from
being blocked on I/O completion for several minutes.
Retrying I/Os during "depopulation in progress" or "depopulation restore
in progress" results in a continuous retry loop until the depopulation
completes, or until the retry loop is aborted by scsi_cmd_runtime_exceeced()
due to a timeout.
Depopulation is slow and can take 24+ hours to complete on 20+ TB HDDs.
Most I/Os in the depopulation retry loop end up taking several minutes
before returning the failure to user space.
Cc: <stable@vger.kernel.org> # 4.18.x: 2bbeb8d scsi: core: Handle depopulation and restoration in progress
Cc: <stable@vger.kernel.org> # 4.18.x
Fixes: e37c7d9a0341 ("scsi: core: sanitize++ in progress")
Signed-off-by: Igor Pylypiv <ipylypiv@google.com>
---
Changes in v2:
- Added Fixes: and Cc: stable tags.
drivers/scsi/scsi_lib.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e7ea1f04164a..3ab4c958da45 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -872,13 +872,18 @@ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result)
case 0x1a: /* start stop unit in progress */
case 0x1b: /* sanitize in progress */
case 0x1d: /* configuration in progress */
- case 0x24: /* depopulation in progress */
- case 0x25: /* depopulation restore in progress */
action = ACTION_DELAYED_RETRY;
break;
case 0x0a: /* ALUA state transition */
action = ACTION_DELAYED_REPREP;
break;
+ /*
+ * Depopulation might take many hours,
+ * thus it is not worthwhile to retry.
+ */
+ case 0x24: /* depopulation in progress */
+ case 0x25: /* depopulation restore in progress */
+ fallthrough;
default:
action = ACTION_FAIL;
break;
--
2.48.1.362.g079036d154-goog
From: Hugo Villeneuve <hvilleneuve@dimonoff.com>
The current reset pulse width is measured to be 5us on a
Renesas RZ/G2L SOM. The manufacturer's minimum reset pulse width is
specified as 10us.
Extend reset pulse width to make sure it is long enough on all platforms.
Also reword confusing comments about reset pin assertion.
Fixes: 5b0c03e24a06 ("Input: Add driver for Cypress Generation 5 touchscreen")
Cc: <stable@vger.kernel.org>
Signed-off-by: Hugo Villeneuve <hvilleneuve@dimonoff.com>
---
drivers/input/touchscreen/cyttsp5.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/input/touchscreen/cyttsp5.c b/drivers/input/touchscreen/cyttsp5.c
index eafe5a9b8964..bb09e84d0e92 100644
--- a/drivers/input/touchscreen/cyttsp5.c
+++ b/drivers/input/touchscreen/cyttsp5.c
@@ -870,13 +870,16 @@ static int cyttsp5_probe(struct device *dev, struct regmap *regmap, int irq,
ts->input->phys = ts->phys;
input_set_drvdata(ts->input, ts);
- /* Reset the gpio to be in a reset state */
+ /* Assert gpio to be in a reset state */
ts->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
if (IS_ERR(ts->reset_gpio)) {
error = PTR_ERR(ts->reset_gpio);
dev_err(dev, "Failed to request reset gpio, error %d\n", error);
return error;
}
+
+ fsleep(1000); /* Ensure long-enough reset pulse (minimum 10us). */
+
gpiod_set_value_cansleep(ts->reset_gpio, 0);
/* Need a delay to have device up */
base-commit: 0de63bb7d91975e73338300a57c54b93d3cc151c
--
2.39.5
Add a sanity check to madvise_dontneed_free() to address a corner case
in madvise where a race condition causes the current vma being processed
to be backed by a different page size.
During a madvise(MADV_DONTNEED) call on a memory region registered with
a userfaultfd, there's a period of time where the process mm lock is
temporarily released in order to send a UFFD_EVENT_REMOVE and let
userspace handle the event. During this time, the vma covering the
current address range may change due to an explicit mmap done
concurrently by another thread.
If, after that change, the memory region, which was originally backed by
4KB pages, is now backed by hugepages, the end address is rounded down
to a hugepage boundary to avoid data loss (see "Fixes" below). This
rounding may cause the end address to be truncated to the same address
as the start.
Make this corner case follow the same semantics as in other similar
cases where the requested region has zero length (i.e. return 0).
This will make madvise_walk_vmas() continue to the next vma in the
range (this time holding the process mm lock) which, due to the prev
pointer becoming stale because of the vma change, will be the same
hugepage-backed vma that was just checked before. The next time
madvise_dontneed_free() runs for this vma, if the start address isn't
aligned to a hugepage boundary, it'll return -EINVAL, which is also in
line with the madvise API.
From a userspace perspective, madvise() will return EINVAL because the
start address isn't aligned according to the new vma alignment
requirements (hugepage), even though it was correctly page-aligned when
the call was issued.
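As an illustration only, here is a minimal user-space sketch of the kind of
race described above (a hypothetical reproducer, not from the original
report; the userfaultfd registration that makes madvise() drop mmap_lock is
elided, and actually hitting the window depends on timing):
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>

#define LEN (2UL * 1024 * 1024)	/* assumes a 2MB hugepage size */

static void *addr;

static void *remap_huge(void *unused)
{
	/* Concurrently replace the 4kB-backed mapping with a
	 * hugepage-backed one while madvise() has dropped mmap_lock. */
	(void)mmap(addr, LEN, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED,
		   -1, 0);
	return NULL;
}

int main(void)
{
	pthread_t t;

	addr = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	/* Register [addr, addr + LEN) with userfaultfd here (elided);
	 * that is what makes madvise() drop mmap_lock to deliver
	 * UFFD_EVENT_REMOVE, opening the race window. */
	pthread_create(&t, NULL, remap_huge, NULL);
	if (madvise(addr, LEN, MADV_DONTNEED))
		perror("madvise");	/* EINVAL is an expected outcome */
	pthread_join(t, NULL);
	return 0;
}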
Fixes: 8ebe0a5eaaeb ("mm,madvise,hugetlb: fix unexpected data loss with MADV_DONTNEED on hugetlbfs")
Cc: stable@vger.kernel.org
Signed-off-by: Ricardo Cañuelo Navarro <rcn@igalia.com>
---
Changes in v2:
- Added documentation in the code to tell the user how this situation
can happen. (Andrew)
---
mm/madvise.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 49f3a75046f6..08b207f8e61e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -933,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
--
2.48.1
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 72dad8e377afa50435940adfb697e070d3556670
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2025020535-recognize-universe-67dc@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 72dad8e377afa50435940adfb697e070d3556670 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Thu, 12 Dec 2024 16:43:55 +1030
Subject: [PATCH] btrfs: fix double accounting race when
btrfs_run_delalloc_range() failed
[BUG]
When running btrfs with block size (4K) smaller than page size (64K,
aarch64), there is a very high chance to crash the kernel at
generic/750, with the following messages:
(before the call traces, there are 3 extra debug messages added)
BTRFS warning (device dm-3): read-write for sector size 4096 with page size 65536 is experimental
BTRFS info (device dm-3): checking UUID tree
hrtimer: interrupt took 5451385 ns
BTRFS error (device dm-3): cow_file_range failed, root=4957 inode=257 start=1605632 len=69632: -28
BTRFS error (device dm-3): run_delalloc_nocow failed, root=4957 inode=257 start=1605632 len=69632: -28
BTRFS error (device dm-3): failed to run delalloc range, root=4957 ino=257 folio=1572864 submit_bitmap=8-15 start=1605632 len=69632: -28
------------[ cut here ]------------
WARNING: CPU: 2 PID: 3020984 at ordered-data.c:360 can_finish_ordered_extent+0x370/0x3b8 [btrfs]
CPU: 2 UID: 0 PID: 3020984 Comm: kworker/u24:1 Tainted: G OE 6.13.0-rc1-custom+ #89
Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022
Workqueue: events_unbound btrfs_async_reclaim_data_space [btrfs]
pc : can_finish_ordered_extent+0x370/0x3b8 [btrfs]
lr : can_finish_ordered_extent+0x1ec/0x3b8 [btrfs]
Call trace:
can_finish_ordered_extent+0x370/0x3b8 [btrfs] (P)
can_finish_ordered_extent+0x1ec/0x3b8 [btrfs] (L)
btrfs_mark_ordered_io_finished+0x130/0x2b8 [btrfs]
extent_writepage+0x10c/0x3b8 [btrfs]
extent_write_cache_pages+0x21c/0x4e8 [btrfs]
btrfs_writepages+0x94/0x160 [btrfs]
do_writepages+0x74/0x190
filemap_fdatawrite_wbc+0x74/0xa0
start_delalloc_inodes+0x17c/0x3b0 [btrfs]
btrfs_start_delalloc_roots+0x17c/0x288 [btrfs]
shrink_delalloc+0x11c/0x280 [btrfs]
flush_space+0x288/0x328 [btrfs]
btrfs_async_reclaim_data_space+0x180/0x228 [btrfs]
process_one_work+0x228/0x680
worker_thread+0x1bc/0x360
kthread+0x100/0x118
ret_from_fork+0x10/0x20
---[ end trace 0000000000000000 ]---
BTRFS critical (device dm-3): bad ordered extent accounting, root=4957 ino=257 OE offset=1605632 OE len=16384 to_dec=16384 left=0
BTRFS critical (device dm-3): bad ordered extent accounting, root=4957 ino=257 OE offset=1622016 OE len=12288 to_dec=12288 left=0
Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008
BTRFS critical (device dm-3): bad ordered extent accounting, root=4957 ino=257 OE offset=1634304 OE len=8192 to_dec=4096 left=0
CPU: 1 UID: 0 PID: 3286940 Comm: kworker/u24:3 Tainted: G W OE 6.13.0-rc1-custom+ #89
Hardware name: QEMU KVM Virtual Machine, BIOS unknown 2/2/2022
Workqueue: btrfs_work_helper [btrfs] (btrfs-endio-write)
pstate: 404000c5 (nZcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : process_one_work+0x110/0x680
lr : worker_thread+0x1bc/0x360
Call trace:
process_one_work+0x110/0x680 (P)
worker_thread+0x1bc/0x360 (L)
worker_thread+0x1bc/0x360
kthread+0x100/0x118
ret_from_fork+0x10/0x20
Code: f84086a1 f9000fe1 53041c21 b9003361 (f9400661)
---[ end trace 0000000000000000 ]---
Kernel panic - not syncing: Oops: Fatal exception
SMP: stopping secondary CPUs
SMP: failed to stop secondary CPUs 2-3
Dumping ftrace buffer:
(ftrace buffer empty)
Kernel Offset: 0x275bb9540000 from 0xffff800080000000
PHYS_OFFSET: 0xffff8fbba0000000
CPU features: 0x100,00000070,00801250,8201720b
[CAUSE]
The above warning is triggered immediately after the delalloc range
failure; this happens in the following sequence:
- Range [1568K, 1636K) is dirty
1536K 1568K 1600K 1636K 1664K
| |/////////|////////| |
Where 1536K, 1600K and 1664K are page boundaries (64K page size)
- Enter extent_writepage() for page 1536K
- Enter run_delalloc_nocow() with locked page 1536K and range
[1568K, 1636K)
This is due to the inode having preallocated extents.
- Enter cow_file_range() with locked page 1536K and range
[1568K, 1636K)
- btrfs_reserve_extent() only reserved two extents
The main loop of cow_file_range() only reserved two data extents.
Now we have:
1536K 1568K 1600K 1636K 1664K
| |<-->|<--->|/|///////| |
1584K 1596K
Range [1568K, 1596K) has an ordered extent reserved.
- btrfs_reserve_extent() failed inside cow_file_range() for file offset
1596K
This is already a bug in our space reservation code, but for now let's
focus on the error handling path.
Now cow_file_range() returned -ENOSPC.
- btrfs_run_delalloc_range() does error cleanup <<< ROOT CAUSE
Call btrfs_cleanup_ordered_extents() with locked folio 1536K and range
[1568K, 1636K)
Function btrfs_cleanup_ordered_extents() normally needs to skip the
ranges inside the folio, as it will normally be cleaned up by
extent_writepage().
Such split error handling is already problematic in the first place.
What's worse is the folio range skipping itself, which does not take
subpage cases into consideration at all; it only skips the range
if the page start >= the range start.
In our case, the page start < the range start, since for subpage cases
we can have delalloc ranges inside the folio but not covering the
folio.
So it doesn't skip the page range at all.
This means all the ordered extents, both [1568K, 1584K) and
[1584K, 1596K) will be marked as IOERR.
These two ordered extents have no more pending IOs; they are marked
finished, and *QUEUED* to be deleted from the io tree.
- extent_writepage() does error cleanup
Call btrfs_mark_ordered_io_finished() for the range [1536K, 1600K).
Although ranges [1568K, 1584K) and [1584K, 1596K) are finished, the
deletion from io tree is async, it may or may not happen at this
time.
If the ranges have not yet been removed, we will do double cleaning on
those ranges, triggering the above ordered extent warnings.
In theory there are other bugs, e.g. the cleanup in extent_writepage()
can cause double accounting on ranges that are submitted asynchronously
(compression for example).
But that's much harder to trigger because normally we do not mix regular
and compression delalloc ranges.
[FIX]
The folio range split is already buggy and not subpage compatible; it
was introduced a long time ago, when subpage support was not even considered.
So instead of splitting the ordered extents cleanup into the folio range
and out of folio range, do all the cleanup inside writepage_delalloc().
- Pass @NULL as locked_folio for btrfs_cleanup_ordered_extents() in
btrfs_run_delalloc_range()
- Skip the btrfs_cleanup_ordered_extents() if writepage_delalloc()
failed
So all ordered extents are only cleaned up by
btrfs_run_delalloc_range().
- Handle the ranges that already have ordered extents allocated
If part of the folio already has an ordered extent allocated, and
btrfs_run_delalloc_range() failed, we also need to clean up that range.
Now we have a concentrated error handling for ordered extents during
btrfs_run_delalloc_range().
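To make the cleanup bounds concrete, here is a small stand-alone sketch of
the bitmap_size clamping introduced in writepage_delalloc(), using
hypothetical numbers from the 64K-page/4K-sector example above (assuming
btrfs_run_delalloc_range() fails on [1568K, 1636K), so
last_finished_delalloc_end stays at the failed range's start):
#include <stdio.h>

int main(void)
{
	/* Hypothetical values matching the layout above; none of these
	 * numbers come from the commit itself. */
	unsigned long page_start = 1536 * 1024;
	unsigned long last_finished_delalloc_end = 1568 * 1024;
	unsigned int sectorsize_bits = 12;	/* 4K sectors */
	unsigned long sectors_per_page = 16;	/* 64K / 4K */

	unsigned long bitmap_size =
		(last_finished_delalloc_end - page_start) >> sectorsize_bits;
	if (bitmap_size > sectors_per_page)
		bitmap_size = sectors_per_page;

	/* Only the first bitmap_size bits of submit_bitmap are walked to
	 * mark ordered extents finished; the failed range itself is now
	 * cleaned up inside btrfs_run_delalloc_range(). */
	printf("bitmap_size = %lu\n", bitmap_size);	/* prints 8 */
	return 0;
}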
Fixes: d1051d6ebf8e ("btrfs: Fix error handling in btrfs_cleanup_ordered_extents")
CC: stable@vger.kernel.org # 5.15+
Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c068a442753c..bc2bd103c8cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1134,14 +1134,19 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
}
/*
- * helper for extent_writepage(), doing all of the delayed allocation setup.
+ * Do all of the delayed allocation setup.
*
- * This returns 1 if btrfs_run_delalloc_range function did all the work required
- * to write the page (copy into inline extent). In this case the IO has
- * been started and the page is already unlocked.
+ * Return >0 if all the dirty blocks are submitted async (compression) or inlined.
+ * The @folio should no longer be touched (treat it as already unlocked).
*
- * This returns 0 if all went well (page still locked)
- * This returns < 0 if there were errors (page still locked)
+ * Return 0 if there is still dirty block that needs to be submitted through
+ * extent_writepage_io().
+ * bio_ctrl->submit_bitmap will indicate which blocks of the folio should be
+ * submitted, and @folio is still kept locked.
+ *
+ * Return <0 if there is any error hit.
+ * Any allocated ordered extent range covering this folio will be marked
+ * finished (IOERR), and @folio is still kept locked.
*/
static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
struct folio *folio,
@@ -1159,6 +1164,16 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
* last delalloc end.
*/
u64 last_delalloc_end = 0;
+ /*
+ * The range end (exclusive) of the last successfully finished delalloc
+ * range.
+ * Any range covered by ordered extent must either be manually marked
+ * finished (error handling), or has IO submitted (and finish the
+ * ordered extent normally).
+ *
+ * This records the end of ordered extent cleanup if we hit an error.
+ */
+ u64 last_finished_delalloc_end = page_start;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
@@ -1227,11 +1242,19 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start;
if (ret >= 0) {
+ /*
+ * Some delalloc range may be created by previous folios.
+ * Thus we still need to clean up this range during error
+ * handling.
+ */
+ last_finished_delalloc_end = found_start;
/* No errors hit so far, run the current delalloc range. */
ret = btrfs_run_delalloc_range(inode, folio,
found_start,
found_start + found_len - 1,
wbc);
+ if (ret >= 0)
+ last_finished_delalloc_end = found_start + found_len;
} else {
/*
* We've hit an error during previous delalloc range,
@@ -1266,8 +1289,22 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = found_start + found_len;
}
- if (ret < 0)
+ /*
+ * It's possible we had some ordered extents created before we hit
+ * an error, cleanup non-async successfully created delalloc ranges.
+ */
+ if (unlikely(ret < 0)) {
+ unsigned int bitmap_size = min(
+ (last_finished_delalloc_end - page_start) >>
+ fs_info->sectorsize_bits,
+ fs_info->sectors_per_page);
+
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
+ btrfs_mark_ordered_io_finished(inode, folio,
+ page_start + (bit << fs_info->sectorsize_bits),
+ fs_info->sectorsize, false);
return ret;
+ }
out:
if (last_delalloc_end)
delalloc_end = last_delalloc_end;
@@ -1501,13 +1538,13 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
bio_ctrl->wbc->nr_to_write--;
-done:
- if (ret) {
+ if (ret)
btrfs_mark_ordered_io_finished(inode, folio,
page_start, PAGE_SIZE, !ret);
- mapping_set_error(folio->mapping, ret);
- }
+done:
+ if (ret < 0)
+ mapping_set_error(folio->mapping, ret);
/*
* Only unlock ranges that are submitted. As there can be some async
* submitted ranges inside the folio.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1546f341f9a4..b81afe757f64 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2301,8 +2301,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
out:
if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, locked_folio, start,
- end - start + 1);
+ btrfs_cleanup_ordered_extents(inode, NULL, start, end - start + 1);
return ret;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 72dad8e377afa50435940adfb697e070d3556670
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2025020507-parole-precise-a4a5@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 72dad8e377afa50435940adfb697e070d3556670
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2025020557-faucet-engross-c54a@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 72dad8e377afa50435940adfb697e070d3556670
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable@vger.kernel.org>' --in-reply-to '2025020552-lark-replica-ce84@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
return ret;
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 72dad8e377afa50435940adfb697e070d3556670
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020549-blast-sports-fb0a@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 72dad8e377afa50435940adfb697e070d3556670
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020547-navigator-sesame-819a@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.13.y
git checkout FETCH_HEAD
git cherry-pick -x 72dad8e377afa50435940adfb697e070d3556670
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020544-aftermath-monkhood-1fa2@gregkh' --subject-prefix 'PATCH 6.13.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 8bf334beb3496da3c3fbf3daf3856f7eec70dacc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020538-unmoving-shelving-20de@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8bf334beb3496da3c3fbf3daf3856f7eec70dacc Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Thu, 12 Dec 2024 16:43:56 +1030
Subject: [PATCH] btrfs: fix double accounting race when extent_writepage_io()
failed
[BUG]
If submit_one_sector() fails inside extent_writepage_io() for sector
size < page size cases (e.g. 4K sector size and 64K page size), then
we can hit a double ordered extent accounting error.
This should be very rare, as submit_one_sector() only fails when we
fail to grab the extent map, and such an extent map should exist in
memory and has been pinned.
[CAUSE]
For example we have the following folio layout:
0 4K 32K 48K 60K 64K
|//| |//////| |///|
Where the hatched (///) regions are the dirty ranges we need to write
back. The three different dirty ranges are submitted for regular COW.
Now we hit the following sequence:
- submit_one_sector() returned 0 for [0, 4K)
- submit_one_sector() returned 0 for [32K, 48K)
- submit_one_sector() returned error for [60K, 64K)
- btrfs_mark_ordered_io_finished() called for the whole folio
This will mark the following ranges as finished:
* [0, 4K)
* [32K, 48K)
  Both ranges have their IO already submitted; this cleanup will
  lead to double accounting.
* [60K, 64K)
That's the correct cleanup.
The only good news is that this error is only theoretical, as the
target extent map is always pinned; thus we should grab it directly
from memory rather than reading it from disk.
[FIX]
Instead of calling btrfs_mark_ordered_io_finished() for the whole folio
range, which can touch ranges we should not touch, move the error
handling inside extent_writepage_io(), so that we can clean up exactly
the sectors that ought to be submitted but failed (see the walk-through
below).
This provides much more accurate cleanup, avoiding the double
accounting.
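Tracing the relocated error handling over the example layout (an
illustrative walk-through of the extent_writepage_io() hunk below, with
the per-sector steps paraphrased as comments):

	cur =  0K:       submit_one_sector() returns 0, sector queued in bio_ctrl
	cur = 32K..44K:  submit_one_sector() returns 0 for each 4K sector
	cur = 60K:       submit_one_sector() fails, so:
		submit_one_bio(bio_ctrl);
			/* flush the queued bio so [0, 4K) and
			   [32K, 48K) can finish normally */
		btrfs_mark_ordered_io_finished(inode, folio, cur,
					       fs_info->sectorsize, false);
			/* manually finish only [60K, 64K) */
		error = true;

The earlier sectors complete through normal bio completion, so nothing
is accounted twice.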
CC: stable(a)vger.kernel.org # 5.15+
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bc2bd103c8cc..5014134b9aa2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1420,6 +1420,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long range_bitmap = 0;
bool submitted_io = false;
+ bool error = false;
const u64 folio_start = folio_pos(folio);
u64 cur;
int bit;
@@ -1462,11 +1463,26 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
break;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
- if (ret < 0)
- goto out;
+ if (unlikely(ret < 0)) {
+ /*
+ * bio_ctrl may contain a bio crossing several folios.
+ * Submit it immediately so that the bio has a chance
+ * to finish normally, other than marked as error.
+ */
+ submit_one_bio(bio_ctrl);
+ /*
+ * Failed to grab the extent map which should be very rare.
+ * Since there is no bio submitted to finish the ordered
+ * extent, we have to manually finish this sector.
+ */
+ btrfs_mark_ordered_io_finished(inode, folio, cur,
+ fs_info->sectorsize, false);
+ error = true;
+ continue;
+ }
submitted_io = true;
}
-out:
+
/*
* If we didn't submitted any sector (>= i_size), folio dirty get
* cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared
@@ -1474,8 +1490,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
*
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
+ *
+ * If we hit any error, the corresponding sector will still be dirty
+ * thus no need to clear PAGECACHE_TAG_DIRTY.
*/
- if (!submitted_io) {
+ if (!submitted_io && !error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
@@ -1495,7 +1514,6 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
{
struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- const u64 page_start = folio_pos(folio);
int ret;
size_t pg_offset;
loff_t i_size = i_size_read(&inode->vfs_inode);
@@ -1538,10 +1556,6 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
bio_ctrl->wbc->nr_to_write--;
- if (ret)
- btrfs_mark_ordered_io_finished(inode, folio,
- page_start, PAGE_SIZE, !ret);
-
done:
if (ret < 0)
mapping_set_error(folio->mapping, ret);
@@ -2314,11 +2328,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
if (ret == 1)
goto next_page;
- if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
- cur, cur_len, !ret);
+ if (ret)
mapping_set_error(mapping, ret);
- }
btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 8bf334beb3496da3c3fbf3daf3856f7eec70dacc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020537-dominoes-catty-42c4@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 8bf334beb3496da3c3fbf3daf3856f7eec70dacc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020535-spender-graveness-c15e@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 8bf334beb3496da3c3fbf3daf3856f7eec70dacc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020534-imposing-hacksaw-eff6@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8bf334beb3496da3c3fbf3daf3856f7eec70dacc Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu(a)suse.com>
Date: Thu, 12 Dec 2024 16:43:56 +1030
Subject: [PATCH] btrfs: fix double accounting race when extent_writepage_io()
failed
[BUG]
If submit_one_sector() fails inside extent_writepage_io() for sector
size < page size cases (e.g. 4K sector size and 64K page size), then
we can hit a double ordered extent accounting error.
This should be very rare, as submit_one_sector() only fails when we
fail to grab the extent map, and such an extent map should exist in
memory and already be pinned.
[CAUSE]
For example we have the following folio layout:
0 4K 32K 48K 60K 64K
|//| |//////| |///|
Where |///| is the dirty range we need to writeback. The 3 different
dirty ranges are submitted for regular COW.
Now we hit the following sequence:
- submit_one_sector() returned 0 for [0, 4K)
- submit_one_sector() returned 0 for [32K, 48K)
- submit_one_sector() returned error for [60K, 64K)
- btrfs_mark_ordered_io_finished() called for the whole folio
This will mark the following ranges as finished:
* [0, 4K)
* [32K, 48K)
Both ranges already have their IO submitted; this cleanup will
lead to double accounting.
* [60K, 64K)
That's the correct cleanup.
The only good news is that this error is only theoretical: the target
extent map is always pinned, so we should grab it directly from memory
rather than reading it from disk.
[FIX]
Instead of calling btrfs_mark_ordered_io_finished() for the whole folio
range, which can touch ranges we should not touch, move the error
handling into extent_writepage_io(), so that we clean up exactly the
sectors that ought to have been submitted but failed.
This provides much more accurate cleanup and avoids the double accounting.
CC: stable(a)vger.kernel.org # 5.15+
Signed-off-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bc2bd103c8cc..5014134b9aa2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1420,6 +1420,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long range_bitmap = 0;
bool submitted_io = false;
+ bool error = false;
const u64 folio_start = folio_pos(folio);
u64 cur;
int bit;
@@ -1462,11 +1463,26 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
break;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
- if (ret < 0)
- goto out;
+ if (unlikely(ret < 0)) {
+ /*
+ * bio_ctrl may contain a bio crossing several folios.
+ * Submit it immediately so that the bio has a chance
+ * to finish normally, other than marked as error.
+ */
+ submit_one_bio(bio_ctrl);
+ /*
+ * Failed to grab the extent map which should be very rare.
+ * Since there is no bio submitted to finish the ordered
+ * extent, we have to manually finish this sector.
+ */
+ btrfs_mark_ordered_io_finished(inode, folio, cur,
+ fs_info->sectorsize, false);
+ error = true;
+ continue;
+ }
submitted_io = true;
}
-out:
+
/*
* If we didn't submitted any sector (>= i_size), folio dirty get
* cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared
@@ -1474,8 +1490,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
*
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
+ *
+ * If we hit any error, the corresponding sector will still be dirty
+ * thus no need to clear PAGECACHE_TAG_DIRTY.
*/
- if (!submitted_io) {
+ if (!submitted_io && !error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
@@ -1495,7 +1514,6 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
{
struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- const u64 page_start = folio_pos(folio);
int ret;
size_t pg_offset;
loff_t i_size = i_size_read(&inode->vfs_inode);
@@ -1538,10 +1556,6 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
bio_ctrl->wbc->nr_to_write--;
- if (ret)
- btrfs_mark_ordered_io_finished(inode, folio,
- page_start, PAGE_SIZE, !ret);
-
done:
if (ret < 0)
mapping_set_error(folio->mapping, ret);
@@ -2314,11 +2328,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
if (ret == 1)
goto next_page;
- if (ret) {
- btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
- cur, cur_len, !ret);
+ if (ret)
mapping_set_error(mapping, ret);
- }
btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 8bf334beb3496da3c3fbf3daf3856f7eec70dacc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020533-maturity-audacious-3a1f@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 8bf334beb3496da3c3fbf3daf3856f7eec70dacc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020532-component-gratified-8153@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.13-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.13.y
git checkout FETCH_HEAD
git cherry-pick -x 8bf334beb3496da3c3fbf3daf3856f7eec70dacc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020531-uneasily-twitch-9e6b@gregkh' --subject-prefix 'PATCH 6.13.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x ade81479c7dda1ce3eedb215c78bc615bbd04f06
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020518-carnage-deflator-99f2@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ade81479c7dda1ce3eedb215c78bc615bbd04f06 Mon Sep 17 00:00:00 2001
From: Chen Ridong <chenridong(a)huawei.com>
Date: Tue, 24 Dec 2024 02:52:38 +0000
Subject: [PATCH] memcg: fix soft lockup in the OOM process
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A soft lockup issue was found in production with about 56,000 tasks in
the OOM cgroup; the lockup triggered while traversing them.
watchdog: BUG: soft lockup - CPU#2 stuck for 23s! [VM Thread:1503066]
CPU: 2 PID: 1503066 Comm: VM Thread Kdump: loaded Tainted: G
Hardware name: Huawei Cloud OpenStack Nova, BIOS
RIP: 0010:console_unlock+0x343/0x540
RSP: 0000:ffffb751447db9a0 EFLAGS: 00000247 ORIG_RAX: ffffffffffffff13
RAX: 0000000000000001 RBX: 0000000000000000 RCX: 00000000ffffffff
RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000247
RBP: ffffffffafc71f90 R08: 0000000000000000 R09: 0000000000000040
R10: 0000000000000080 R11: 0000000000000000 R12: ffffffffafc74bd0
R13: ffffffffaf60a220 R14: 0000000000000247 R15: 0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f2fe6ad91f0 CR3: 00000004b2076003 CR4: 0000000000360ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
vprintk_emit+0x193/0x280
printk+0x52/0x6e
dump_task+0x114/0x130
mem_cgroup_scan_tasks+0x76/0x100
dump_header+0x1fe/0x210
oom_kill_process+0xd1/0x100
out_of_memory+0x125/0x570
mem_cgroup_out_of_memory+0xb5/0xd0
try_charge+0x720/0x770
mem_cgroup_try_charge+0x86/0x180
mem_cgroup_try_charge_delay+0x1c/0x40
do_anonymous_page+0xb5/0x390
handle_mm_fault+0xc4/0x1f0
This is because thousands of processes are in the OOM cgroup and it
takes a long time to traverse all of them, which leads to a soft lockup
in the OOM process.
To fix this issue, call cond_resched() in mem_cgroup_scan_tasks() once
every 1024 iterations ((++i & 1023) == 0). For global OOM, call
touch_softlockup_watchdog() at the same interval.
Link: https://lkml.kernel.org/r/20241224025238.3768787-1-chenridong@huaweicloud.c…
Fixes: 9cbb78bb3143 ("mm, memcg: introduce own oom handler to iterate only over its own threads")
Signed-off-by: Chen Ridong <chenridong(a)huawei.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: Roman Gushchin <roman.gushchin(a)linux.dev>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Shakeel Butt <shakeelb(a)google.com>
Cc: Muchun Song <songmuchun(a)bytedance.com>
Cc: Michal Koutný <mkoutny(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 65fb5eee1466..46f8b372d212 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1161,6 +1161,7 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
{
struct mem_cgroup *iter;
int ret = 0;
+ int i = 0;
BUG_ON(mem_cgroup_is_root(memcg));
@@ -1169,8 +1170,12 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct task_struct *task;
css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
- while (!ret && (task = css_task_iter_next(&it)))
+ while (!ret && (task = css_task_iter_next(&it))) {
+ /* Avoid potential softlockup warning */
+ if ((++i & 1023) == 0)
+ cond_resched();
ret = fn(task, arg);
+ }
css_task_iter_end(&it);
if (ret) {
mem_cgroup_iter_break(memcg, iter);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1c485beb0b93..044ebab2c941 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
#include <linux/init.h>
#include <linux/mmu_notifier.h>
#include <linux/cred.h>
+#include <linux/nmi.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -430,10 +431,15 @@ static void dump_tasks(struct oom_control *oc)
mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
else {
struct task_struct *p;
+ int i = 0;
rcu_read_lock();
- for_each_process(p)
+ for_each_process(p) {
+ /* Avoid potential softlockup warning */
+ if ((++i & 1023) == 0)
+ touch_softlockup_watchdog();
dump_task(p, oc);
+ }
rcu_read_unlock();
}
}
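The (++i & 1023) test is a cheap way to act once every 1024 iterations
without a modulo in the hot loop. A minimal userspace sketch (the
yield_point() stand-in for cond_resched()/touch_softlockup_watchdog() is
invented for illustration):

#include <stdio.h>

static void yield_point(unsigned long i)
{
	/* stands in for cond_resched() or touch_softlockup_watchdog() */
	printf("yield after %lu iterations\n", i);
}

int main(void)
{
	unsigned long i = 0, task;

	for (task = 0; task < 56000; task++) {
		/* true when the low 10 bits of ++i wrap to zero, i.e. every 1024th pass */
		if ((++i & 1023) == 0)
			yield_point(i);
		/* per-task work (dump_task() in the kernel) would go here */
	}
	return 0;
}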
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x ade81479c7dda1ce3eedb215c78bc615bbd04f06
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020517-confident-sterile-7acb@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x ade81479c7dda1ce3eedb215c78bc615bbd04f06
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020516-fable-crane-ab09@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x ade81479c7dda1ce3eedb215c78bc615bbd04f06
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020515-nebulizer-gentleman-d288@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x ade81479c7dda1ce3eedb215c78bc615bbd04f06
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020514-hacker-slouching-58c8@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
From: Roberto Sassu <roberto.sassu(a)huawei.com>
Commit 0d73a55208e9 ("ima: re-introduce own integrity cache lock")
mistakenly reverted the performance improvement introduced in commit
42a4c603198f0 ("ima: fix ima_inode_post_setattr"). The unused bit mask was
subsequently removed by commit 11c60f23ed13 ("integrity: Remove unused
macro IMA_ACTION_RULE_FLAGS").
Restore the performance improvement by introducing the new mask
IMA_NONACTION_RULE_FLAGS, equal to IMA_NONACTION_FLAGS without
IMA_NEW_FILE, which is not a rule-specific flag.
Finally, reset IMA_NONACTION_RULE_FLAGS instead of IMA_NONACTION_FLAGS in
process_measurement() if the IMA_CHANGE_ATTR atomic flag is set (i.e.,
after file metadata modification).
With this patch, new files whose metadata were modified while they are
still open can be reopened before the last file close (when security.ima
is written), since the IMA_NEW_FILE flag is no longer cleared. Without
this change, appraisal fails because security.ima is missing (files with
IMA_NEW_FILE set are an exception).
Cc: stable(a)vger.kernel.org # v4.16.x
Fixes: 0d73a55208e9 ("ima: re-introduce own integrity cache lock")
Signed-off-by: Roberto Sassu <roberto.sassu(a)huawei.com>
---
security/integrity/ima/ima.h | 3 +++
security/integrity/ima/ima_main.c | 7 +++++--
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 24d09ea91b87..a4f284bd846c 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -149,6 +149,9 @@ struct ima_kexec_hdr {
#define IMA_CHECK_BLACKLIST 0x40000000
#define IMA_VERITY_REQUIRED 0x80000000
+/* Exclude non-action flags which are not rule-specific. */
+#define IMA_NONACTION_RULE_FLAGS (IMA_NONACTION_FLAGS & ~IMA_NEW_FILE)
+
#define IMA_DO_MASK (IMA_MEASURE | IMA_APPRAISE | IMA_AUDIT | \
IMA_HASH | IMA_APPRAISE_SUBMASK)
#define IMA_DONE_MASK (IMA_MEASURED | IMA_APPRAISED | IMA_AUDITED | \
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 9b87556b03a7..b028c501949c 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -269,10 +269,13 @@ static int process_measurement(struct file *file, const struct cred *cred,
mutex_lock(&iint->mutex);
if (test_and_clear_bit(IMA_CHANGE_ATTR, &iint->atomic_flags))
- /* reset appraisal flags if ima_inode_post_setattr was called */
+ /*
+ * Reset appraisal flags (action and non-action rule-specific)
+ * if ima_inode_post_setattr was called.
+ */
iint->flags &= ~(IMA_APPRAISE | IMA_APPRAISED |
IMA_APPRAISE_SUBMASK | IMA_APPRAISED_SUBMASK |
- IMA_NONACTION_FLAGS);
+ IMA_NONACTION_RULE_FLAGS);
/*
* Re-evaulate the file if either the xattr has changed or the
--
2.34.1
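For illustration, a small userspace sketch of the mask arithmetic; the
flag values below are hypothetical placeholders (the real definitions
live in security/integrity/ima/ima.h), but the derivation of
IMA_NONACTION_RULE_FLAGS follows the patch:

#include <stdio.h>

/* hypothetical values, for illustration only */
#define IMA_NEW_FILE			0x00000010UL
#define IMA_DIGSIG			0x00000020UL
#define IMA_NONACTION_FLAGS		(IMA_NEW_FILE | IMA_DIGSIG)
/* exclude IMA_NEW_FILE, which is not rule-specific (as in the patch) */
#define IMA_NONACTION_RULE_FLAGS	(IMA_NONACTION_FLAGS & ~IMA_NEW_FILE)

int main(void)
{
	unsigned long flags = IMA_NEW_FILE | IMA_DIGSIG;

	/*
	 * Resetting with the new mask preserves IMA_NEW_FILE across a
	 * metadata change, so a still-open new file can be reopened.
	 */
	flags &= ~IMA_NONACTION_RULE_FLAGS;
	printf("IMA_NEW_FILE still set: %s\n",
	       (flags & IMA_NEW_FILE) ? "yes" : "no");
	return 0;
}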
Hello,
we are seeing broken CPU PSI metrics across our infrastructure running
6.12, with messages like "psi: inconsistent task state!
task=1831:hackbench cpu=8 psi_flags=14 clear=0 set=4" in dmesg. I
believe commit 7d9da040575b343085287686fa902a5b2d43c7ca might fix this
issue.
psi: Fix race when task wakes up before psi_sched_switch() adjusts flags
Thanks
Paul
Direct HLT instruction execution causes #VEs for TDX VMs, which are
routed to the hypervisor via a tdvmcall. This process renders HLT
execution non-atomic, so any preceding instruction like STI or MOV SS
will end up enabling interrupts before the HLT is routed to the
hypervisor. This creates scenarios where interrupts can land during HLT
emulation without aborting the halt operation, leading to indefinite
halt wait times.
Commit bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests") already
upgraded x86_idle() to invoke tdvmcall to avoid such scenarios, but
it didn't cover pv_native_safe_halt() which can be invoked using
raw_safe_halt() from call sites like acpi_safe_halt().
raw_safe_halt() also returns with interrupts enabled, so upgrade
tdx_safe_halt() to enable interrupts by default and ensure that paravirt
safe_halt() executions invoke tdx_safe_halt(). The earlier x86_idle()
path is now handled via tdx_idle(), which simply invokes the tdvmcall
while preserving IRQ state.
To avoid future call sites that cause HLT instruction emulation with
IRQs enabled, add a warning and fail the HLT instruction emulation.
Cc: stable(a)vger.kernel.org
Fixes: bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
Signed-off-by: Vishal Annapurve <vannapurve(a)google.com>
---
Changes since V1:
1) Addressed comments from Dave H
- Comment regarding adding a check for TDX VMs in halt path is not
resolved in v2, would like feedback around better place to do so,
maybe in pv_native_safe_halt (?).
2) Added a new version of tdx_safe_halt() that will enable interrupts.
3) Previous tdx_safe_halt() implementation is moved to newly introduced
tdx_idle().
V1: https://lore.kernel.org/lkml/Z5l6L3Hen9_Y3SGC@google.com/T/
arch/x86/coco/tdx/tdx.c | 23 ++++++++++++++++++++++-
arch/x86/include/asm/tdx.h | 2 +-
arch/x86/kernel/process.c | 2 +-
3 files changed, 24 insertions(+), 3 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 0d9b090b4880..cc2a637dca15 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -14,6 +14,7 @@
#include <asm/ia32.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <asm/paravirt_types.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
@@ -380,13 +381,18 @@ static int handle_halt(struct ve_info *ve)
{
const bool irq_disabled = irqs_disabled();
+ if (!irq_disabled) {
+ WARN_ONCE(1, "HLT instruction emulation unsafe with irqs enabled\n");
+ return -EIO;
+ }
+
if (__halt(irq_disabled))
return -EIO;
return ve_instr_len(ve);
}
-void __cpuidle tdx_safe_halt(void)
+void __cpuidle tdx_idle(void)
{
const bool irq_disabled = false;
@@ -397,6 +403,12 @@ void __cpuidle tdx_safe_halt(void)
WARN_ONCE(1, "HLT instruction emulation failed\n");
}
+static void __cpuidle tdx_safe_halt(void)
+{
+ tdx_idle();
+ raw_local_irq_enable();
+}
+
static int read_msr(struct pt_regs *regs, struct ve_info *ve)
{
struct tdx_module_args args = {
@@ -1083,6 +1095,15 @@ void __init tdx_early_init(void)
x86_platform.guest.enc_kexec_begin = tdx_kexec_begin;
x86_platform.guest.enc_kexec_finish = tdx_kexec_finish;
+#ifdef CONFIG_PARAVIRT_XXL
+ /*
+ * halt instruction execution is not atomic for TDX VMs as it generates
+ * #VEs, so otherwise "safe" halt invocations which cause interrupts to
+ * get enabled right after halt instruction don't work for TDX VMs.
+ */
+ pv_ops.irq.safe_halt = tdx_safe_halt;
+#endif
+
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
* bringup low level code. That raises #VE which cannot be handled
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index eba178996d84..dd386500ab1c 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -58,7 +58,7 @@ void tdx_get_ve_info(struct ve_info *ve);
bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
-void tdx_safe_halt(void);
+void tdx_idle(void);
bool tdx_early_handle_ve(struct pt_regs *regs);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f63f8fd00a91..4083838fe4a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -933,7 +933,7 @@ void __init select_idle_routine(void)
static_call_update(x86_idle, mwait_idle);
} else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
pr_info("using TDX aware idle routine\n");
- static_call_update(x86_idle, tdx_safe_halt);
+ static_call_update(x86_idle, tdx_idle);
} else {
static_call_update(x86_idle, default_idle);
}
--
2.48.1.262.g85cc9f2d1e-goog
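A minimal userspace model (the IRQ flag and the tdvmcall are simulated;
this is not the kernel implementation) of the split between tdx_idle(),
which preserves IRQ state, and tdx_safe_halt(), which returns with
interrupts enabled, plus the warn-and-fail check for HLT emulation with
IRQs on:

#include <stdbool.h>
#include <stdio.h>

static bool irqs_enabled;

/* models #VE-based HLT emulation: only safe with IRQs disabled */
static int emulated_hlt(void)
{
	if (irqs_enabled) {
		fprintf(stderr, "HLT emulation unsafe with irqs enabled\n");
		return -1;
	}
	/* __halt()/tdvmcall(EXIT_REASON_HLT) would happen here */
	return 0;
}

static void tdx_idle(void)
{
	(void)emulated_hlt();	/* IRQ state preserved */
}

static void tdx_safe_halt(void)
{
	tdx_idle();
	irqs_enabled = true;	/* raw_local_irq_enable() */
}

int main(void)
{
	irqs_enabled = false;
	tdx_safe_halt();	/* paravirt safe_halt(): returns with IRQs on */
	printf("after safe_halt, irqs_enabled=%d\n", irqs_enabled);
	return 0;
}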
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 8d28d0ddb986f56920ac97ae704cc3340a699a30
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020421-exonerate-desecrate-e5ed@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8d28d0ddb986f56920ac97ae704cc3340a699a30 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3(a)huawei.com>
Date: Fri, 24 Jan 2025 17:20:55 +0800
Subject: [PATCH] md/md-bitmap: Synchronize bitmap_get_stats() with bitmap
lifetime
After commit ec6bb299c7c3 ("md/md-bitmap: add 'sync_size' into struct
md_bitmap_stats"), following panic is reported:
Oops: general protection fault, probably for non-canonical address
RIP: 0010:bitmap_get_stats+0x2b/0xa0
Call Trace:
<TASK>
md_seq_show+0x2d2/0x5b0
seq_read_iter+0x2b9/0x470
seq_read+0x12f/0x180
proc_reg_read+0x57/0xb0
vfs_read+0xf6/0x380
ksys_read+0x6c/0xf0
do_syscall_64+0x82/0x170
entry_SYSCALL_64_after_hwframe+0x76/0x7e
The root cause is that bitmap_get_stats() can be called at any time while
the mddev still exists, even if the bitmap has been destroyed or is not
fully initialized. Dereferencing the bitmap in this case can crash the
kernel. Meanwhile, the above commit started dereferencing
bitmap->storage, making the problem easier to trigger.
Fix the problem by protecting bitmap_get_stats() with bitmap_info.mutex.
Cc: stable(a)vger.kernel.org # v6.12+
Fixes: 32a7627cf3a3 ("[PATCH] md: optimised resync using Bitmap based intent logging")
Reported-and-tested-by: Harshit Mogalapalli <harshit.m.mogalapalli(a)oracle.com>
Closes: https://lore.kernel.org/linux-raid/ca3a91a2-50ae-4f68-b317-abd9889f3907@ora…
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Link: https://lore.kernel.org/r/20250124092055.4050195-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song(a)kernel.org>
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index ec4ecd96e6b1..23c09d22fcdb 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2355,7 +2355,10 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
if (!bitmap)
return -ENOENT;
-
+ if (bitmap->mddev->bitmap_info.external)
+ return -ENOENT;
+ if (!bitmap->storage.sb_page) /* no superblock */
+ return -EINVAL;
sb = kmap_local_page(bitmap->storage.sb_page);
stats->sync_size = le64_to_cpu(sb->sync_size);
kunmap_local(sb);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 866015b681af..465ca2af1e6e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8376,6 +8376,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
spin_unlock(&all_mddevs_lock);
+
+ /* prevent bitmap to be freed after checking */
+ mutex_lock(&mddev->bitmap_info.mutex);
+
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : ", mdname(mddev));
@@ -8451,6 +8455,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ mutex_unlock(&mddev->bitmap_info.mutex);
spin_lock(&all_mddevs_lock);
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
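The core of the fix is a classic lifetime-pinning pattern: the stats reader
and the bitmap create/destroy paths serialize on the same mutex. A minimal
sketch of the idea, with simplified structure access rather than the actual
md code:

	/* Sketch only: the reader pins the bitmap via bitmap_info.mutex so
	 * it cannot be freed while the superblock page is mapped and read.
	 */
	static int stats_read(struct mddev *mddev, struct md_bitmap_stats *out)
	{
		int ret = -ENOENT;

		mutex_lock(&mddev->bitmap_info.mutex);
		if (mddev->bitmap && mddev->bitmap->storage.sb_page) {
			bitmap_super_t *sb;

			sb = kmap_local_page(mddev->bitmap->storage.sb_page);
			out->sync_size = le64_to_cpu(sb->sync_size);
			kunmap_local(sb);
			ret = 0;
		}
		mutex_unlock(&mddev->bitmap_info.mutex);
		return ret;
	}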
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 8d28d0ddb986f56920ac97ae704cc3340a699a30
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020420-simile-joyride-42b9@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8d28d0ddb986f56920ac97ae704cc3340a699a30 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3(a)huawei.com>
Date: Fri, 24 Jan 2025 17:20:55 +0800
Subject: [PATCH] md/md-bitmap: Synchronize bitmap_get_stats() with bitmap
lifetime
After commit ec6bb299c7c3 ("md/md-bitmap: add 'sync_size' into struct
md_bitmap_stats"), following panic is reported:
Oops: general protection fault, probably for non-canonical address
RIP: 0010:bitmap_get_stats+0x2b/0xa0
Call Trace:
<TASK>
md_seq_show+0x2d2/0x5b0
seq_read_iter+0x2b9/0x470
seq_read+0x12f/0x180
proc_reg_read+0x57/0xb0
vfs_read+0xf6/0x380
ksys_read+0x6c/0xf0
do_syscall_64+0x82/0x170
entry_SYSCALL_64_after_hwframe+0x76/0x7e
The root cause is that bitmap_get_stats() can be called at any time while
the mddev still exists, even if the bitmap has been destroyed or is not
fully initialized. Dereferencing the bitmap in this case can crash the
kernel. Meanwhile, the above commit started dereferencing bitmap->storage,
making the problem easier to trigger.
Fix the problem by protecting bitmap_get_stats() with bitmap_info.mutex.
Cc: stable(a)vger.kernel.org # v6.12+
Fixes: 32a7627cf3a3 ("[PATCH] md: optimised resync using Bitmap based intent logging")
Reported-and-tested-by: Harshit Mogalapalli <harshit.m.mogalapalli(a)oracle.com>
Closes: https://lore.kernel.org/linux-raid/ca3a91a2-50ae-4f68-b317-abd9889f3907@ora…
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Link: https://lore.kernel.org/r/20250124092055.4050195-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song(a)kernel.org>
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index ec4ecd96e6b1..23c09d22fcdb 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2355,7 +2355,10 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
if (!bitmap)
return -ENOENT;
-
+ if (bitmap->mddev->bitmap_info.external)
+ return -ENOENT;
+ if (!bitmap->storage.sb_page) /* no superblock */
+ return -EINVAL;
sb = kmap_local_page(bitmap->storage.sb_page);
stats->sync_size = le64_to_cpu(sb->sync_size);
kunmap_local(sb);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 866015b681af..465ca2af1e6e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8376,6 +8376,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
spin_unlock(&all_mddevs_lock);
+
+ /* prevent bitmap to be freed after checking */
+ mutex_lock(&mddev->bitmap_info.mutex);
+
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : ", mdname(mddev));
@@ -8451,6 +8455,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ mutex_unlock(&mddev->bitmap_info.mutex);
spin_lock(&all_mddevs_lock);
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 8d28d0ddb986f56920ac97ae704cc3340a699a30
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020420-dreamlike-desktop-698c@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8d28d0ddb986f56920ac97ae704cc3340a699a30 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3(a)huawei.com>
Date: Fri, 24 Jan 2025 17:20:55 +0800
Subject: [PATCH] md/md-bitmap: Synchronize bitmap_get_stats() with bitmap
lifetime
After commit ec6bb299c7c3 ("md/md-bitmap: add 'sync_size' into struct
md_bitmap_stats"), following panic is reported:
Oops: general protection fault, probably for non-canonical address
RIP: 0010:bitmap_get_stats+0x2b/0xa0
Call Trace:
<TASK>
md_seq_show+0x2d2/0x5b0
seq_read_iter+0x2b9/0x470
seq_read+0x12f/0x180
proc_reg_read+0x57/0xb0
vfs_read+0xf6/0x380
ksys_read+0x6c/0xf0
do_syscall_64+0x82/0x170
entry_SYSCALL_64_after_hwframe+0x76/0x7e
The root cause is that bitmap_get_stats() can be called at any time while
the mddev still exists, even if the bitmap has been destroyed or is not
fully initialized. Dereferencing the bitmap in this case can crash the
kernel. Meanwhile, the above commit started dereferencing bitmap->storage,
making the problem easier to trigger.
Fix the problem by protecting bitmap_get_stats() with bitmap_info.mutex.
Cc: stable(a)vger.kernel.org # v6.12+
Fixes: 32a7627cf3a3 ("[PATCH] md: optimised resync using Bitmap based intent logging")
Reported-and-tested-by: Harshit Mogalapalli <harshit.m.mogalapalli(a)oracle.com>
Closes: https://lore.kernel.org/linux-raid/ca3a91a2-50ae-4f68-b317-abd9889f3907@ora…
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Link: https://lore.kernel.org/r/20250124092055.4050195-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song(a)kernel.org>
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index ec4ecd96e6b1..23c09d22fcdb 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2355,7 +2355,10 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
if (!bitmap)
return -ENOENT;
-
+ if (bitmap->mddev->bitmap_info.external)
+ return -ENOENT;
+ if (!bitmap->storage.sb_page) /* no superblock */
+ return -EINVAL;
sb = kmap_local_page(bitmap->storage.sb_page);
stats->sync_size = le64_to_cpu(sb->sync_size);
kunmap_local(sb);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 866015b681af..465ca2af1e6e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8376,6 +8376,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
spin_unlock(&all_mddevs_lock);
+
+ /* prevent bitmap to be freed after checking */
+ mutex_lock(&mddev->bitmap_info.mutex);
+
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : ", mdname(mddev));
@@ -8451,6 +8455,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ mutex_unlock(&mddev->bitmap_info.mutex);
spin_lock(&all_mddevs_lock);
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 8d28d0ddb986f56920ac97ae704cc3340a699a30
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020419-snippet-epileptic-4280@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8d28d0ddb986f56920ac97ae704cc3340a699a30 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3(a)huawei.com>
Date: Fri, 24 Jan 2025 17:20:55 +0800
Subject: [PATCH] md/md-bitmap: Synchronize bitmap_get_stats() with bitmap
lifetime
After commit ec6bb299c7c3 ("md/md-bitmap: add 'sync_size' into struct
md_bitmap_stats"), following panic is reported:
Oops: general protection fault, probably for non-canonical address
RIP: 0010:bitmap_get_stats+0x2b/0xa0
Call Trace:
<TASK>
md_seq_show+0x2d2/0x5b0
seq_read_iter+0x2b9/0x470
seq_read+0x12f/0x180
proc_reg_read+0x57/0xb0
vfs_read+0xf6/0x380
ksys_read+0x6c/0xf0
do_syscall_64+0x82/0x170
entry_SYSCALL_64_after_hwframe+0x76/0x7e
The root cause is that bitmap_get_stats() can be called at any time while
the mddev still exists, even if the bitmap has been destroyed or is not
fully initialized. Dereferencing the bitmap in this case can crash the
kernel. Meanwhile, the above commit started dereferencing bitmap->storage,
making the problem easier to trigger.
Fix the problem by protecting bitmap_get_stats() with bitmap_info.mutex.
Cc: stable(a)vger.kernel.org # v6.12+
Fixes: 32a7627cf3a3 ("[PATCH] md: optimised resync using Bitmap based intent logging")
Reported-and-tested-by: Harshit Mogalapalli <harshit.m.mogalapalli(a)oracle.com>
Closes: https://lore.kernel.org/linux-raid/ca3a91a2-50ae-4f68-b317-abd9889f3907@ora…
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Link: https://lore.kernel.org/r/20250124092055.4050195-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song(a)kernel.org>
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index ec4ecd96e6b1..23c09d22fcdb 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2355,7 +2355,10 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
if (!bitmap)
return -ENOENT;
-
+ if (bitmap->mddev->bitmap_info.external)
+ return -ENOENT;
+ if (!bitmap->storage.sb_page) /* no superblock */
+ return -EINVAL;
sb = kmap_local_page(bitmap->storage.sb_page);
stats->sync_size = le64_to_cpu(sb->sync_size);
kunmap_local(sb);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 866015b681af..465ca2af1e6e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8376,6 +8376,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
spin_unlock(&all_mddevs_lock);
+
+ /* prevent bitmap to be freed after checking */
+ mutex_lock(&mddev->bitmap_info.mutex);
+
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : ", mdname(mddev));
@@ -8451,6 +8455,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ mutex_unlock(&mddev->bitmap_info.mutex);
spin_lock(&all_mddevs_lock);
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 8d28d0ddb986f56920ac97ae704cc3340a699a30
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020419-arrival-portion-4908@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 8d28d0ddb986f56920ac97ae704cc3340a699a30 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3(a)huawei.com>
Date: Fri, 24 Jan 2025 17:20:55 +0800
Subject: [PATCH] md/md-bitmap: Synchronize bitmap_get_stats() with bitmap
lifetime
After commit ec6bb299c7c3 ("md/md-bitmap: add 'sync_size' into struct
md_bitmap_stats"), following panic is reported:
Oops: general protection fault, probably for non-canonical address
RIP: 0010:bitmap_get_stats+0x2b/0xa0
Call Trace:
<TASK>
md_seq_show+0x2d2/0x5b0
seq_read_iter+0x2b9/0x470
seq_read+0x12f/0x180
proc_reg_read+0x57/0xb0
vfs_read+0xf6/0x380
ksys_read+0x6c/0xf0
do_syscall_64+0x82/0x170
entry_SYSCALL_64_after_hwframe+0x76/0x7e
The root cause is that bitmap_get_stats() can be called at any time while
the mddev still exists, even if the bitmap has been destroyed or is not
fully initialized. Dereferencing the bitmap in this case can crash the
kernel. Meanwhile, the above commit started dereferencing bitmap->storage,
making the problem easier to trigger.
Fix the problem by protecting bitmap_get_stats() with bitmap_info.mutex.
Cc: stable(a)vger.kernel.org # v6.12+
Fixes: 32a7627cf3a3 ("[PATCH] md: optimised resync using Bitmap based intent logging")
Reported-and-tested-by: Harshit Mogalapalli <harshit.m.mogalapalli(a)oracle.com>
Closes: https://lore.kernel.org/linux-raid/ca3a91a2-50ae-4f68-b317-abd9889f3907@ora…
Signed-off-by: Yu Kuai <yukuai3(a)huawei.com>
Link: https://lore.kernel.org/r/20250124092055.4050195-1-yukuai1@huaweicloud.com
Signed-off-by: Song Liu <song(a)kernel.org>
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index ec4ecd96e6b1..23c09d22fcdb 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2355,7 +2355,10 @@ static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
if (!bitmap)
return -ENOENT;
-
+ if (bitmap->mddev->bitmap_info.external)
+ return -ENOENT;
+ if (!bitmap->storage.sb_page) /* no superblock */
+ return -EINVAL;
sb = kmap_local_page(bitmap->storage.sb_page);
stats->sync_size = le64_to_cpu(sb->sync_size);
kunmap_local(sb);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 866015b681af..465ca2af1e6e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8376,6 +8376,10 @@ static int md_seq_show(struct seq_file *seq, void *v)
return 0;
spin_unlock(&all_mddevs_lock);
+
+ /* prevent bitmap to be freed after checking */
+ mutex_lock(&mddev->bitmap_info.mutex);
+
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : ", mdname(mddev));
@@ -8451,6 +8455,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ mutex_unlock(&mddev->bitmap_info.mutex);
spin_lock(&all_mddevs_lock);
if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs))
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 1378ffec30367233152b7dbf4fa6a25ee98585d1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020458-egotism-espresso-bb20@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1378ffec30367233152b7dbf4fa6a25ee98585d1 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter(a)linaro.org>
Date: Thu, 17 Oct 2024 23:34:16 +0300
Subject: [PATCH] media: imx-jpeg: Fix potential error pointer dereference in
detach_pm()
The problem is on the first line:
if (jpeg->pd_dev[i] && !pm_runtime_suspended(jpeg->pd_dev[i]))
If jpeg->pd_dev[i] is an error pointer, then passing it to
pm_runtime_suspended() will lead to an Oops. The other conditions
check for both error pointers and NULL, but it is clearer to use the
IS_ERR_OR_NULL() check for that.
Fixes: fd0af4cd35da ("media: imx-jpeg: Ensure power suppliers be suspended before detach them")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Dan Carpenter <dan.carpenter(a)linaro.org>
Reviewed-by: Ming Qian <ming.qian(a)nxp.com>
Signed-off-by: Hans Verkuil <hverkuil(a)xs4all.nl>
diff --git a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c
index 7f5fe551179b..1221b309a916 100644
--- a/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c
+++ b/drivers/media/platform/nxp/imx-jpeg/mxc-jpeg.c
@@ -2677,11 +2677,12 @@ static void mxc_jpeg_detach_pm_domains(struct mxc_jpeg_dev *jpeg)
int i;
for (i = 0; i < jpeg->num_domains; i++) {
- if (jpeg->pd_dev[i] && !pm_runtime_suspended(jpeg->pd_dev[i]))
+ if (!IS_ERR_OR_NULL(jpeg->pd_dev[i]) &&
+ !pm_runtime_suspended(jpeg->pd_dev[i]))
pm_runtime_force_suspend(jpeg->pd_dev[i]);
- if (jpeg->pd_link[i] && !IS_ERR(jpeg->pd_link[i]))
+ if (!IS_ERR_OR_NULL(jpeg->pd_link[i]))
device_link_del(jpeg->pd_link[i]);
- if (jpeg->pd_dev[i] && !IS_ERR(jpeg->pd_dev[i]))
+ if (!IS_ERR_OR_NULL(jpeg->pd_dev[i]))
dev_pm_domain_detach(jpeg->pd_dev[i], true);
jpeg->pd_dev[i] = NULL;
jpeg->pd_link[i] = NULL;
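The subtlety behind this fix is that error pointers are non-NULL, so a bare
NULL test happily passes an ERR_PTR() value on to pm_runtime_suspended().
A small illustration of the difference, as a standalone kernel-context
sketch rather than the driver code itself:

	#include <linux/err.h>
	#include <linux/device.h>

	static void demo(struct device *dev)	/* may be ERR_PTR(-ENODEV) */
	{
		if (dev)			/* true even for an error pointer! */
			;	/* unsafe: dev may be a poisoned ERR_PTR value */

		if (!IS_ERR_OR_NULL(dev))	/* rejects both NULL and ERR_PTR */
			;	/* safe: only a real pointer reaches here */
	}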
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x d3d930411ce390e532470194296658a960887773
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020412-afraid-unearth-34e3@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d3d930411ce390e532470194296658a960887773 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad(a)nvidia.com>
Date: Sun, 19 Jan 2025 10:21:41 +0200
Subject: [PATCH] RDMA/mlx5: Fix implicit ODP use after free
Prevent double queueing of implicit ODP mr destroy work by using
__xa_cmpxchg() to make sure this is the only time we are destroying this
specific mr.
Without this change, we could try to invalidate this mr twice, which in
turn could result in queuing the MR destroy work twice. Eventually the
second work could execute after the MR was freed by the first work,
causing a use after free, as shown in the trace below.
refcount_t: underflow; use-after-free.
WARNING: CPU: 2 PID: 12178 at lib/refcount.c:28 refcount_warn_saturate+0x12b/0x130
Modules linked in: bonding ib_ipoib vfio_pci ip_gre geneve nf_tables ip6_gre gre ip6_tunnel tunnel6 ipip tunnel4 ib_umad rdma_ucm mlx5_vfio_pci vfio_pci_core vfio_iommu_type1 mlx5_ib vfio ib_uverbs mlx5_core iptable_raw openvswitch nsh rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: ib_uverbs]
CPU: 2 PID: 12178 Comm: kworker/u20:5 Not tainted 6.5.0-rc1_net_next_mlx5_58c644e #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Workqueue: events_unbound free_implicit_child_mr_work [mlx5_ib]
RIP: 0010:refcount_warn_saturate+0x12b/0x130
Code: 48 c7 c7 38 95 2a 82 c6 05 bc c6 fe 00 01 e8 0c 66 aa ff 0f 0b 5b c3 48 c7 c7 e0 94 2a 82 c6 05 a7 c6 fe 00 01 e8 f5 65 aa ff <0f> 0b 5b c3 90 8b 07 3d 00 00 00 c0 74 12 83 f8 01 74 13 8d 50 ff
RSP: 0018:ffff8881008e3e40 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
RDX: ffff88852c91b5c8 RSI: 0000000000000001 RDI: ffff88852c91b5c0
RBP: ffff8881dacd4e00 R08: 00000000ffffffff R09: 0000000000000019
R10: 000000000000072e R11: 0000000063666572 R12: ffff88812bfd9e00
R13: ffff8881c792d200 R14: ffff88810011c005 R15: ffff8881002099c0
FS: 0000000000000000(0000) GS:ffff88852c900000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f5694b5e000 CR3: 00000001153f6003 CR4: 0000000000370ea0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? refcount_warn_saturate+0x12b/0x130
free_implicit_child_mr_work+0x180/0x1b0 [mlx5_ib]
process_one_work+0x1cc/0x3c0
worker_thread+0x218/0x3c0
kthread+0xc6/0xf0
ret_from_fork+0x1f/0x30
</TASK>
Fixes: 5256edcb98a1 ("RDMA/mlx5: Rework implicit ODP destroy")
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/r/c96b8645a81085abff739e6b06e286a350d1283d.1737274…
Signed-off-by: Patrisious Haddad <phaddad(a)nvidia.com>
Signed-off-by: Leon Romanovsky <leonro(a)nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index f655859eec00..f1e23583e6c0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -228,13 +228,27 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
struct mlx5_ib_mr *imr = mr->parent;
+ /*
+ * If userspace is racing freeing the parent implicit ODP MR then we can
+ * loose the race with parent destruction. In this case
+ * mlx5_ib_free_odp_mr() will free everything in the implicit_children
+ * xarray so NOP is fine. This child MR cannot be destroyed here because
+ * we are under its umem_mutex.
+ */
if (!refcount_inc_not_zero(&imr->mmkey.usecount))
return;
- xa_erase(&imr->implicit_children, idx);
+ xa_lock(&imr->implicit_children);
+ if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
+ mr) {
+ xa_unlock(&imr->implicit_children);
+ return;
+ }
+
if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
- xa_erase(&mr_to_mdev(mr)->odp_mkeys,
- mlx5_base_mkey(mr->mmkey.key));
+ __xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+ mlx5_base_mkey(mr->mmkey.key));
+ xa_unlock(&imr->implicit_children);
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
@@ -502,18 +516,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
refcount_inc(&ret->mmkey.usecount);
goto out_lock;
}
- xa_unlock(&imr->implicit_children);
if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
- ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
- &mr->mmkey, GFP_KERNEL);
+ ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+ &mr->mmkey, GFP_KERNEL);
if (xa_is_err(ret)) {
ret = ERR_PTR(xa_err(ret));
- xa_erase(&imr->implicit_children, idx);
- goto out_mr;
+ __xa_erase(&imr->implicit_children, idx);
+ goto out_lock;
}
mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
}
+ xa_unlock(&imr->implicit_children);
mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
return mr;
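The heart of the fix is a "single destroyer" pattern: only the caller whose
compare-and-exchange actually removes the entry may queue the teardown
work. A simplified sketch with hypothetical names (the real code also
erases the odp_mkeys entry under the same lock):

	static void remove_child(struct parent *parent, struct child *child,
				 unsigned long idx)
	{
		xa_lock(&parent->children);
		if (__xa_cmpxchg(&parent->children, idx, child, NULL,
				 GFP_KERNEL) != child) {
			xa_unlock(&parent->children);
			return;	/* lost the race; someone else removed it */
		}
		xa_unlock(&parent->children);

		/* Exactly one winner reaches this point, so the destroy
		 * work can never be queued twice for the same child.
		 */
		INIT_WORK(&child->destroy_work, destroy_child_work);
		queue_work(system_unbound_wq, &child->destroy_work);
	}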
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x d3d930411ce390e532470194296658a960887773
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020411-unwritten-anvil-1852@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d3d930411ce390e532470194296658a960887773 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad(a)nvidia.com>
Date: Sun, 19 Jan 2025 10:21:41 +0200
Subject: [PATCH] RDMA/mlx5: Fix implicit ODP use after free
Prevent double queueing of implicit ODP mr destroy work by using
__xa_cmpxchg() to make sure this is the only time we are destroying this
specific mr.
Without this change, we could try to invalidate this mr twice, which in
turn could result in queuing the MR destroy work twice. Eventually the
second work could execute after the MR was freed by the first work,
causing a use after free, as shown in the trace below.
refcount_t: underflow; use-after-free.
WARNING: CPU: 2 PID: 12178 at lib/refcount.c:28 refcount_warn_saturate+0x12b/0x130
Modules linked in: bonding ib_ipoib vfio_pci ip_gre geneve nf_tables ip6_gre gre ip6_tunnel tunnel6 ipip tunnel4 ib_umad rdma_ucm mlx5_vfio_pci vfio_pci_core vfio_iommu_type1 mlx5_ib vfio ib_uverbs mlx5_core iptable_raw openvswitch nsh rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: ib_uverbs]
CPU: 2 PID: 12178 Comm: kworker/u20:5 Not tainted 6.5.0-rc1_net_next_mlx5_58c644e #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Workqueue: events_unbound free_implicit_child_mr_work [mlx5_ib]
RIP: 0010:refcount_warn_saturate+0x12b/0x130
Code: 48 c7 c7 38 95 2a 82 c6 05 bc c6 fe 00 01 e8 0c 66 aa ff 0f 0b 5b c3 48 c7 c7 e0 94 2a 82 c6 05 a7 c6 fe 00 01 e8 f5 65 aa ff <0f> 0b 5b c3 90 8b 07 3d 00 00 00 c0 74 12 83 f8 01 74 13 8d 50 ff
RSP: 0018:ffff8881008e3e40 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
RDX: ffff88852c91b5c8 RSI: 0000000000000001 RDI: ffff88852c91b5c0
RBP: ffff8881dacd4e00 R08: 00000000ffffffff R09: 0000000000000019
R10: 000000000000072e R11: 0000000063666572 R12: ffff88812bfd9e00
R13: ffff8881c792d200 R14: ffff88810011c005 R15: ffff8881002099c0
FS: 0000000000000000(0000) GS:ffff88852c900000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f5694b5e000 CR3: 00000001153f6003 CR4: 0000000000370ea0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? refcount_warn_saturate+0x12b/0x130
free_implicit_child_mr_work+0x180/0x1b0 [mlx5_ib]
process_one_work+0x1cc/0x3c0
worker_thread+0x218/0x3c0
kthread+0xc6/0xf0
ret_from_fork+0x1f/0x30
</TASK>
Fixes: 5256edcb98a1 ("RDMA/mlx5: Rework implicit ODP destroy")
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/r/c96b8645a81085abff739e6b06e286a350d1283d.1737274…
Signed-off-by: Patrisious Haddad <phaddad(a)nvidia.com>
Signed-off-by: Leon Romanovsky <leonro(a)nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index f655859eec00..f1e23583e6c0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -228,13 +228,27 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
struct mlx5_ib_mr *imr = mr->parent;
+ /*
+ * If userspace is racing freeing the parent implicit ODP MR then we can
+ * loose the race with parent destruction. In this case
+ * mlx5_ib_free_odp_mr() will free everything in the implicit_children
+ * xarray so NOP is fine. This child MR cannot be destroyed here because
+ * we are under its umem_mutex.
+ */
if (!refcount_inc_not_zero(&imr->mmkey.usecount))
return;
- xa_erase(&imr->implicit_children, idx);
+ xa_lock(&imr->implicit_children);
+ if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
+ mr) {
+ xa_unlock(&imr->implicit_children);
+ return;
+ }
+
if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
- xa_erase(&mr_to_mdev(mr)->odp_mkeys,
- mlx5_base_mkey(mr->mmkey.key));
+ __xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+ mlx5_base_mkey(mr->mmkey.key));
+ xa_unlock(&imr->implicit_children);
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
@@ -502,18 +516,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
refcount_inc(&ret->mmkey.usecount);
goto out_lock;
}
- xa_unlock(&imr->implicit_children);
if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
- ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
- &mr->mmkey, GFP_KERNEL);
+ ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+ &mr->mmkey, GFP_KERNEL);
if (xa_is_err(ret)) {
ret = ERR_PTR(xa_err(ret));
- xa_erase(&imr->implicit_children, idx);
- goto out_mr;
+ __xa_erase(&imr->implicit_children, idx);
+ goto out_lock;
}
mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
}
+ xa_unlock(&imr->implicit_children);
mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
return mr;
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x d3d930411ce390e532470194296658a960887773
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020410-evacuate-dotted-2c17@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d3d930411ce390e532470194296658a960887773 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad(a)nvidia.com>
Date: Sun, 19 Jan 2025 10:21:41 +0200
Subject: [PATCH] RDMA/mlx5: Fix implicit ODP use after free
Prevent double queueing of implicit ODP mr destroy work by using
__xa_cmpxchg() to make sure this is the only time we are destroying this
specific mr.
Without this change, we could try to invalidate this mr twice, which in
turn could result in queuing the MR destroy work twice. Eventually the
second work could execute after the MR was freed by the first work,
causing a use after free, as shown in the trace below.
refcount_t: underflow; use-after-free.
WARNING: CPU: 2 PID: 12178 at lib/refcount.c:28 refcount_warn_saturate+0x12b/0x130
Modules linked in: bonding ib_ipoib vfio_pci ip_gre geneve nf_tables ip6_gre gre ip6_tunnel tunnel6 ipip tunnel4 ib_umad rdma_ucm mlx5_vfio_pci vfio_pci_core vfio_iommu_type1 mlx5_ib vfio ib_uverbs mlx5_core iptable_raw openvswitch nsh rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: ib_uverbs]
CPU: 2 PID: 12178 Comm: kworker/u20:5 Not tainted 6.5.0-rc1_net_next_mlx5_58c644e #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Workqueue: events_unbound free_implicit_child_mr_work [mlx5_ib]
RIP: 0010:refcount_warn_saturate+0x12b/0x130
Code: 48 c7 c7 38 95 2a 82 c6 05 bc c6 fe 00 01 e8 0c 66 aa ff 0f 0b 5b c3 48 c7 c7 e0 94 2a 82 c6 05 a7 c6 fe 00 01 e8 f5 65 aa ff <0f> 0b 5b c3 90 8b 07 3d 00 00 00 c0 74 12 83 f8 01 74 13 8d 50 ff
RSP: 0018:ffff8881008e3e40 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
RDX: ffff88852c91b5c8 RSI: 0000000000000001 RDI: ffff88852c91b5c0
RBP: ffff8881dacd4e00 R08: 00000000ffffffff R09: 0000000000000019
R10: 000000000000072e R11: 0000000063666572 R12: ffff88812bfd9e00
R13: ffff8881c792d200 R14: ffff88810011c005 R15: ffff8881002099c0
FS: 0000000000000000(0000) GS:ffff88852c900000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f5694b5e000 CR3: 00000001153f6003 CR4: 0000000000370ea0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? refcount_warn_saturate+0x12b/0x130
free_implicit_child_mr_work+0x180/0x1b0 [mlx5_ib]
process_one_work+0x1cc/0x3c0
worker_thread+0x218/0x3c0
kthread+0xc6/0xf0
ret_from_fork+0x1f/0x30
</TASK>
Fixes: 5256edcb98a1 ("RDMA/mlx5: Rework implicit ODP destroy")
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/r/c96b8645a81085abff739e6b06e286a350d1283d.1737274…
Signed-off-by: Patrisious Haddad <phaddad(a)nvidia.com>
Signed-off-by: Leon Romanovsky <leonro(a)nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index f655859eec00..f1e23583e6c0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -228,13 +228,27 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
struct mlx5_ib_mr *imr = mr->parent;
+ /*
+ * If userspace is racing freeing the parent implicit ODP MR then we can
+ * loose the race with parent destruction. In this case
+ * mlx5_ib_free_odp_mr() will free everything in the implicit_children
+ * xarray so NOP is fine. This child MR cannot be destroyed here because
+ * we are under its umem_mutex.
+ */
if (!refcount_inc_not_zero(&imr->mmkey.usecount))
return;
- xa_erase(&imr->implicit_children, idx);
+ xa_lock(&imr->implicit_children);
+ if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
+ mr) {
+ xa_unlock(&imr->implicit_children);
+ return;
+ }
+
if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
- xa_erase(&mr_to_mdev(mr)->odp_mkeys,
- mlx5_base_mkey(mr->mmkey.key));
+ __xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+ mlx5_base_mkey(mr->mmkey.key));
+ xa_unlock(&imr->implicit_children);
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
@@ -502,18 +516,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
refcount_inc(&ret->mmkey.usecount);
goto out_lock;
}
- xa_unlock(&imr->implicit_children);
if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
- ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
- &mr->mmkey, GFP_KERNEL);
+ ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+ &mr->mmkey, GFP_KERNEL);
if (xa_is_err(ret)) {
ret = ERR_PTR(xa_err(ret));
- xa_erase(&imr->implicit_children, idx);
- goto out_mr;
+ __xa_erase(&imr->implicit_children, idx);
+ goto out_lock;
}
mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
}
+ xa_unlock(&imr->implicit_children);
mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
return mr;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x d3d930411ce390e532470194296658a960887773
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020409-scorpion-stark-a992@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d3d930411ce390e532470194296658a960887773 Mon Sep 17 00:00:00 2001
From: Patrisious Haddad <phaddad(a)nvidia.com>
Date: Sun, 19 Jan 2025 10:21:41 +0200
Subject: [PATCH] RDMA/mlx5: Fix implicit ODP use after free
Prevent double queueing of implicit ODP mr destroy work by using
__xa_cmpxchg() to make sure this is the only time we are destroying this
specific mr.
Without this change, we could try to invalidate this mr twice, which in
turn could result in queuing the MR destroy work twice. Eventually the
second work could execute after the MR was freed by the first work,
causing a use after free, as shown in the trace below.
refcount_t: underflow; use-after-free.
WARNING: CPU: 2 PID: 12178 at lib/refcount.c:28 refcount_warn_saturate+0x12b/0x130
Modules linked in: bonding ib_ipoib vfio_pci ip_gre geneve nf_tables ip6_gre gre ip6_tunnel tunnel6 ipip tunnel4 ib_umad rdma_ucm mlx5_vfio_pci vfio_pci_core vfio_iommu_type1 mlx5_ib vfio ib_uverbs mlx5_core iptable_raw openvswitch nsh rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: ib_uverbs]
CPU: 2 PID: 12178 Comm: kworker/u20:5 Not tainted 6.5.0-rc1_net_next_mlx5_58c644e #1
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
Workqueue: events_unbound free_implicit_child_mr_work [mlx5_ib]
RIP: 0010:refcount_warn_saturate+0x12b/0x130
Code: 48 c7 c7 38 95 2a 82 c6 05 bc c6 fe 00 01 e8 0c 66 aa ff 0f 0b 5b c3 48 c7 c7 e0 94 2a 82 c6 05 a7 c6 fe 00 01 e8 f5 65 aa ff <0f> 0b 5b c3 90 8b 07 3d 00 00 00 c0 74 12 83 f8 01 74 13 8d 50 ff
RSP: 0018:ffff8881008e3e40 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
RDX: ffff88852c91b5c8 RSI: 0000000000000001 RDI: ffff88852c91b5c0
RBP: ffff8881dacd4e00 R08: 00000000ffffffff R09: 0000000000000019
R10: 000000000000072e R11: 0000000063666572 R12: ffff88812bfd9e00
R13: ffff8881c792d200 R14: ffff88810011c005 R15: ffff8881002099c0
FS: 0000000000000000(0000) GS:ffff88852c900000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f5694b5e000 CR3: 00000001153f6003 CR4: 0000000000370ea0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? refcount_warn_saturate+0x12b/0x130
free_implicit_child_mr_work+0x180/0x1b0 [mlx5_ib]
process_one_work+0x1cc/0x3c0
worker_thread+0x218/0x3c0
kthread+0xc6/0xf0
ret_from_fork+0x1f/0x30
</TASK>
Fixes: 5256edcb98a1 ("RDMA/mlx5: Rework implicit ODP destroy")
Cc: stable(a)vger.kernel.org
Link: https://patch.msgid.link/r/c96b8645a81085abff739e6b06e286a350d1283d.1737274…
Signed-off-by: Patrisious Haddad <phaddad(a)nvidia.com>
Signed-off-by: Leon Romanovsky <leonro(a)nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg(a)nvidia.com>
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index f655859eec00..f1e23583e6c0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -228,13 +228,27 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
struct mlx5_ib_mr *imr = mr->parent;
+ /*
+ * If userspace is racing freeing the parent implicit ODP MR then we can
+ * loose the race with parent destruction. In this case
+ * mlx5_ib_free_odp_mr() will free everything in the implicit_children
+ * xarray so NOP is fine. This child MR cannot be destroyed here because
+ * we are under its umem_mutex.
+ */
if (!refcount_inc_not_zero(&imr->mmkey.usecount))
return;
- xa_erase(&imr->implicit_children, idx);
+ xa_lock(&imr->implicit_children);
+ if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
+ mr) {
+ xa_unlock(&imr->implicit_children);
+ return;
+ }
+
if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
- xa_erase(&mr_to_mdev(mr)->odp_mkeys,
- mlx5_base_mkey(mr->mmkey.key));
+ __xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+ mlx5_base_mkey(mr->mmkey.key));
+ xa_unlock(&imr->implicit_children);
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
@@ -502,18 +516,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
refcount_inc(&ret->mmkey.usecount);
goto out_lock;
}
- xa_unlock(&imr->implicit_children);
if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
- ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
- &mr->mmkey, GFP_KERNEL);
+ ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+ &mr->mmkey, GFP_KERNEL);
if (xa_is_err(ret)) {
ret = ERR_PTR(xa_err(ret));
- xa_erase(&imr->implicit_children, idx);
- goto out_mr;
+ __xa_erase(&imr->implicit_children, idx);
+ goto out_lock;
}
mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
}
+ xa_unlock(&imr->implicit_children);
mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
return mr;
Hi Sasha,
Am Samstag, dem 01.02.2025 um 23:33 -0500 schrieb Sasha Levin:
> This is a note to let you know that I've just added the patch titled
>
> drm/etnaviv: Drop the offset in page manipulation
>
> to the 6.12-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> drm-etnaviv-drop-the-offset-in-page-manipulation.patch
> and it can be found in the queue-6.12 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
please drop this patch and all its dependencies from all stable queues.
While the code makes certain assumptions that are corrected in this
patch, those assumptions are always true in all use-cases today. I
don't see a reason to introduce this kind of churn to the stable trees
to fix a theoretical issue.
Regards,
Lucas
>
>
> commit cc5b6c4868e20f34d46e359930f0ca45a1cab9e3
> Author: Sui Jingfeng <sui.jingfeng(a)linux.dev>
> Date: Fri Nov 15 20:32:44 2024 +0800
>
> drm/etnaviv: Drop the offset in page manipulation
>
> [ Upstream commit 9aad03e7f5db7944d5ee96cd5c595c54be2236e6 ]
>
> The etnaviv driver, both kernel space and user space, assumes that GPU page
> size is 4KiB. Its IOMMU maps/unmaps a 4KiB physical address range at a time.
> If 'sg->offset != 0' is true, then the current implementation will map the
> IOVA to a wrong area, which may lead to coherency problems. Picture 0 and 1
> give the illustration, see below.
>
> PA start drifted
> |
> |<--- 'sg_dma_address(sg) - sg->offset'
> | .------ sg_dma_address(sg)
> | | .---- sg_dma_len(sg)
> |<-sg->offset->| |
> V |<-->| Another one cpu page
> +----+----+----+----+ +----+----+----+----+
> |xxxx| |||||| |||||||||||||||||||||
> +----+----+----+----+ +----+----+----+----+
> ^ ^ ^ ^
> |<--- da_len --->| | |
> | | | |
> | .--------------' | |
> | | .----------------' |
> | | | .----------------'
> | | | |
> | | +----+----+----+----+
> | | |||||||||||||||||||||
> | | +----+----+----+----+
> | |
> | '--------------. da_len = sg_dma_len(sg) + sg->offset, using
> | | 'sg_dma_len(sg) + sg->offset' will lead to GPUVA
> +----+ ~~~~~~~~~~~~~+ collision, but min_t(unsigned int, da_len, va_len)
> |xxxx| | will clamp it to correct size. But the IOVA will
> +----+ ~~~~~~~~~~~~~+ be redirect to wrong area.
> ^
> | Picture 0: Possibly wrong implementation.
> GPUVA (IOVA)
>
> --------------------------------------------------------------------------
>
> .------- sg_dma_address(sg)
> | .---- sg_dma_len(sg)
> |<-sg->offset->| |
> | |<-->| another one cpu page
> +----+----+----+----+ +----+----+----+----+
> | |||||| |||||||||||||||||||||
> +----+----+----+----+ +----+----+----+----+
> ^ ^ ^ ^
> | | | |
> .--------------' | | |
> | | | |
> | .--------------' | |
> | | .----------------' |
> | | | .----------------'
> | | | |
> +----+ +----+----+----+----+
> |||||| ||||||||||||||||||||| The first one is SZ_4K, the second is SZ_16K
> +----+ +----+----+----+----+
> ^
> | Picture 1: Perfectly correct implementation.
> GPUVA (IOVA)
>
> If sg->offset != 0 is true, the IOVA will be mapped to the wrong physical
> address, one that either doesn't contain the data or contains wrong data.
> Strictly speaking, the memory area before sg_dma_address(sg) doesn't
> belong to us, and it's likely that the area is being used by another process.
>
> Because we don't want to introduce confusion about which part is visible
> to the GPU, we assume that the size of the GPUVA is always 4KiB aligned.
> This is a very relaxed requirement, since we already made the decision
> that the GPU page size is 4KiB (as a canonical decision). And with the
> softpin feature landed, Mesa's util_vma_heap_alloc() will certainly report
> the correct length of the GPUVA to the kernel with the desired alignment
> ensured.
>
> With the above statements agreed, dropping the "offset in page"
> manipulation gives us a correct implementation in any case.
>
> Fixes: a8c21a5451d8 ("drm/etnaviv: add initial etnaviv DRM driver")
> Signed-off-by: Sui Jingfeng <sui.jingfeng(a)linux.dev>
> Signed-off-by: Lucas Stach <l.stach(a)pengutronix.de>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c
> index a382920ae2be0..b7c09fc86a2cc 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_mmu.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_mmu.c
> @@ -82,8 +82,8 @@ static int etnaviv_iommu_map(struct etnaviv_iommu_context *context,
> return -EINVAL;
>
> for_each_sgtable_dma_sg(sgt, sg, i) {
> - phys_addr_t pa = sg_dma_address(sg) - sg->offset;
> - unsigned int da_len = sg_dma_len(sg) + sg->offset;
> + phys_addr_t pa = sg_dma_address(sg);
> + unsigned int da_len = sg_dma_len(sg);
> unsigned int bytes = min_t(unsigned int, da_len, va_len);
>
> VERB("map[%d]: %08x %pap(%x)", i, iova, &pa, bytes);
From: Kevin Brodsky <kevin.brodsky(a)arm.com>
[ Upstream commit 46036188ea1f5266df23a6149dea0df1c77cd1c7 ]
The mm kselftests are currently built with no optimisation (-O0). It's
unclear why, and besides being obviously suboptimal, this also prevents
the pkeys tests from working as intended. Let's build all the tests with
-O2.
[kevin.brodsky(a)arm.com: silence unused-result warnings]
Link: https://lkml.kernel.org/r/20250107170110.2819685-1-kevin.brodsky@arm.com
Link: https://lkml.kernel.org/r/20241209095019.1732120-6-kevin.brodsky@arm.com
Signed-off-by: Kevin Brodsky <kevin.brodsky(a)arm.com>
Cc: Aruna Ramakrishna <aruna.ramakrishna(a)oracle.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Joey Gouly <joey.gouly(a)arm.com>
Cc: Keith Lucas <keith.lucas(a)oracle.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
(cherry picked from commit 46036188ea1f5266df23a6149dea0df1c77cd1c7)
[Yifei: This commit also fixes the failure of pkey_sighandler_tests_64,
which is also in linux-6.12.y, hence this backport]
Signed-off-by: Yifei Liu <yifei.l.liu(a)oracle.com>
---
tools/testing/selftests/mm/Makefile | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 02e1204971b0..c0138cb19705 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -33,9 +33,16 @@ endif
# LDLIBS.
MAKEFLAGS += --no-builtin-rules
-CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
+CFLAGS = -Wall -O2 -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES)
LDLIBS = -lrt -lpthread -lm
+# Some distributions (such as Ubuntu) configure GCC so that _FORTIFY_SOURCE is
+# automatically enabled at -O1 or above. This triggers various unused-result
+# warnings where functions such as read() or write() are called and their
+# return value is not checked. Disable _FORTIFY_SOURCE to silence those
+# warnings.
+CFLAGS += -U_FORTIFY_SOURCE
+
TEST_GEN_FILES = cow
TEST_GEN_FILES += compaction_test
TEST_GEN_FILES += gup_longterm
--
2.46.0
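The -U_FORTIFY_SOURCE addition deserves a quick illustration. Here is a
hypothetical example of the warning class it silences, assuming a distro
toolchain (such as Ubuntu's) that enables fortification by default at -O1
and above:

	#include <unistd.h>

	int main(void)
	{
		char buf[16];

		/* With _FORTIFY_SOURCE active, read() is declared
		 * warn_unused_result, so ignoring its return value
		 * trips -Wunused-result once the tests build at -O2.
		 */
		read(0, buf, sizeof(buf));
		return 0;
	}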
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x adb4998f4928a17d91be054218a902ba9f8c1f93
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025012032-phoenix-crushing-da7a@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From adb4998f4928a17d91be054218a902ba9f8c1f93 Mon Sep 17 00:00:00 2001
From: Wayne Lin <Wayne.Lin(a)amd.com>
Date: Mon, 9 Dec 2024 15:25:35 +0800
Subject: [PATCH] drm/amd/display: Reduce accessing remote DPCD overhead
[Why]
The observed frame rate drops with tools like glxgears. Even though the
output to the monitor is 60Hz, the rendered frame rate drops to 30Hz or
lower. This is because some code paths trigger
dm_dp_mst_is_port_support_mode() to read out the remote link status and
assess the available bandwidth for DSC manipulation. The overhead of
repeatedly reading the remote DPCD is considerable.
[How]
Store the remote link BW in mst_local_bw and use the end-to-end full_pbn
as an indicator to decide whether to update the remote link BW or not.
Whenever we need the info to assess the BW, consult the stored value first.
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3720
Fixes: fa57924c76d9 ("drm/amd/display: Refactor function dm_dp_mst_is_port_support_mode()")
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Reviewed-by: Jerry Zuo <jerry.zuo(a)amd.com>
Signed-off-by: Wayne Lin <Wayne.Lin(a)amd.com>
Signed-off-by: Tom Chung <chiahsuan.chung(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 4a9a918545455a5979c6232fcf61ed3d8f0db3ae)
Cc: stable(a)vger.kernel.org
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 6464a8378387..2227cd8e4a89 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -697,6 +697,8 @@ struct amdgpu_dm_connector {
struct drm_dp_mst_port *mst_output_port;
struct amdgpu_dm_connector *mst_root;
struct drm_dp_aux *dsc_aux;
+ uint32_t mst_local_bw;
+ uint16_t vc_full_pbn;
struct mutex handle_mst_msg_ready;
/* TODO see if we can merge with ddc_bus or make a dm_connector */
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index aadaa61ac5ac..1080075ccb17 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -155,6 +155,17 @@ amdgpu_dm_mst_connector_late_register(struct drm_connector *connector)
return 0;
}
+
+static inline void
+amdgpu_dm_mst_reset_mst_connector_setting(struct amdgpu_dm_connector *aconnector)
+{
+ aconnector->drm_edid = NULL;
+ aconnector->dsc_aux = NULL;
+ aconnector->mst_output_port->passthrough_aux = NULL;
+ aconnector->mst_local_bw = 0;
+ aconnector->vc_full_pbn = 0;
+}
+
static void
amdgpu_dm_mst_connector_early_unregister(struct drm_connector *connector)
{
@@ -182,9 +193,7 @@ amdgpu_dm_mst_connector_early_unregister(struct drm_connector *connector)
dc_sink_release(dc_sink);
aconnector->dc_sink = NULL;
- aconnector->drm_edid = NULL;
- aconnector->dsc_aux = NULL;
- port->passthrough_aux = NULL;
+ amdgpu_dm_mst_reset_mst_connector_setting(aconnector);
}
aconnector->mst_status = MST_STATUS_DEFAULT;
@@ -504,9 +513,7 @@ dm_dp_mst_detect(struct drm_connector *connector,
dc_sink_release(aconnector->dc_sink);
aconnector->dc_sink = NULL;
- aconnector->drm_edid = NULL;
- aconnector->dsc_aux = NULL;
- port->passthrough_aux = NULL;
+ amdgpu_dm_mst_reset_mst_connector_setting(aconnector);
amdgpu_dm_set_mst_status(&aconnector->mst_status,
MST_REMOTE_EDID | MST_ALLOCATE_NEW_PAYLOAD | MST_CLEAR_ALLOCATED_PAYLOAD,
@@ -1819,9 +1826,18 @@ enum dc_status dm_dp_mst_is_port_support_mode(
struct drm_dp_mst_port *immediate_upstream_port = NULL;
uint32_t end_link_bw = 0;
- /*Get last DP link BW capability*/
- if (dp_get_link_current_set_bw(&aconnector->mst_output_port->aux, &end_link_bw)) {
- if (stream_kbps > end_link_bw) {
+ /*Get last DP link BW capability. Mode shall be supported by Legacy peer*/
+ if (aconnector->mst_output_port->pdt != DP_PEER_DEVICE_DP_LEGACY_CONV &&
+ aconnector->mst_output_port->pdt != DP_PEER_DEVICE_NONE) {
+ if (aconnector->vc_full_pbn != aconnector->mst_output_port->full_pbn) {
+ dp_get_link_current_set_bw(&aconnector->mst_output_port->aux, &end_link_bw);
+ aconnector->vc_full_pbn = aconnector->mst_output_port->full_pbn;
+ aconnector->mst_local_bw = end_link_bw;
+ } else {
+ end_link_bw = aconnector->mst_local_bw;
+ }
+
+ if (end_link_bw > 0 && stream_kbps > end_link_bw) {
DRM_DEBUG_DRIVER("MST_DSC dsc decode at last link."
"Mode required bw can't fit into last link\n");
return DC_FAIL_BANDWIDTH_VALIDATE;
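The [How] section describes a simple cache-with-change-indicator scheme. A
sketch of the idea with hypothetical names (the actual patch stores the
cached values in amdgpu_dm_connector):

	static u32 get_end_link_bw(struct conn_cache *c, struct mst_port *port)
	{
		/* full_pbn doubles as a change indicator: only hit the
		 * remote DPCD (slow AUX traffic) when it differs from
		 * what we cached last time.
		 */
		if (c->cached_full_pbn != port->full_pbn) {
			c->cached_bw = read_remote_link_bw(port);
			c->cached_full_pbn = port->full_pbn;
		}
		return c->cached_bw;	/* fast path: no DPCD access */
	}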
When updating the interrupt state for an emulated timer, we return
early and skip the setup of a soft timer that runs in parallel
with the guest.
While this is OK if we have set the interrupt pending, it is pretty
wrong if the guest moved CVAL into the future. In that case,
no timer is armed and the guest can wait for a very long time
(it will take a full put/load cycle for the situation to resolve).
This is especially visible with EDK2 running at EL2, but still
using the EL1 virtual timer, which in that case is fully emulated.
Any key-press takes ages to be captured, as there is no UART
interrupt and EDK2 relies on polling from a timer...
The fix is simply to drop the early return. If the timer interrupt
is pending, we will still return early, and otherwise arm the soft
timer.
Fixes: 4d74ecfa6458b ("KVM: arm64: Don't arm a hrtimer for an already pending timer")
Signed-off-by: Marc Zyngier <maz(a)kernel.org>
Cc: stable(a)vger.kernel.org
---
arch/arm64/kvm/arch_timer.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index d3d243366536c..035e43f5d4f9a 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -471,10 +471,8 @@ static void timer_emulate(struct arch_timer_context *ctx)
trace_kvm_timer_emulate(ctx, should_fire);
- if (should_fire != ctx->irq.level) {
+ if (should_fire != ctx->irq.level)
kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);
- return;
- }
kvm_timer_update_status(ctx, should_fire);
--
2.39.2
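To make the control flow concrete, here is a simplified sketch of
timer_emulate() after the fix (not the full function; helper names as in
arch_timer.c):

	bool should_fire = kvm_timer_should_fire(ctx);

	/* Update the interrupt line, but do not return early: */
	if (should_fire != ctx->irq.level)
		kvm_timer_update_irq(ctx->vcpu, should_fire, ctx);

	if (should_fire)
		return;		/* already pending, nothing to arm */

	/* Guest moved CVAL into the future: arm a soft timer so the
	 * expiry is delivered without waiting for a put/load cycle.
	 */
	soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));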
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 1e0a19912adb68a4b2b74fd77001c96cd83eb073
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020458-rental-overbill-09f4@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1e0a19912adb68a4b2b74fd77001c96cd83eb073 Mon Sep 17 00:00:00 2001
From: Michal Pecio <michal.pecio(a)gmail.com>
Date: Fri, 27 Dec 2024 14:01:40 +0200
Subject: [PATCH] usb: xhci: Fix NULL pointer dereference on certain command
aborts
If a command is queued to the final usable TRB of a ring segment, the
enqueue pointer is advanced to the subsequent link TRB and no further.
If the command is later aborted, when the abort completion is handled
the dequeue pointer is advanced to the first TRB of the next segment.
If no further commands are queued, xhci_handle_stopped_cmd_ring() sees
the ring pointers unequal and assumes that there is a pending command,
so it calls xhci_mod_cmd_timer() which crashes if cur_cmd was NULL.
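An illustrative command ring state (schematic only, not taken from a
report):

  segment 0: [ ... | aborted cmd | link TRB ]  <- enqueue parked on the link TRB
  segment 1: [ first TRB | ...              ]  <- dequeue advanced here on abort

With nothing else queued, dequeue != enqueue even though no command is
pending, which is the state that trips the NULL cur_cmd crash.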
Don't attempt timer setup if cur_cmd is NULL. The subsequent doorbell
ring likely is unnecessary too, but it's harmless. Leave it alone.
This is probably Bug 219532, but no confirmation has been received.
The issue has been independently reproduced and confirmed fixed using
a USB MCU programmed to NAK the Status stage of SET_ADDRESS forever.
Everything continued working normally after several prevented crashes.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219532
Fixes: c311e391a7ef ("xhci: rework command timeout and cancellation,")
CC: stable(a)vger.kernel.org
Signed-off-by: Michal Pecio <michal.pecio(a)gmail.com>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Link: https://lore.kernel.org/r/20241227120142.1035206-4-mathias.nyman@linux.inte…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 09b05a62375e..dfe1a676d487 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -422,7 +422,8 @@ static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci,
if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) &&
!(xhci->xhc_state & XHCI_STATE_DYING)) {
xhci->current_cmd = cur_cmd;
- xhci_mod_cmd_timer(xhci);
+ if (cur_cmd)
+ xhci_mod_cmd_timer(xhci);
xhci_ring_cmd_db(xhci);
}
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 1e0a19912adb68a4b2b74fd77001c96cd83eb073
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020457-imprecise-rectify-1264@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1e0a19912adb68a4b2b74fd77001c96cd83eb073 Mon Sep 17 00:00:00 2001
From: Michal Pecio <michal.pecio(a)gmail.com>
Date: Fri, 27 Dec 2024 14:01:40 +0200
Subject: [PATCH] usb: xhci: Fix NULL pointer dereference on certain command
aborts
If a command is queued to the final usable TRB of a ring segment, the
enqueue pointer is advanced to the subsequent link TRB and no further.
If the command is later aborted, when the abort completion is handled
the dequeue pointer is advanced to the first TRB of the next segment.
If no further commands are queued, xhci_handle_stopped_cmd_ring() sees
the ring pointers unequal and assumes that there is a pending command,
so it calls xhci_mod_cmd_timer() which crashes if cur_cmd was NULL.
Don't attempt timer setup if cur_cmd is NULL. The subsequent doorbell
ring likely is unnecessary too, but it's harmless. Leave it alone.
This is probably Bug 219532, but no confirmation has been received.
The issue has been independently reproduced and confirmed fixed using
a USB MCU programmed to NAK the Status stage of SET_ADDRESS forever.
Everything continued working normally after several prevented crashes.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219532
Fixes: c311e391a7ef ("xhci: rework command timeout and cancellation,")
CC: stable(a)vger.kernel.org
Signed-off-by: Michal Pecio <michal.pecio(a)gmail.com>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Link: https://lore.kernel.org/r/20241227120142.1035206-4-mathias.nyman@linux.inte…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 09b05a62375e..dfe1a676d487 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -422,7 +422,8 @@ static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci,
if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) &&
!(xhci->xhc_state & XHCI_STATE_DYING)) {
xhci->current_cmd = cur_cmd;
- xhci_mod_cmd_timer(xhci);
+ if (cur_cmd)
+ xhci_mod_cmd_timer(xhci);
xhci_ring_cmd_db(xhci);
}
}
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 1e0a19912adb68a4b2b74fd77001c96cd83eb073
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020457-afternoon-avert-fefb@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 1e0a19912adb68a4b2b74fd77001c96cd83eb073 Mon Sep 17 00:00:00 2001
From: Michal Pecio <michal.pecio(a)gmail.com>
Date: Fri, 27 Dec 2024 14:01:40 +0200
Subject: [PATCH] usb: xhci: Fix NULL pointer dereference on certain command
aborts
If a command is queued to the final usable TRB of a ring segment, the
enqueue pointer is advanced to the subsequent link TRB and no further.
If the command is later aborted, when the abort completion is handled
the dequeue pointer is advanced to the first TRB of the next segment.
If no further commands are queued, xhci_handle_stopped_cmd_ring() sees
the ring pointers unequal and assumes that there is a pending command,
so it calls xhci_mod_cmd_timer() which crashes if cur_cmd was NULL.
Don't attempt timer setup if cur_cmd is NULL. The subsequent doorbell
ring likely is unnecessary too, but it's harmless. Leave it alone.
This is probably Bug 219532, but no confirmation has been received.
The issue has been independently reproduced and confirmed fixed using
a USB MCU programmed to NAK the Status stage of SET_ADDRESS forever.
Everything continued working normally after several prevented crashes.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219532
Fixes: c311e391a7ef ("xhci: rework command timeout and cancellation,")
CC: stable(a)vger.kernel.org
Signed-off-by: Michal Pecio <michal.pecio(a)gmail.com>
Signed-off-by: Mathias Nyman <mathias.nyman(a)linux.intel.com>
Link: https://lore.kernel.org/r/20241227120142.1035206-4-mathias.nyman@linux.inte…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
index 09b05a62375e..dfe1a676d487 100644
--- a/drivers/usb/host/xhci-ring.c
+++ b/drivers/usb/host/xhci-ring.c
@@ -422,7 +422,8 @@ static void xhci_handle_stopped_cmd_ring(struct xhci_hcd *xhci,
if ((xhci->cmd_ring->dequeue != xhci->cmd_ring->enqueue) &&
!(xhci->xhc_state & XHCI_STATE_DYING)) {
xhci->current_cmd = cur_cmd;
- xhci_mod_cmd_timer(xhci);
+ if (cur_cmd)
+ xhci_mod_cmd_timer(xhci);
xhci_ring_cmd_db(xhci);
}
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x f4ed93037966aea07ae6b10ab208976783d24e2e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020426-favoring-stoke-6cfa@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f4ed93037966aea07ae6b10ab208976783d24e2e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Tue, 17 Dec 2024 13:43:06 -0800
Subject: [PATCH] xfs: don't shut down the filesystem for media failures beyond
end of log
If the filesystem has an external log device on pmem and the pmem
reports a media error beyond the end of the log area, don't shut down
the filesystem because we don't use that space.
Cc: <stable(a)vger.kernel.org> # v6.0
Fixes: 6f643c57d57c56 ("xfs: implement ->notify_failure() for XFS")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index fa50e5308292..0b0b0f31aca2 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -153,6 +153,79 @@ xfs_dax_notify_failure_thaw(
thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}
+static int
+xfs_dax_translate_range(
+ struct xfs_buftarg *btp,
+ u64 offset,
+ u64 len,
+ xfs_daddr_t *daddr,
+ uint64_t *bblen)
+{
+ u64 dev_start = btp->bt_dax_part_off;
+ u64 dev_len = bdev_nr_bytes(btp->bt_bdev);
+ u64 dev_end = dev_start + dev_len - 1;
+
+ /* Notify failure on the whole device. */
+ if (offset == 0 && len == U64_MAX) {
+ offset = dev_start;
+ len = dev_len;
+ }
+
+ /* Ignore the range out of filesystem area */
+ if (offset + len - 1 < dev_start)
+ return -ENXIO;
+ if (offset > dev_end)
+ return -ENXIO;
+
+ /* Calculate the real range when it touches the boundary */
+ if (offset > dev_start)
+ offset -= dev_start;
+ else {
+ len -= dev_start - offset;
+ offset = 0;
+ }
+ if (offset + len - 1 > dev_end)
+ len = dev_end - offset + 1;
+
+ *daddr = BTOBB(offset);
+ *bblen = BTOBB(len);
+ return 0;
+}
+
+static int
+xfs_dax_notify_logdev_failure(
+ struct xfs_mount *mp,
+ u64 offset,
+ u64 len,
+ int mf_flags)
+{
+ xfs_daddr_t daddr;
+ uint64_t bblen;
+ int error;
+
+ /*
+ * Return ENXIO instead of shutting down the filesystem if the failed
+ * region is beyond the end of the log.
+ */
+ error = xfs_dax_translate_range(mp->m_logdev_targp,
+ offset, len, &daddr, &bblen);
+ if (error)
+ return error;
+
+ /*
+ * In the pre-remove case the failure notification is attempting to
+ * trigger a force unmount. The expectation is that the device is
+ * still present, but its removal is in progress and can not be
+ * cancelled, proceed with accessing the log device.
+ */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ return 0;
+
+ xfs_err(mp, "ondisk log corrupt, shutting down fs!");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+}
+
static int
xfs_dax_notify_ddev_failure(
struct xfs_mount *mp,
@@ -263,8 +336,9 @@ xfs_dax_notify_failure(
int mf_flags)
{
struct xfs_mount *mp = dax_holder(dax_dev);
- u64 ddev_start;
- u64 ddev_end;
+ xfs_daddr_t daddr;
+ uint64_t bblen;
+ int error;
if (!(mp->m_super->s_flags & SB_BORN)) {
xfs_warn(mp, "filesystem is not ready for notify_failure()!");
@@ -279,17 +353,7 @@ xfs_dax_notify_failure(
if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
mp->m_logdev_targp != mp->m_ddev_targp) {
- /*
- * In the pre-remove case the failure notification is attempting
- * to trigger a force unmount. The expectation is that the
- * device is still present, but its removal is in progress and
- * can not be cancelled, proceed with accessing the log device.
- */
- if (mf_flags & MF_MEM_PRE_REMOVE)
- return 0;
- xfs_err(mp, "ondisk log corrupt, shutting down fs!");
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
- return -EFSCORRUPTED;
+ return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
}
if (!xfs_has_rmapbt(mp)) {
@@ -297,33 +361,12 @@ xfs_dax_notify_failure(
return -EOPNOTSUPP;
}
- ddev_start = mp->m_ddev_targp->bt_dax_part_off;
- ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+ error = xfs_dax_translate_range(mp->m_ddev_targp, offset, len, &daddr,
+ &bblen);
+ if (error)
+ return error;
- /* Notify failure on the whole device. */
- if (offset == 0 && len == U64_MAX) {
- offset = ddev_start;
- len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
- }
-
- /* Ignore the range out of filesystem area */
- if (offset + len - 1 < ddev_start)
- return -ENXIO;
- if (offset > ddev_end)
- return -ENXIO;
-
- /* Calculate the real range when it touches the boundary */
- if (offset > ddev_start)
- offset -= ddev_start;
- else {
- len -= ddev_start - offset;
- offset = 0;
- }
- if (offset + len - 1 > ddev_end)
- len = ddev_end - offset + 1;
-
- return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
- mf_flags);
+ return xfs_dax_notify_ddev_failure(mp, daddr, bblen, mf_flags);
}
const struct dax_holder_operations xfs_dax_holder_operations = {
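A worked example of the clamping in xfs_dax_translate_range() above
(editorial note; the numbers are hypothetical):

/*
 * Suppose dev_start = 4096 and dev_len = 8192, so dev_end = 12287.
 * A failure at offset = 0, len = 6144 overlaps both the area before
 * the device and its first 2048 bytes:
 *
 *	offset (0) <= dev_start (4096), so:
 *		len    -= dev_start - offset;	// len = 2048
 *		offset  = 0;			// now device-relative
 *
 *	offset + len - 1 = 2047 <= dev_end, so no tail clamping.
 *
 * Result: *daddr = BTOBB(0) = 0, *bblen = BTOBB(2048) = 4
 * (BTOBB converts bytes to 512-byte basic blocks).
 */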
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f4ed93037966aea07ae6b10ab208976783d24e2e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020426-bobtail-police-e100@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f4ed93037966aea07ae6b10ab208976783d24e2e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong(a)kernel.org>
Date: Tue, 17 Dec 2024 13:43:06 -0800
Subject: [PATCH] xfs: don't shut down the filesystem for media failures beyond
end of log
If the filesystem has an external log device on pmem and the pmem
reports a media error beyond the end of the log area, don't shut down
the filesystem because we don't use that space.
Cc: <stable(a)vger.kernel.org> # v6.0
Fixes: 6f643c57d57c56 ("xfs: implement ->notify_failure() for XFS")
Signed-off-by: "Darrick J. Wong" <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index fa50e5308292..0b0b0f31aca2 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -153,6 +153,79 @@ xfs_dax_notify_failure_thaw(
thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}
+static int
+xfs_dax_translate_range(
+ struct xfs_buftarg *btp,
+ u64 offset,
+ u64 len,
+ xfs_daddr_t *daddr,
+ uint64_t *bblen)
+{
+ u64 dev_start = btp->bt_dax_part_off;
+ u64 dev_len = bdev_nr_bytes(btp->bt_bdev);
+ u64 dev_end = dev_start + dev_len - 1;
+
+ /* Notify failure on the whole device. */
+ if (offset == 0 && len == U64_MAX) {
+ offset = dev_start;
+ len = dev_len;
+ }
+
+ /* Ignore the range out of filesystem area */
+ if (offset + len - 1 < dev_start)
+ return -ENXIO;
+ if (offset > dev_end)
+ return -ENXIO;
+
+ /* Calculate the real range when it touches the boundary */
+ if (offset > dev_start)
+ offset -= dev_start;
+ else {
+ len -= dev_start - offset;
+ offset = 0;
+ }
+ if (offset + len - 1 > dev_end)
+ len = dev_end - offset + 1;
+
+ *daddr = BTOBB(offset);
+ *bblen = BTOBB(len);
+ return 0;
+}
+
+static int
+xfs_dax_notify_logdev_failure(
+ struct xfs_mount *mp,
+ u64 offset,
+ u64 len,
+ int mf_flags)
+{
+ xfs_daddr_t daddr;
+ uint64_t bblen;
+ int error;
+
+ /*
+ * Return ENXIO instead of shutting down the filesystem if the failed
+ * region is beyond the end of the log.
+ */
+ error = xfs_dax_translate_range(mp->m_logdev_targp,
+ offset, len, &daddr, &bblen);
+ if (error)
+ return error;
+
+ /*
+ * In the pre-remove case the failure notification is attempting to
+ * trigger a force unmount. The expectation is that the device is
+ * still present, but its removal is in progress and can not be
+ * cancelled, proceed with accessing the log device.
+ */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ return 0;
+
+ xfs_err(mp, "ondisk log corrupt, shutting down fs!");
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+}
+
static int
xfs_dax_notify_ddev_failure(
struct xfs_mount *mp,
@@ -263,8 +336,9 @@ xfs_dax_notify_failure(
int mf_flags)
{
struct xfs_mount *mp = dax_holder(dax_dev);
- u64 ddev_start;
- u64 ddev_end;
+ xfs_daddr_t daddr;
+ uint64_t bblen;
+ int error;
if (!(mp->m_super->s_flags & SB_BORN)) {
xfs_warn(mp, "filesystem is not ready for notify_failure()!");
@@ -279,17 +353,7 @@ xfs_dax_notify_failure(
if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
mp->m_logdev_targp != mp->m_ddev_targp) {
- /*
- * In the pre-remove case the failure notification is attempting
- * to trigger a force unmount. The expectation is that the
- * device is still present, but its removal is in progress and
- * can not be cancelled, proceed with accessing the log device.
- */
- if (mf_flags & MF_MEM_PRE_REMOVE)
- return 0;
- xfs_err(mp, "ondisk log corrupt, shutting down fs!");
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
- return -EFSCORRUPTED;
+ return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
}
if (!xfs_has_rmapbt(mp)) {
@@ -297,33 +361,12 @@ xfs_dax_notify_failure(
return -EOPNOTSUPP;
}
- ddev_start = mp->m_ddev_targp->bt_dax_part_off;
- ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+ error = xfs_dax_translate_range(mp->m_ddev_targp, offset, len, &daddr,
+ &bblen);
+ if (error)
+ return error;
- /* Notify failure on the whole device. */
- if (offset == 0 && len == U64_MAX) {
- offset = ddev_start;
- len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
- }
-
- /* Ignore the range out of filesystem area */
- if (offset + len - 1 < ddev_start)
- return -ENXIO;
- if (offset > ddev_end)
- return -ENXIO;
-
- /* Calculate the real range when it touches the boundary */
- if (offset > ddev_start)
- offset -= ddev_start;
- else {
- len -= ddev_start - offset;
- offset = 0;
- }
- if (offset + len - 1 > ddev_end)
- len = ddev_end - offset + 1;
-
- return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
- mf_flags);
+ return xfs_dax_notify_ddev_failure(mp, daddr, bblen, mf_flags);
}
const struct dax_holder_operations xfs_dax_holder_operations = {
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x efebe42d95fbba91dca6e3e32cb9e0612eb56de5
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020458-showoff-enclosure-b449@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From efebe42d95fbba91dca6e3e32cb9e0612eb56de5 Mon Sep 17 00:00:00 2001
From: Long Li <leo.lilong(a)huawei.com>
Date: Sat, 11 Jan 2025 15:05:44 +0800
Subject: [PATCH] xfs: fix mount hang during primary superblock recovery
failure
When mounting an image containing a log with sb modifications that require
log replay, the mount process hang all the time and stack as follows:
[root@localhost ~]# cat /proc/557/stack
[<0>] xfs_buftarg_wait+0x31/0x70
[<0>] xfs_buftarg_drain+0x54/0x350
[<0>] xfs_mountfs+0x66e/0xe80
[<0>] xfs_fs_fill_super+0x7f1/0xec0
[<0>] get_tree_bdev_flags+0x186/0x280
[<0>] get_tree_bdev+0x18/0x30
[<0>] xfs_fs_get_tree+0x1d/0x30
[<0>] vfs_get_tree+0x2d/0x110
[<0>] path_mount+0xb59/0xfc0
[<0>] do_mount+0x92/0xc0
[<0>] __x64_sys_mount+0xc2/0x160
[<0>] x64_sys_call+0x2de4/0x45c0
[<0>] do_syscall_64+0xa7/0x240
[<0>] entry_SYSCALL_64_after_hwframe+0x76/0x7e
During log recovery, while updating the in-memory superblock from the
primary SB buffer, if an error is encountered, such as superblock
corruption occurs or some other reasons, we will proceed to out_release
and release the xfs_buf. However, this is insufficient because the
xfs_buf's log item has already been initialized and the xfs_buf is held
by the buffer log item as follows, the xfs_buf will not be released,
causing the mount thread to hang.
xlog_recover_do_primary_sb_buffer
xlog_recover_do_reg_buffer
xlog_recover_validate_buf_type
xfs_buf_item_init(bp, mp)
The solution is straightforward, we simply need to allow it to be
handled by the normal buffer write process. The filesystem will be
shutdown before the submission of buffer_list in xlog_do_recovery_pass(),
ensuring the correct release of the xfs_buf as follows:
xlog_do_recovery_pass
error = xlog_recover_process
xlog_recover_process_data
xlog_recover_process_ophdr
xlog_recovery_process_trans
...
xlog_recover_buf_commit_pass2
error = xlog_recover_do_primary_sb_buffer
//Encounter error and return
if (error)
goto out_writebuf
...
out_writebuf:
xfs_buf_delwri_queue(bp, buffer_list) //add bp to list
return error
...
if (!list_empty(&buffer_list))
if (error)
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); //shutdown first
xfs_buf_delwri_submit(&buffer_list); //submit buffers in list
__xfs_buf_submit
if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log))
xfs_buf_ioend_fail(bp) //release bp correctly
Fixes: 6a18765b54e2 ("xfs: update the file system geometry after recoverying superblock buffers")
Cc: stable(a)vger.kernel.org # v6.12
Signed-off-by: Long Li <leo.lilong(a)huawei.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Signed-off-by: Carlos Maiolino <cem(a)kernel.org>
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index b05d5b81f642..05a2f6927c12 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -1087,7 +1087,7 @@ xlog_recover_buf_commit_pass2(
error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f,
current_lsn);
if (error)
- goto out_release;
+ goto out_writebuf;
/* Update the rt superblock if we have one. */
if (xfs_has_rtsb(mp) && mp->m_rtsb_bp) {
@@ -1104,6 +1104,15 @@ xlog_recover_buf_commit_pass2(
xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
}
+ /*
+ * Buffer held by buf log item during 'normal' buffer recovery must
+ * be committed through buffer I/O submission path to ensure proper
+ * release. When error occurs during sb buffer recovery, log shutdown
+ * will be done before submitting buffer list so that buffers can be
+ * released correctly through ioend failure path.
+ */
+out_writebuf:
+
/*
* Perform delayed write on the buffer. Asynchronous writes will be
* slower when taking into account all the buffers to be flushed.
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 07eae0fa67ca4bbb199ad85645e0f9dfaef931cd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020447-mastiff-cauterize-c488@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 07eae0fa67ca4bbb199ad85645e0f9dfaef931cd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch(a)lst.de>
Date: Thu, 16 Jan 2025 07:01:41 +0100
Subject: [PATCH] xfs: check for dead buffers in xfs_buf_find_insert
Commit 32dd4f9c506b ("xfs: remove a superflous hash lookup when inserting
new buffers") converted xfs_buf_find_insert to use
rhashtable_lookup_get_insert_fast and thus an operation that returns the
existing buffer when an insert would duplicate the hash key. But this
code path misses the check for a buffer with a reference count of zero,
which could lead to reusing an about to be freed buffer. Fix this by
using the same atomic_inc_not_zero pattern as xfs_buf_insert.
Fixes: 32dd4f9c506b ("xfs: remove a superflous hash lookup when inserting new buffers")
Signed-off-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Cc: stable(a)vger.kernel.org # v6.0
Signed-off-by: Carlos Maiolino <cem(a)kernel.org>
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d9636bff16ce..183428fbc607 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -655,9 +655,8 @@ xfs_buf_find_insert(
spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
- if (bp) {
+ if (bp && atomic_inc_not_zero(&bp->b_hold)) {
/* found an existing buffer */
- atomic_inc(&bp->b_hold);
spin_unlock(&bch->bc_lock);
error = xfs_buf_find_lock(bp, flags);
if (error)
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 07eae0fa67ca4bbb199ad85645e0f9dfaef931cd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025020447-stinking-untying-4604@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 07eae0fa67ca4bbb199ad85645e0f9dfaef931cd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch(a)lst.de>
Date: Thu, 16 Jan 2025 07:01:41 +0100
Subject: [PATCH] xfs: check for dead buffers in xfs_buf_find_insert
Commit 32dd4f9c506b ("xfs: remove a superflous hash lookup when inserting
new buffers") converted xfs_buf_find_insert to use
rhashtable_lookup_get_insert_fast and thus an operation that returns the
existing buffer when an insert would duplicate the hash key. But this
code path misses the check for a buffer with a reference count of zero,
which could lead to reusing an about to be freed buffer. Fix this by
using the same atomic_inc_not_zero pattern as xfs_buf_insert.
Fixes: 32dd4f9c506b ("xfs: remove a superflous hash lookup when inserting new buffers")
Signed-off-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Dave Chinner <dchinner(a)redhat.com>
Reviewed-by: Darrick J. Wong <djwong(a)kernel.org>
Cc: stable(a)vger.kernel.org # v6.0
Signed-off-by: Carlos Maiolino <cem(a)kernel.org>
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d9636bff16ce..183428fbc607 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -655,9 +655,8 @@ xfs_buf_find_insert(
spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
- if (bp) {
+ if (bp && atomic_inc_not_zero(&bp->b_hold)) {
/* found an existing buffer */
- atomic_inc(&bp->b_hold);
spin_unlock(&bch->bc_lock);
error = xfs_buf_find_lock(bp, flags);
if (error)
The current implementation sets the wMaxPacketSize of the bulk in/out
endpoints to 1024 bytes at the end of the f_midi_bind function. However,
if the first MIDI bind attempt fails and a rebind is attempted,
f_midi_bind may fail again because the previous bind left the bulk
endpoints' wMaxPacketSize at 1024 bytes, which exceeds
ep->maxpacket_limit on controllers such as dwc3 whose TX/RX FIFOs are
configured with a 512-byte maxpacket size for IN/OUT endpoints that
support HS speed only.
The term "rebind" here refers to attempting to bind the MIDI function a
second time in certain scenarios. The situations where rebinding is
considered include:
* When there is a failure in the first UDC write attempt, which may be
caused by other functions bound along with MIDI.
* A runtime composition change, for example from MIDI,ADB to MIDI, or
from MIDI to MIDI,ADB.
This commit addresses the issue by resetting wMaxPacketSize before the
endpoint claim. There is no need to reset all values in the USB endpoint
descriptor structure, as all members except wMaxPacketSize and
bEndpointAddress have predefined values.
This restores the endpoint to its expected configuration and prevents
conflicts with ep->maxpacket_limit. It also aligns with the approach
used in other function drivers, which treat endpoint descriptors as if
they were full speed before endpoint claim.
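For context, the limit that a stale descriptor runs into is enforced in
the gadget core's endpoint matching; roughly (a paraphrased sketch, not
a verbatim quote of usb_gadget_ep_match_desc()):

	/* An endpoint whose descriptor requests a larger packet size
	 * than the hardware allows is simply not matched, so
	 * usb_ep_autoconfig() returns NULL when wMaxPacketSize was left
	 * at 1024 on a controller with a 512-byte maxpacket_limit. */
	if (usb_endpoint_maxp(desc) > ep->maxpacket_limit)
		return 0;	/* endpoint rejected */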
Fixes: 46decc82ffd5 ("usb: gadget: unconditionally allocate hs/ss descriptor in bind operation")
Cc: stable(a)vger.kernel.org
Signed-off-by: Selvarasu Ganesan <selvarasu.g(a)samsung.com>
---
Changes in v2:
- Expanded changelog as per the comment from Greg KH.
- Link to v1: https://lore.kernel.org/all/20241208152322.1653-1-selvarasu.g@samsung.com/
---
drivers/usb/gadget/function/f_midi.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/drivers/usb/gadget/function/f_midi.c b/drivers/usb/gadget/function/f_midi.c
index 837fcdfa3840..9b991cf5b0f8 100644
--- a/drivers/usb/gadget/function/f_midi.c
+++ b/drivers/usb/gadget/function/f_midi.c
@@ -907,6 +907,15 @@ static int f_midi_bind(struct usb_configuration *c, struct usb_function *f)
status = -ENODEV;
+ /*
+ * Reset wMaxPacketSize with maximum packet size of FS bulk transfer before
+ * endpoint claim. This ensures that the wMaxPacketSize does not exceed the
+ * limit during bind retries where configured dwc3 TX/RX FIFO's maxpacket
+ * size of 512 bytes for IN/OUT endpoints in support HS speed only.
+ */
+ bulk_in_desc.wMaxPacketSize = cpu_to_le16(64);
+ bulk_out_desc.wMaxPacketSize = cpu_to_le16(64);
+
/* allocate instance-specific endpoints */
midi->in_ep = usb_ep_autoconfig(cdev->gadget, &bulk_in_desc);
if (!midi->in_ep)
--
2.17.1
From: Puranjay Mohan <pjy(a)amazon.com>
[ Upstream commit 7c2fd76048e95dd267055b5f5e0a48e6e7c81fd9 ]
On an NVMe namespace that does not support metadata, it is possible to
send an IO command with metadata through io-passthru. This allows issues
like [1] to trigger in the completion code path.
nvme_map_user_request() doesn't check if the namespace supports metadata
before sending it forward. It also allows admin commands with metadata to
be processed as it ignores metadata when bdev == NULL and may report
success.
Reject an IO command with metadata when the NVMe namespace doesn't
support it and reject an admin command if it has metadata.
[1] https://lore.kernel.org/all/mb61pcylvnym8.fsf@amazon.com/
Suggested-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Sagi Grimberg <sagi(a)grimberg.me>
Reviewed-by: Anuj Gupta <anuj20.g(a)samsung.com>
Signed-off-by: Keith Busch <kbusch(a)kernel.org>
[ Minor changes to make it work on 6.1 ]
Signed-off-by: Puranjay Mohan <pjy(a)amazon.com>
Signed-off-by: Hagar Hemdan <hagarhem(a)amazon.com>
---
Resend as all stables contain the fix except 6.1.
---
drivers/nvme/host/ioctl.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 875dee6ecd40..19a7f0160618 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -3,6 +3,7 @@
* Copyright (c) 2011-2014, Intel Corporation.
* Copyright (c) 2017-2021 Christoph Hellwig.
*/
+#include <linux/blk-integrity.h>
#include <linux/ptrace.h> /* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h>
#include <linux/io_uring.h>
@@ -171,10 +172,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
struct request_queue *q = req->q;
struct nvme_ns *ns = q->queuedata;
struct block_device *bdev = ns ? ns->disk->part0 : NULL;
+ bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
+ bool has_metadata = meta_buffer && meta_len;
struct bio *bio = NULL;
void *meta = NULL;
int ret;
+ if (has_metadata && !supports_metadata)
+ return -EINVAL;
+
if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
struct iov_iter iter;
@@ -198,7 +204,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
if (bdev)
bio_set_dev(bio, bdev);
- if (bdev && meta_buffer && meta_len) {
+ if (has_metadata) {
meta = nvme_add_user_metadata(req, meta_buffer, meta_len,
meta_seed);
if (IS_ERR(meta)) {
--
2.40.1
From: Parth Pancholi <parth.pancholi(a)toradex.com>
Replace lz4c with lz4 for kernel image compression.
Although lz4 and lz4c are functionally similar, lz4c has been deprecated
upstream since 2018. Since as early as Ubuntu 16.04 and Fedora 25, lz4
and lz4c have been packaged together, making it safe to update the
requirement from lz4c to lz4.
Consequently, some distributions and build systems, such as OpenEmbedded,
have fully transitioned to using lz4. OpenEmbedded core adopted this
change in commit fe167e082cbd ("bitbake.conf: require lz4 instead of
lz4c"), causing compatibility issues when building the mainline kernel
in the latest OpenEmbedded environment, as seen in the errors below.
This change also updates the LZ4 compression commands for backward
compatibility by replacing the 'stdin' and 'stdout' keywords with the
'-' option; for some unclear reason the 'stdout' keyword does not work
with lz4 while '-' works with both tools. In addition, it replaces the
legacy '-c1' option with '-9', which both tools also accept. This fixes
the mainline kernel build failures with the latest master OpenEmbedded
builds caused by the compatibility issues mentioned above.
LZ4 arch/arm/boot/compressed/piggy_data
/bin/sh: 1: lz4c: not found
...
...
ERROR: oe_runmake failed
Cc: stable(a)vger.kernel.org
Link: https://github.com/lz4/lz4/pull/553
Suggested-by: Francesco Dolcini <francesco.dolcini(a)toradex.com>
Signed-off-by: Parth Pancholi <parth.pancholi(a)toradex.com>
---
v2: correct the compression command line to make it compatible with lz4
v1: https://lore.kernel.org/all/20241112150006.265900-1-parth105105@gmail.com/
---
Makefile | 2 +-
scripts/Makefile.lib | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/Makefile b/Makefile
index 79192a3024bf..7630f763f5b2 100644
--- a/Makefile
+++ b/Makefile
@@ -508,7 +508,7 @@ KGZIP = gzip
KBZIP2 = bzip2
KLZOP = lzop
LZMA = lzma
-LZ4 = lz4c
+LZ4 = lz4
XZ = xz
ZSTD = zstd
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 01a9f567d5af..fe5e132fcea8 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -371,10 +371,10 @@ quiet_cmd_lzo_with_size = LZO $@
cmd_lzo_with_size = { cat $(real-prereqs) | $(KLZOP) -9; $(size_append); } > $@
quiet_cmd_lz4 = LZ4 $@
- cmd_lz4 = cat $(real-prereqs) | $(LZ4) -l -c1 stdin stdout > $@
+ cmd_lz4 = cat $(real-prereqs) | $(LZ4) -l -9 - - > $@
quiet_cmd_lz4_with_size = LZ4 $@
- cmd_lz4_with_size = { cat $(real-prereqs) | $(LZ4) -l -c1 stdin stdout; \
+ cmd_lz4_with_size = { cat $(real-prereqs) | $(LZ4) -l -9 - -; \
$(size_append); } > $@
# U-Boot mkimage
--
2.34.1
This series fixes oopses on Alpha/SMP observed since kernel v6.9. [1]
Thanks to Magnus Lindholm for identifying that remarkably longstanding
bug.
The problem is that GCC expects 16-byte alignment of the incoming stack
since early 2004, as Maciej found out [2]:
Having actually dug speculatively I can see that the psABI was changed in
GCC 3.5 with commit e5e10fb4a350 ("re PR target/14539 (128-bit long double
improperly aligned)") back in Mar 2004, when the stack pointer alignment
was increased from 8 bytes to 16 bytes, and arch/alpha/kernel/entry.S has
various suspicious stack pointer adjustments, starting with SP_OFF which
is not a whole multiple of 16.
Also, as Magnus noted, "ALPHA Calling Standard" [3] required the same:
D.3.1 Stack Alignment
This standard requires that stacks be octaword aligned at the time a
new procedure is invoked.
However:
- the "normal" kernel stack is always misaligned by 8 bytes, thanks to
the odd number of 64-bit words in 'struct pt_regs', which is the very
first thing pushed onto the kernel thread stack;
- syscall, fault, interrupt etc. handlers may, or may not, receive aligned
stack depending on numerous factors.
Somehow we got away with it until recently, when we ended up with
a stack corruption in kernel/smp.c:smp_call_function_single() due to
its use of 32-byte aligned local data and the compiler doing clever
things allocating it on the stack.
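A minimal user-space illustration of the first point (hypothetical
sizes, not the actual alpha pt_regs layout):

#include <stdio.h>

/* An odd number of 64-bit words, as with alpha's 'struct pt_regs'. */
struct frame { unsigned long regs[31]; };	/* 31 * 8 = 248 bytes */

int main(void)
{
	unsigned long sp = 0x120000000UL;	/* 16-byte aligned */

	sp -= sizeof(struct frame);		/* push the register frame */
	printf("sp %% 16 = %lu\n", sp % 16);	/* prints 8: misaligned */
	return 0;
}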
Patches 1-2 are preparatory; 3 - the main fix; 4 - fixes remaining
special cases.
Ivan.
[1] https://lore.kernel.org/rcu/CA+=Fv5R9NG+1SHU9QV9hjmavycHKpnNyerQ=Ei90G98ukR…
[2] https://lore.kernel.org/rcu/alpine.DEB.2.21.2501130248010.18889@angie.orcam…
[3] https://bitsavers.org/pdf/dec/alpha/Alpha_Calling_Standard_Rev_2.0_19900427…
---
Changes in v2:
- patch #1: provide empty 'struct pt_regs' to fix compile failure in libbpf,
reported by John Paul Adrian Glaubitz <glaubitz(a)physik.fu-berlin.de>;
update comment and commit message accordingly;
- cc'ed <stable(a)vger.kernel.org> as older kernels ought to be fixed as well.
---
Ivan Kokshaysky (4):
alpha/uapi: do not expose kernel-only stack frame structures
alpha: replace hardcoded stack offsets with autogenerated ones
alpha: make stack 16-byte aligned (most cases)
alpha: align stack for page fault and user unaligned trap handlers
arch/alpha/include/asm/ptrace.h | 64 ++++++++++++++++++++++++++-
arch/alpha/include/uapi/asm/ptrace.h | 65 ++--------------------------
arch/alpha/kernel/asm-offsets.c | 4 ++
arch/alpha/kernel/entry.S | 24 +++++-----
arch/alpha/kernel/traps.c | 2 +-
arch/alpha/mm/fault.c | 4 +-
6 files changed, 83 insertions(+), 80 deletions(-)
--
2.39.5
From: Aradhya Bhatia <a-bhatia1(a)ti.com>
The driver code doesn't have a Phy de-initialization path as yet, and so
it does not clear the phy_initialized flag while suspending. This is a
problem because after resume the driver looks at this flag to determine
if a Phy re-initialization is required or not. It is in fact required
because the hardware is resuming from a suspend, but the driver does not
carry out any re-initialization causing the D-Phy to not work at all.
Call the counterparts of phy_init() and phy_power_on(), that are
phy_exit() and phy_power_off(), from _bridge_post_disable(), and clear
the flags so that the Phy can be initialized again when required.
Fixes: fced5a364dee ("drm/bridge: cdns: Convert to phy framework")
Cc: Stable List <stable(a)vger.kernel.org>
Signed-off-by: Aradhya Bhatia <a-bhatia1(a)ti.com>
Signed-off-by: Aradhya Bhatia <aradhya.bhatia(a)linux.dev>
---
drivers/gpu/drm/bridge/cadence/cdns-dsi-core.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/bridge/cadence/cdns-dsi-core.c b/drivers/gpu/drm/bridge/cadence/cdns-dsi-core.c
index 2f897ea5e80a..b0a1a6774ea6 100644
--- a/drivers/gpu/drm/bridge/cadence/cdns-dsi-core.c
+++ b/drivers/gpu/drm/bridge/cadence/cdns-dsi-core.c
@@ -680,6 +680,11 @@ static void cdns_dsi_bridge_post_disable(struct drm_bridge *bridge)
struct cdns_dsi_input *input = bridge_to_cdns_dsi_input(bridge);
struct cdns_dsi *dsi = input_to_dsi(input);
+ dsi->phy_initialized = false;
+ dsi->link_initialized = false;
+ phy_power_off(dsi->dphy);
+ phy_exit(dsi->dphy);
+
pm_runtime_put(dsi->base.dev);
}
@@ -1152,7 +1157,6 @@ static int __maybe_unused cdns_dsi_suspend(struct device *dev)
clk_disable_unprepare(dsi->dsi_sys_clk);
clk_disable_unprepare(dsi->dsi_p_clk);
reset_control_assert(dsi->dsi_p_rst);
- dsi->link_initialized = false;
return 0;
}
--
2.34.1
Dear Prof./Dr. Table,
e-Informatica Software Engineering Journal (EISEJ) is an established,
peer-reviewed, ISI JCR-indexed journal with an Impact Factor (IF=1.2,
5 Year IF=1.3) calculated by Clarivate and a free-of-charge, open-access
publishing model that accepts papers in Software Engineering, including
the intersection of data science (AI/ML) and Software Engineering.
Since you have published your research results at premier software
engineering conferences, such as MSR24, we believe that
EISEJ can serve as an excellent platform to share your impactful work further,
connecting it with a broad, engaged audience and enhancing your visibility
even more.
Why choose EISEJ for your next publication?
- Reputation and Trust: Indexed in ISI Web of Science (IF=1.2, 5 Year
IF=1.3), Scopus (CiteScore'23=2.6, CiteScoreTracker'24=3.3 and growing),
DBLP, DOAJ, and Google Scholar, demonstrating our global reach and
credibility.
- No Author Fees: Publish your work under a CC-BY license without any costs,
making your research accessible worldwide.
- Global Reach and Recognition: Internationally renowned Editorial Board:
https://www.e-informatyka.pl/index.php/einformatica/editorial-board/ .
- Efficient Publishing Process: Benefit from continuous, rapid publishing
with immediate online availability post-acceptance.
- No Restrictions on Paper Length: We welcome comprehensive work without
word or page limits.
What We Publish:
In addition to regular research papers, EISEJ is an ideal venue for:
- Systematic reviews, mapping studies, and survey research studies,
and supporting tools, see, e.g.,
[1] Norsaremah Salleh, Emilia Mendes, Fabiana Mendes, Charitha Dissanayake
Lekamlage and Kai Petersen, "Value-based Software Engineering: A Systematic
Mapping Study", In e-Informatica Software Engineering Journal, vol. 17, no.
1, pp. 230106, 2023. DOI: 10.37190/e-Inf230106.
[2] Chris Marshall, Barbara Kitchenham and Pearl Brereton, "Tool Features to
Support Systematic Reviews in Software Engineering – A Cross Domain Study",
In e-Informatica Software Engineering Journal, vol. 12, no. 1, pp. 79–115,
2018. DOI: 10.5277/e-Inf180104.
- Guidelines, see, e.g.,
[3] Miroslaw Staron, "Guidelines for Conducting Action Research Studies in
Software Engineering", In e-Informatica Software Engineering Journal, vol.
19, no. 1, pp. 250105, 2025. DOI: 10.37190/e-Inf250105.
[4] Muhammad Usman, Nauman Bin Ali and Claes Wohlin, "A Quality Assessment
Instrument for Systematic Literature Reviews in Software Engineering", In
e-Informatica Software Engineering Journal, vol. 17, no. 1, pp. 230105,
2023. DOI: 10.37190/e-Inf230105.
- Research agendas and vision papers, see, e.g.,
[5] Michael Unterkalmsteiner, Pekka Abrahamsson, XiaoFeng Wang, Anh
Nguyen-Duc, Syed Shah, Sohaib Shahid Bajwa, Guido H. Baltes, Kieran Conboy,
Eoin Cullina, Denis Dennehy, Henry Edison, Carlos Fernandez-Sanchez, Juan
Garbajosa, Tony Gorschek, Eriks Klotins, Laura Hokkanen, Fabio Kon, Ilaria
Lunesu, Michele Marchesi, Lorraine Morgan, Markku Oivo, Christoph Selig,
Pertti Seppänen, Roger Sweetman, Pasi Tyrväinen, Christina Ungerer and
Agustin Yagüe, "Software Startups – A Research Agenda", In e-Informatica
Software Engineering Journal, vol. 10, no. 1, pp. 89–124, 2016. DOI:
10.5277/e-Inf160105.
[6] Einav Peretz-Andersson and Richard Torkar, "Empirical AI Transformation
Research: A Systematic Mapping Study and Future Agenda", In e-Informatica
Software Engineering Journal, vol. 16, no. 1, pp. 220108, 2022. DOI:
10.37190/e-Inf220108.
- Interdisciplinary work at the intersection of Software Engineering and
AI/ML, see, e.g.,
[7] Mirosław Ochodek and Miroslaw Staron, "ACoRA – A Platform for Automating
Code Review Tasks", In e-Informatica Software Engineering Journal, vol. 19,
no. 1, pp. 250102, 2025. DOI: 10.37190/e-Inf250102.
We are also open to proposals for special sections that reflect the latest
trends and challenges in the field. If you have ideas for collaboration or
suggestions on how we can better meet researchers’ needs, we welcome
your feedback at e-informatica(a)pwr.edu.pl .
Explore our journal at: https://www.e-informatyka.pl/ , and start your
submission process at https://mc.manuscriptcentral.com/e-InformaticaSEJ .
We recommend using our latest LaTeX template available at
https://www.e-informatyka.pl/index.php/einformatica/authors-guide/paper-sub…
With best regards,
Lech Madeyski & Miroslaw Ochodek
Editors-in-Chief, e-Informatica Software Engineering Journal (EISEJ)
---
EISEJ is indexed by:
- ISI WoS with Impact Factor (IF=1.2, 5 Year IF=1.3) calculated by Clarivate:
https://jcr.clarivate.com/jcr-jp/journal-profile?journal=E-INFORMATICA&year…
- Scopus (with CiteScore=2.6): https://www.scopus.com/sourceid/21100259509
- DBLP: https://dblp.uni-trier.de/db/journals/eInformatica/index.html
- Directory of Open Access Journals (DOAJ): https://www.doaj.org/toc/2084-4840
- Google Scholar: https://scholar.google.pl/citations?user=8-uDLDoAAAAJ&hl
---
Opt out: If you do not want to receive further e-mails from us, please use this link:
https://listmonk.e-informatyka.pl/subscription/c3d88966-bba6-42ee-af4c-3eab…
A recent LLVM commit [1] started generating an .ARM.attributes section
similar to the one that exists for 32-bit, which results in orphan
section warnings (or errors if CONFIG_WERROR is enabled) from the linker
because it is not handled in the arm64 linker scripts.
ld.lld: error: arch/arm64/kernel/vdso/vgettimeofday.o:(.ARM.attributes) is being placed in '.ARM.attributes'
ld.lld: error: arch/arm64/kernel/vdso/vgetrandom.o:(.ARM.attributes) is being placed in '.ARM.attributes'
ld.lld: error: vmlinux.a(lib/vsprintf.o):(.ARM.attributes) is being placed in '.ARM.attributes'
ld.lld: error: vmlinux.a(lib/win_minmax.o):(.ARM.attributes) is being placed in '.ARM.attributes'
ld.lld: error: vmlinux.a(lib/xarray.o):(.ARM.attributes) is being placed in '.ARM.attributes'
Add this new section to the necessary linker scripts to resolve the warnings.
Cc: stable(a)vger.kernel.org
Fixes: b3e5d80d0c48 ("arm64/build: Warn on orphan section placement")
Link: https://github.com/llvm/llvm-project/commit/ee99c4d4845db66c4daa2373352133f… [1]
Signed-off-by: Nathan Chancellor <nathan(a)kernel.org>
---
arch/arm64/kernel/vdso/vdso.lds.S | 1 +
arch/arm64/kernel/vmlinux.lds.S | 1 +
2 files changed, 2 insertions(+)
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
index 4ec32e86a8da..f8418a3a2758 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -75,6 +75,7 @@ SECTIONS
DWARF_DEBUG
ELF_DETAILS
+ .ARM.attributes 0 : { *(.ARM.attributes) }
/DISCARD/ : {
*(.data .data.* .gnu.linkonce.d.* .sdata*)
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index f84c71f04d9e..c94942e9eb46 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -335,6 +335,7 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
ELF_DETAILS
+ .ARM.attributes 0 : { *(.ARM.attributes) }
HEAD_SYMBOLS
---
base-commit: 1dd3393696efba1598aa7692939bba99d0cffae3
change-id: 20250123-arm64-handle-arm-attributes-in-linker-script-82aee25313ac
Best regards,
--
Nathan Chancellor <nathan(a)kernel.org>
Hi -
May I request that you apply
961b4b5e86bf ("NFSD: Reset cb_seq_status after NFS4ERR_DELAY")
to all LTS kernels that don't have it?
I've checked that it applies cleanly to at least v6.1 and run
it through NFSD CI. It should not be a problem to apply it
elsewhere.
--
Chuck Lever
From: "Yiren Xie" <1534428646(a)qq.com>
There is an obvious conflict between the code and the comment.
The function aarch64_insn_is_steppable() is used to check whether an
MRS instruction is safe in a single-stepping environment. The comment
says that only reading the DAIF bits via MRS is safe in a
single-stepping environment and that other MRS instructions are not. So
aarch64_insn_is_steppable() should return "TRUE" if the MRS instruction
being single-stepped reads the DAIF bits.
This has been verified using a kprobe kernel module that reads the DAIF
bits via arch_local_irq_save() with the probe offset set to 0x4.
Without this modification, inserting the module fails with
"kprobe_init: register_kprobe failed, returned -22"; with it, the DAIF
bits can be read in a single-stepping environment.
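A sketch of such a test module (reconstructed from the description
above; the probed symbol and offset are assumptions about this
particular build, not taken from the actual module):

#include <linux/module.h>
#include <linux/kprobes.h>

/* Probe 0x4 bytes into arch_local_irq_save(), where the MRS read of
 * DAIF was observed to land. Before the fix, register_kprobe() fails
 * with -EINVAL (-22); after it, the probe arms and the MRS instruction
 * is single-stepped. */
static struct kprobe kp = {
	.symbol_name	= "arch_local_irq_save",
	.offset		= 0x4,
};

static int __init kprobe_test_init(void)
{
	int ret = register_kprobe(&kp);

	if (ret < 0)
		pr_err("kprobe_init: register_kprobe failed, returned %d\n", ret);
	return ret;
}

static void __exit kprobe_test_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kprobe_test_init);
module_exit(kprobe_test_exit);
MODULE_LICENSE("GPL");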
Fixes: 2dd0e8d2d2a1 ("arm64: Kprobes with single stepping support")
Cc: stable(a)vger.kernel.org
Signed-off-by: Yiren Xie <1534428646(a)qq.com>
---
arch/arm64/kernel/probes/decode-insn.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c
index 6438bf62e753..22383eb1c22c 100644
--- a/arch/arm64/kernel/probes/decode-insn.c
+++ b/arch/arm64/kernel/probes/decode-insn.c
@@ -40,7 +40,7 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn)
*/
if (aarch64_insn_is_mrs(insn))
return aarch64_insn_extract_system_reg(insn)
- != AARCH64_INSN_SPCLREG_DAIF;
+ == AARCH64_INSN_SPCLREG_DAIF;
/*
* The HINT instruction is steppable only if it is in whitelist
--
2.34.1
Hi All,
+stable
There seem to be some formatting issues in my log output. I have
attached it as a file.
Thanks & Regards,
Harshvardhan
On 29/01/25 1:57 PM, Harshvardhan Jha wrote:
> Hello there,
>
> The stable tag v5.4.289 seems to fail to boot with following trace:
>
> [ OK ] Created slice system-serial\x2dgetty.slice. [ OK ] Listening on
> udev Control Socket. [ OK ] Reached target Local Encrypted Volumes. [ OK
> ] Listening on /dev/initctl Compatibility Named Pipe. [ OK ] Listening
> on Delayed Shutdown Socket. [ OK ] Created slice
> system-selinux\x2dpol...grate\x2dlocal\x2dchanges.slice. [ OK ] Stopped
> target Initrd File Systems. [ OK ] Listening on LVM2 metadata daemon
> socket. Mounting Debug File System... [ OK ] Listening on networkd
> rtnetlink socket. [ OK ] Listening on Device-mapper event daemon FIFOs.
> Starting Monitoring of LVM2 mirrors... dmeventd or progress polling... [
> OK ] Created slice User and Session Slice. Starting Read and set NIS
> domainname from /etc/sysconfig/network... Starting Load legacy module
> configuration... [ OK ] Set up automount Arbitrary Executab...ats File
> System Automount Point. [ OK ] Created slice
> system-rdma\x2dload\x2dmodules.slice. [ OK ] Started Forward Password
> Requests to Wall Directory Watch. [ OK ] Reached target Paths. Starting
> Remount Root and Kernel File Systems... [ OK ] Created slice
> system-getty.slice. Starting Create list of required st... nodes for the
> current kernel... Starting Set Up Additional Binary Formats... [ OK ]
> Stopped target Initrd Root File System. [ OK ] Reached target Slices. [
> OK ] Created slice system-systemd\x2dfsck.slice. [ OK ] Mounted POSIX
> Message Queue File System. [ OK ] Mounted Debug File System. [ OK ]
> Started Read and set NIS domainname from /etc/sysconfig/network.
> Mounting Arbitrary Executable File Formats File System... [ OK ] Started
> Journal Service. [ OK ] Started Create list of required sta...ce nodes
> for the current kernel. Starting Create Static Device Nodes in /dev... [
> OK ] Started LVM2 metadata daemon. [ OK ] Started Remount Root and
> Kernel File Systems. Starting Load/Save Random Seed... Starting udev
> Coldplug all Devices... Starting Flush Journal to Persistent Storage...
> [FAILED] Failed to start Load Kernel Modules. See 'systemctl status
> systemd-modules-load.service' for details. Starting Apply Kernel
> Variables... [ OK ] Started Load/Save Random Seed. [ OK ] Mounted
> Arbitrary Executable File Formats File System. [ OK ] Started Set Up
> Additional Binary Formats. [ OK ] Started Create Static Device Nodes in
> /dev. Starting udev Kernel Device Manager... [ OK ] Started Apply Kernel
> Variables. [ OK ] Started Load legacy module configuration. [ OK ]
> Started udev Coldplug all Devices. Starting udev Wait for Complete
> Device Initialization... [ 24.427217] megaraid_sas 0000:65:00.0:
> megasas_build_io_fusion 3273 sge_count (-12) is out of range. Range is:
> 0-256
>
> [ 24.427217] megaraid_sas 0000:65:00.0: megasas_build_io_fusion 3273
> sge_count (-12) is out of range. Range is: 0-256 kept showing
> infinitely. It kept popping until I reset the system.
>
> Reverting the following patch seems to fix the issue:
>
> commit 07c9cccc4c3fecba175a7e5aafba6370758f5ce2
> Author: Juergen Gross <jgross(a)suse.com>
> Date: Fri Sep 13 12:05:02 2024 +0200
>
> xen/swiotlb: add alignment check for dma buffers
>
> [ Upstream commit 9f40ec84a7976d95c34e7cc070939deb103652b0 ]
>
> When checking a memory buffer to be consecutive in machine memory,
> the alignment needs to be checked, too. Failing to do so might result
> in DMA memory not being aligned according to its requested size,
> leading to error messages like:
>
> 4xxx 0000:2b:00.0: enabling device (0140 -> 0142)
> 4xxx 0000:2b:00.0: Ring address not aligned
> 4xxx 0000:2b:00.0: Failed to initialise service qat_crypto
> 4xxx 0000:2b:00.0: Resetting device qat_dev0
> 4xxx: probe of 0000:2b:00.0 failed with error -14
>
> Fixes: 9435cce87950 ("xen/swiotlb: Add support for 64KB page
> granularity")
> Signed-off-by: Juergen Gross <jgross(a)suse.com>
> Reviewed-by: Stefano Stabellini <sstabellini(a)kernel.org>
> Signed-off-by: Juergen Gross <jgross(a)suse.com>
> Signed-off-by: Sasha Levin <sashal(a)kernel.org>
>
> I tried changing swiotlb grub command line arguments but that didn't
> seem to help much unfortunately and the error was seen again.
>
> Thanks & Regards,
> Harshvardhan
>
Hello,
New build issue found on stable-rc/linux-5.4.y:
expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘__free’ in
drivers/soc/atmel/soc.o (drivers/soc/atmel/soc.c)
[logspec:kbuild,kbuild.compiler.error]
- Dashboard: https://staging.dashboard.kernelci.org:9000/issue/maestro:040dcc33328a47e52…
- Grafana: https://grafana.kernelci.org/d/issue/issue?var-id=maestro:040dcc33328a47e52…
Log excerpt:
drivers/soc/atmel/soc.c:277:32: error: expected ‘=’, ‘,’, ‘;’, ‘asm’
or ‘__attribute__’ before ‘__free’
277 | struct device_node *np __free(device_node) =
of_find_node_by_path("/");
| ^~~~~~
drivers/soc/atmel/soc.c:277:32: error: implicit declaration of
function ‘__free’; did you mean ‘kzfree’?
[-Werror=implicit-function-declaration]
277 | struct device_node *np __free(device_node) =
of_find_node_by_path("/");
| ^~~~~~
| kzfree
drivers/soc/atmel/soc.c:277:39: error: ‘device_node’ undeclared (first
use in this function)
277 | struct device_node *np __free(device_node) =
of_find_node_by_path("/");
| ^~~~~~~~~~~
drivers/soc/atmel/soc.c:277:39: note: each undeclared identifier is
reported only once for each function it appears in
drivers/soc/atmel/soc.c:279:51: error: ‘np’ undeclared (first use in
this function); did you mean ‘nop’?
279 | if (!of_match_node(at91_soc_allowed_list, np))
| ^~
| nop
CC drivers/spi/spidev.o
cc1: some warnings being treated as errors
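For context (an editorial note, not part of the report): __free() is
the scope-based cleanup helper from <linux/cleanup.h>, which newer
kernels provide but linux-5.4.y does not, so a backport using it fails
to build there. A rough sketch of the mechanism:

/* Simplified from newer kernels' <linux/cleanup.h> (approximate). */
#define __free(_name) __attribute__((__cleanup__(__free_##_name)))

/* DEFINE_FREE(device_node, struct device_node *,
 *             if (_T) of_node_put(_T))
 * generates, approximately: */
static inline void __free_device_node(struct device_node **p)
{
	if (*p)
		of_node_put(*p);
}

/* The failing line thus declares an auto variable that drops its
 * reference when it goes out of scope:
 *	struct device_node *np __free(device_node) =
 *		of_find_node_by_path("/");
 */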
# Builds where the incident occurred:
## multi_v7_defconfig(gcc-12):
- Dashboard: https://staging.dashboard.kernelci.org:9000/build/maestro:67a119e4661a7bc87…
## multi_v5_defconfig(gcc-12):
- Dashboard: https://staging.dashboard.kernelci.org:9000/build/maestro:67a119e0661a7bc87…
## multi_v7_defconfig(gcc-12):
- Dashboard: https://staging.dashboard.kernelci.org:9000/build/maestro:67a119d8661a7bc87…
#kernelci issue maestro:040dcc33328a47e528c393a656a52b340b4ccc8b
--
This is an experimental report format. Please send feedback in!
Talk to us at kernelci(a)lists.linux.dev
Made with love by the KernelCI team - https://kernelci.org
OP-TEE supplicant is a user-space daemon and it's possible for it to
be hung, crashed, or killed in the middle of processing an OP-TEE RPC
call. It becomes more complicated when the supplicant process and the
OP-TEE client application are shut down in the wrong order, which can
eventually leave the system hanging while waiting for the closure of
the client application.
Allow the client process waiting in kernel for supplicant response to
be killed rather than indefinitely waiting in an unkillable state. Also,
a normal uninterruptible wait should not have resulted in the hung-task
watchdog getting triggered, but the endless loop would.
This fixes issues observed during system reboot/shutdown when the
supplicant hung for some reason, or crashed or was killed, leaving the
client hung in an unkillable state. That in turn left the system hung,
requiring a hard power off/on to recover.
Fixes: 4fb0a5eb364d ("tee: add OP-TEE driver")
Suggested-by: Arnd Bergmann <arnd(a)arndb.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Sumit Garg <sumit.garg(a)linaro.org>
---
Changes in v3:
- Use mutex_lock() instead of mutex_lock_killable().
- Update commit message to incorporate Arnd's feedback.
Changes in v2:
- Switch to a killable wait, as suggested by Arnd, instead of a
  supplicant timeout. This at least allows the client to wait for the
  supplicant in a killable state, which in turn allows the system to
  reboot or shut down gracefully.
drivers/tee/optee/supp.c | 35 ++++++++---------------------------
1 file changed, 8 insertions(+), 27 deletions(-)
diff --git a/drivers/tee/optee/supp.c b/drivers/tee/optee/supp.c
index 322a543b8c27..d0f397c90242 100644
--- a/drivers/tee/optee/supp.c
+++ b/drivers/tee/optee/supp.c
@@ -80,7 +80,6 @@ u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params,
struct optee *optee = tee_get_drvdata(ctx->teedev);
struct optee_supp *supp = &optee->supp;
struct optee_supp_req *req;
- bool interruptable;
u32 ret;
/*
@@ -111,36 +110,18 @@ u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params,
/*
* Wait for supplicant to process and return result, once we've
* returned from wait_for_completion(&req->c) successfully we have
- * exclusive access again.
+ * exclusive access again. Allow the wait to be killable such that
+ * the wait doesn't turn into an indefinite state if the supplicant
+ * gets hung for some reason.
*/
- while (wait_for_completion_interruptible(&req->c)) {
+ if (wait_for_completion_killable(&req->c)) {
mutex_lock(&supp->mutex);
- interruptable = !supp->ctx;
- if (interruptable) {
- /*
- * There's no supplicant available and since the
- * supp->mutex currently is held none can
- * become available until the mutex released
- * again.
- *
- * Interrupting an RPC to supplicant is only
- * allowed as a way of slightly improving the user
- * experience in case the supplicant hasn't been
- * started yet. During normal operation the supplicant
- * will serve all requests in a timely manner and
- * interrupting then wouldn't make sense.
- */
- if (req->in_queue) {
- list_del(&req->link);
- req->in_queue = false;
- }
+ if (req->in_queue) {
+ list_del(&req->link);
+ req->in_queue = false;
}
mutex_unlock(&supp->mutex);
-
- if (interruptable) {
- req->ret = TEEC_ERROR_COMMUNICATION;
- break;
- }
+ req->ret = TEEC_ERROR_COMMUNICATION;
}
ret = req->ret;
--
2.43.0
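The switch matters because wait_for_completion_interruptible() returns
early on any pending signal, whereas wait_for_completion_killable()
only returns early for fatal signals such as SIGKILL: the task sleeps
across ordinary signals but can still be killed. A minimal sketch of
the pattern (not the driver code itself):

#include <linux/completion.h>

/*
 * Wait for a reply, sleeping across ordinary signals but bailing out
 * if the task receives a fatal signal. Returns 0 on completion and
 * -ERESTARTSYS if the wait was killed.
 */
static int wait_for_reply(struct completion *done)
{
	if (wait_for_completion_killable(done))
		return -ERESTARTSYS;

	return 0;
}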
The gsacore registers are not accessible from the normal world.
Disable this node so that the suspend/resume callbacks in the
pinctrl driver don't cause an SError when attempting to access
the registers.
Fixes: ea89fdf24fd9 ("arm64: dts: exynos: google: Add initial Google gs101 SoC support")
Signed-off-by: Peter Griffin <peter.griffin(a)linaro.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Alim Akhtar <alim.akhtar(a)samsung.com>
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: linux-samsung-soc(a)vger.kernel.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: tudor.ambarus(a)linaro.org
Cc: andre.draszik(a)linaro.org
Cc: kernel-team(a)android.com
Cc: willmcvicker(a)google.com
Cc: stable(a)vger.kernel.org
---
arch/arm64/boot/dts/exynos/google/gs101.dtsi | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/boot/dts/exynos/google/gs101.dtsi b/arch/arm64/boot/dts/exynos/google/gs101.dtsi
index 302c5beb224a..b8f8255f840b 100644
--- a/arch/arm64/boot/dts/exynos/google/gs101.dtsi
+++ b/arch/arm64/boot/dts/exynos/google/gs101.dtsi
@@ -1451,6 +1451,7 @@ pinctrl_gsacore: pinctrl@17a80000 {
/* TODO: update once support for this CMU exists */
clocks = <0>;
clock-names = "pclk";
+ status = "disabled";
};
cmu_top: clock-controller@1e080000 {
---
base-commit: ed9a4ad6e5bd3a443e81446476718abebee47e82
change-id: 20241213-contrib-pg-pinctrl_gsacore_disable-3457c942b0fe
Best regards,
--
Peter Griffin <peter.griffin(a)linaro.org>
devm_blk_crypto_profile_init() registers a cleanup handler to run when
the associated (platform-) device is being released. For UFS, the
crypto private data and pointers are stored as part of the ufs_hba's
data structure 'struct ufs_hba::crypto_profile'. This structure is
allocated as part of the underlying ufshcd and therefore Scsi_host
allocation.
During driver release or during error handling in ufshcd_pltfrm_init(),
this structure is released as part of ufshcd_dealloc_host() before the
(platform-) device associated with the crypto call above is released.
Once this device is released, the crypto cleanup code will run, using
the just-released 'struct ufs_hba::crypto_profile'. This causes a
use-after-free situation:
Call trace:
kfree+0x60/0x2d8 (P)
kvfree+0x44/0x60
blk_crypto_profile_destroy_callback+0x28/0x70
devm_action_release+0x1c/0x30
release_nodes+0x6c/0x108
devres_release_all+0x98/0x100
device_unbind_cleanup+0x20/0x70
really_probe+0x218/0x2d0
In other words, the initialisation code flow is:
platform-device probe
ufshcd_pltfrm_init()
ufshcd_alloc_host()
scsi_host_alloc()
allocation of struct ufs_hba
creation of scsi-host devices
devm_blk_crypto_profile_init()
devm registration of cleanup handler using platform-device
and during error handling of ufshcd_pltfrm_init() or during driver
removal:
ufshcd_dealloc_host()
scsi_host_put()
put_device(scsi-host)
release of struct ufs_hba
put_device(platform-device)
crypto cleanup handler
To fix this use-after-free, change ufshcd_alloc_host() to register a
devres action that automatically cleans up the underlying SCSI device
on ufshcd destruction, without requiring explicit calls to
ufshcd_dealloc_host(). This way:
* the crypto profile and all other ufs_hba-owned resources are
destroyed before SCSI (as they've been registered after)
* a memleak is plugged in tc-dwc-g210-pci.c remove() as a
side-effect
* EXPORT_SYMBOL_GPL(ufshcd_dealloc_host) can be removed fully as
it's not needed anymore
* no future drivers using ufshcd_alloc_host() could ever forget
adding the cleanup
Fixes: cb77cb5abe1f ("blk-crypto: rename blk_keyslot_manager to blk_crypto_profile")
Fixes: d76d9d7d1009 ("scsi: ufs: use devm_blk_ksm_init()")
Cc: stable(a)vger.kernel.org
Signed-off-by: André Draszik <andre.draszik(a)linaro.org>
---
Changes in v4:
- add a kdoc note to ufshcd_alloc_host() to state why there is no
ufshcd_dealloc_host() (Mani)
- use return err, without goto (Mani)
- drop register dump and abort info from commit message (Mani)
- Link to v3: https://lore.kernel.org/r/20250116-ufshcd-fix-v3-1-6a83004ea85c@linaro.org
Changes in v3:
- rename devres action handler to ufshcd_devres_release() (Bart)
- Link to v2: https://lore.kernel.org/r/20250114-ufshcd-fix-v2-1-2dc627590a4a@linaro.org
Changes in v2:
- completely new approach using devres action for Scsi_host cleanup, to
ensure ordering
- add Fixes: and CC: stable tags (Eric)
- Link to v1: https://lore.kernel.org/r/20250113-ufshcd-fix-v1-1-ca63d1d4bd55@linaro.org
---
In my case, as per the above trace, I initially encountered an error in
ufshcd_verify_dev_init(), which made me notice this problem both during
error handling and release. To reproduce, one could change that
function to simply return an error, or rmmod the platform glue driver.
Other approaches I see for solving this issue are the following, but I
believe this one is the cleanest:
* turn 'struct ufs_hba::crypto_profile' into a dynamically allocated
pointer, in which case it doesn't matter if cleanup runs after
scsi_host_put()
* add an explicit devm_blk_crypto_profile_deinit() to be called by API
users when necessary, e.g. before ufshcd_dealloc_host() in this case
* register the crypto cleanup handler against the scsi-host device
instead, like in v1 of this patch
---
drivers/ufs/core/ufshcd.c | 31 +++++++++++++++++++++----------
drivers/ufs/host/ufshcd-pci.c | 2 --
drivers/ufs/host/ufshcd-pltfrm.c | 28 +++++++++-------------------
include/ufs/ufshcd.h | 1 -
4 files changed, 30 insertions(+), 32 deletions(-)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 43ddae7318cb..4328f769a7c8 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -10279,16 +10279,6 @@ int ufshcd_system_thaw(struct device *dev)
EXPORT_SYMBOL_GPL(ufshcd_system_thaw);
#endif /* CONFIG_PM_SLEEP */
-/**
- * ufshcd_dealloc_host - deallocate Host Bus Adapter (HBA)
- * @hba: pointer to Host Bus Adapter (HBA)
- */
-void ufshcd_dealloc_host(struct ufs_hba *hba)
-{
- scsi_host_put(hba->host);
-}
-EXPORT_SYMBOL_GPL(ufshcd_dealloc_host);
-
/**
* ufshcd_set_dma_mask - Set dma mask based on the controller
* addressing capability
@@ -10307,12 +10297,26 @@ static int ufshcd_set_dma_mask(struct ufs_hba *hba)
return dma_set_mask_and_coherent(hba->dev, DMA_BIT_MASK(32));
}
+/**
+ * ufshcd_devres_release - devres cleanup handler, invoked during release of
+ * hba->dev
+ * @host: pointer to SCSI host
+ */
+static void ufshcd_devres_release(void *host)
+{
+ scsi_host_put(host);
+}
+
/**
* ufshcd_alloc_host - allocate Host Bus Adapter (HBA)
* @dev: pointer to device handle
* @hba_handle: driver private handle
*
* Return: 0 on success, non-zero value on failure.
+ *
+ * NOTE: There is no corresponding ufshcd_dealloc_host() because this function
+ * keeps track of its allocations using devres and deallocates everything on
+ * device removal automatically.
*/
int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle)
{
@@ -10334,6 +10338,13 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle)
err = -ENOMEM;
goto out_error;
}
+
+ err = devm_add_action_or_reset(dev, ufshcd_devres_release,
+ host);
+ if (err)
+ return dev_err_probe(dev, err,
+ "failed to add ufshcd dealloc action\n");
+
host->nr_maps = HCTX_TYPE_POLL + 1;
hba = shost_priv(host);
hba->host = host;
diff --git a/drivers/ufs/host/ufshcd-pci.c b/drivers/ufs/host/ufshcd-pci.c
index ea39c5d5b8cf..9cfcaad23cf9 100644
--- a/drivers/ufs/host/ufshcd-pci.c
+++ b/drivers/ufs/host/ufshcd-pci.c
@@ -562,7 +562,6 @@ static void ufshcd_pci_remove(struct pci_dev *pdev)
pm_runtime_forbid(&pdev->dev);
pm_runtime_get_noresume(&pdev->dev);
ufshcd_remove(hba);
- ufshcd_dealloc_host(hba);
}
/**
@@ -605,7 +604,6 @@ ufshcd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
err = ufshcd_init(hba, mmio_base, pdev->irq);
if (err) {
dev_err(&pdev->dev, "Initialization failed\n");
- ufshcd_dealloc_host(hba);
return err;
}
diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c
index 505572d4fa87..ffe5d1d2b215 100644
--- a/drivers/ufs/host/ufshcd-pltfrm.c
+++ b/drivers/ufs/host/ufshcd-pltfrm.c
@@ -465,21 +465,17 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
struct device *dev = &pdev->dev;
mmio_base = devm_platform_ioremap_resource(pdev, 0);
- if (IS_ERR(mmio_base)) {
- err = PTR_ERR(mmio_base);
- goto out;
- }
+ if (IS_ERR(mmio_base))
+ return PTR_ERR(mmio_base);
irq = platform_get_irq(pdev, 0);
- if (irq < 0) {
- err = irq;
- goto out;
- }
+ if (irq < 0)
+ return irq;
err = ufshcd_alloc_host(dev, &hba);
if (err) {
dev_err(dev, "Allocation failed\n");
- goto out;
+ return err;
}
hba->vops = vops;
@@ -488,13 +484,13 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
if (err) {
dev_err(dev, "%s: clock parse failed %d\n",
__func__, err);
- goto dealloc_host;
+ return err;
}
err = ufshcd_parse_regulator_info(hba);
if (err) {
dev_err(dev, "%s: regulator init failed %d\n",
__func__, err);
- goto dealloc_host;
+ return err;
}
ufshcd_init_lanes_per_dir(hba);
@@ -502,25 +498,20 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
err = ufshcd_parse_operating_points(hba);
if (err) {
dev_err(dev, "%s: OPP parse failed %d\n", __func__, err);
- goto dealloc_host;
+ return err;
}
err = ufshcd_init(hba, mmio_base, irq);
if (err) {
dev_err_probe(dev, err, "Initialization failed with error %d\n",
err);
- goto dealloc_host;
+ return err;
}
pm_runtime_set_active(dev);
pm_runtime_enable(dev);
return 0;
-
-dealloc_host:
- ufshcd_dealloc_host(hba);
-out:
- return err;
}
EXPORT_SYMBOL_GPL(ufshcd_pltfrm_init);
@@ -534,7 +525,6 @@ void ufshcd_pltfrm_remove(struct platform_device *pdev)
pm_runtime_get_sync(&pdev->dev);
ufshcd_remove(hba);
- ufshcd_dealloc_host(hba);
pm_runtime_disable(&pdev->dev);
pm_runtime_put_noidle(&pdev->dev);
}
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index da0fa5c65081..58eb6e897827 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1311,7 +1311,6 @@ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg)
void ufshcd_enable_irq(struct ufs_hba *hba);
void ufshcd_disable_irq(struct ufs_hba *hba);
int ufshcd_alloc_host(struct device *, struct ufs_hba **);
-void ufshcd_dealloc_host(struct ufs_hba *);
int ufshcd_hba_enable(struct ufs_hba *hba);
int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int);
int ufshcd_link_recovery(struct ufs_hba *hba);
---
base-commit: 4e16367cfe0ce395f29d0482b78970cce8e1db73
change-id: 20250113-ufshcd-fix-52409f2d32ff
Best regards,
--
André Draszik <andre.draszik(a)linaro.org>
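The fix leans on the devres ordering guarantee: resources and actions
are released in reverse (LIFO) order of registration, so registering
the scsi_host_put() action before devm_blk_crypto_profile_init() runs
ensures the crypto cleanup executes while struct ufs_hba is still
alive. A sketch of that guarantee, with illustrative names only:

#include <linux/device.h>

static void teardown_host(void *host)
{
	/* Runs second on release, like the scsi_host_put() action. */
}

static void teardown_crypto(void *data)
{
	/* Runs first on release, like the crypto profile cleanup. */
}

static int example_probe(struct device *dev)
{
	int err;

	/* Registered first => released last. */
	err = devm_add_action_or_reset(dev, teardown_host, NULL);
	if (err)
		return err;

	/* Registered second => released first. */
	return devm_add_action_or_reset(dev, teardown_crypto, NULL);
}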
From: Rik van Riel <riel(a)fb.com>
[ Upstream commit 6db2526c1d694c91c6e05e2f186c085e9460f202 ]
Setting and clearing CPU bits in the mm_cpumask is only ever done
by the CPU itself, from the context switch code or the TLB flush
code.
Synchronization is handled by switch_mm_irqs_off() blocking interrupts.
Sending TLB flush IPIs to CPUs that are in the mm_cpumask, but no
longer running the program causes a regression in the will-it-scale
tlbflush2 test. This test is contrived, but a large regression here
might cause a small regression in some real world workload.
Instead of always sending IPIs to CPUs that are in the mm_cpumask,
but no longer running the program, send these IPIs only once a second.
The rest of the time we can skip over CPUs where the loaded_mm is
different from the target mm.
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Link: https://lore.kernel.org/r/20241204210316.612ee573@fangorn
Closes: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/x86/include/asm/mmu.h | 2 ++
arch/x86/include/asm/mmu_context.h | 1 +
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++---
4 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea95..c07c018a1c139 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -33,6 +33,8 @@ typedef struct {
*/
atomic64_t tlb_gen;
+ unsigned long next_trim_cpumask;
+
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct rw_semaphore ldt_usr_sem;
struct ldt_struct *ldt;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 27516046117a3..1d11aeb597542 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -106,6 +106,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
+ mm->context.next_trim_cpumask = jiffies + HZ;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index b587a9ee9cb25..22b93e35fa886 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -207,6 +207,7 @@ struct flush_tlb_info {
unsigned int initiating_cpu;
u8 stride_shift;
u8 freed_tables;
+ u8 trim_cpumask;
};
void flush_tlb_local(void);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 511172d70825c..19d083ad2de79 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -854,9 +854,36 @@ static void flush_tlb_func(void *info)
nr_invalidate);
}
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool should_flush_tlb(int cpu, void *data)
{
- return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+ struct flush_tlb_info *info = data;
+
+ /* Lazy TLB will get flushed at the next context switch. */
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ return false;
+
+ /* No mm means kernel memory flush. */
+ if (!info->mm)
+ return true;
+
+ /* The target mm is loaded, and the CPU is not lazy. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ return true;
+
+ /* In cpumask, but not the loaded mm? Periodically remove by flushing. */
+ if (info->trim_cpumask)
+ return true;
+
+ return false;
+}
+
+static bool should_trim_cpumask(struct mm_struct *mm)
+{
+ if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
+ WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
+ return true;
+ }
+ return false;
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
@@ -890,7 +917,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
if (info->freed_tables)
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+ on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
(void *)info, 1, cpumask);
}
@@ -941,6 +968,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
+ info->trim_cpumask = 0;
return info;
}
@@ -983,6 +1011,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
* flush_tlb_func_local() directly in this case.
*/
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
--
2.39.5
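The once-per-second trimming above is the standard jiffies rate-limit
pattern: compare against a stored deadline with time_after() and push
the deadline forward by HZ when it fires. A generic sketch with an
illustrative 'next_event' variable (in the patch the deadline lives in
mm->context.next_trim_cpumask, hence the READ_ONCE()/WRITE_ONCE()
annotations for concurrent readers):

#include <linux/compiler.h>
#include <linux/jiffies.h>

static unsigned long next_event;

static bool should_do_expensive_work(void)
{
	if (time_after(jiffies, READ_ONCE(next_event))) {
		/* Fires at most once per second (HZ jiffies). */
		WRITE_ONCE(next_event, jiffies + HZ);
		return true;
	}

	return false;
}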
From: Rik van Riel <riel(a)fb.com>
[ Upstream commit 6db2526c1d694c91c6e05e2f186c085e9460f202 ]
Setting and clearing CPU bits in the mm_cpumask is only ever done
by the CPU itself, from the context switch code or the TLB flush
code.
Synchronization is handled by switch_mm_irqs_off() blocking interrupts.
Sending TLB flush IPIs to CPUs that are in the mm_cpumask, but no
longer running the program causes a regression in the will-it-scale
tlbflush2 test. This test is contrived, but a large regression here
might cause a small regression in some real world workload.
Instead of always sending IPIs to CPUs that are in the mm_cpumask,
but no longer running the program, send these IPIs only once a second.
The rest of the time we can skip over CPUs where the loaded_mm is
different from the target mm.
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Link: https://lore.kernel.org/r/20241204210316.612ee573@fangorn
Closes: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/x86/include/asm/mmu.h | 2 ++
arch/x86/include/asm/mmu_context.h | 1 +
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++---
4 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea95..c07c018a1c139 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -33,6 +33,8 @@ typedef struct {
*/
atomic64_t tlb_gen;
+ unsigned long next_trim_cpumask;
+
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct rw_semaphore ldt_usr_sem;
struct ldt_struct *ldt;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index b8d40ddeab00f..6f5c7584fe1e3 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -106,6 +106,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
+ mm->context.next_trim_cpumask = jiffies + HZ;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cda3118f3b27d..d1eb6bbfd39e3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -208,6 +208,7 @@ struct flush_tlb_info {
unsigned int initiating_cpu;
u8 stride_shift;
u8 freed_tables;
+ u8 trim_cpumask;
};
void flush_tlb_local(void);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c1e31e9a85d76..b07e2167fcebf 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -878,9 +878,36 @@ static void flush_tlb_func(void *info)
nr_invalidate);
}
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool should_flush_tlb(int cpu, void *data)
{
- return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+ struct flush_tlb_info *info = data;
+
+ /* Lazy TLB will get flushed at the next context switch. */
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ return false;
+
+ /* No mm means kernel memory flush. */
+ if (!info->mm)
+ return true;
+
+ /* The target mm is loaded, and the CPU is not lazy. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ return true;
+
+ /* In cpumask, but not the loaded mm? Periodically remove by flushing. */
+ if (info->trim_cpumask)
+ return true;
+
+ return false;
+}
+
+static bool should_trim_cpumask(struct mm_struct *mm)
+{
+ if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
+ WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
+ return true;
+ }
+ return false;
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
@@ -914,7 +941,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
if (info->freed_tables)
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+ on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
(void *)info, 1, cpumask);
}
@@ -965,6 +992,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
+ info->trim_cpumask = 0;
return info;
}
@@ -1007,6 +1035,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
* flush_tlb_func_local() directly in this case.
*/
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
--
2.39.5
From: Rik van Riel <riel(a)fb.com>
[ Upstream commit 6db2526c1d694c91c6e05e2f186c085e9460f202 ]
Setting and clearing CPU bits in the mm_cpumask is only ever done
by the CPU itself, from the context switch code or the TLB flush
code.
Synchronization is handled by switch_mm_irqs_off() blocking interrupts.
Sending TLB flush IPIs to CPUs that are in the mm_cpumask, but no
longer running the program causes a regression in the will-it-scale
tlbflush2 test. This test is contrived, but a large regression here
might cause a small regression in some real world workload.
Instead of always sending IPIs to CPUs that are in the mm_cpumask,
but no longer running the program, send these IPIs only once a second.
The rest of the time we can skip over CPUs where the loaded_mm is
different from the target mm.
Reported-by: kernel test robot <oliver.sang(a)intel.com>
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Link: https://lore.kernel.org/r/20241204210316.612ee573@fangorn
Closes: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/x86/include/asm/mmu.h | 2 ++
arch/x86/include/asm/mmu_context.h | 1 +
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++---
4 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 0da5c227f490c..53763cf192777 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -37,6 +37,8 @@ typedef struct {
*/
atomic64_t tlb_gen;
+ unsigned long next_trim_cpumask;
+
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct rw_semaphore ldt_usr_sem;
struct ldt_struct *ldt;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8dac45a2c7fcf..f5afd956d5e50 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -145,6 +145,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
+ mm->context.next_trim_cpumask = jiffies + HZ;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 25726893c6f4d..5d61adc6e892e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -222,6 +222,7 @@ struct flush_tlb_info {
unsigned int initiating_cpu;
u8 stride_shift;
u8 freed_tables;
+ u8 trim_cpumask;
};
void flush_tlb_local(void);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 64f594826a282..df1794a5e38a5 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -898,9 +898,36 @@ static void flush_tlb_func(void *info)
nr_invalidate);
}
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool should_flush_tlb(int cpu, void *data)
{
- return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+ struct flush_tlb_info *info = data;
+
+ /* Lazy TLB will get flushed at the next context switch. */
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ return false;
+
+ /* No mm means kernel memory flush. */
+ if (!info->mm)
+ return true;
+
+ /* The target mm is loaded, and the CPU is not lazy. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ return true;
+
+ /* In cpumask, but not the loaded mm? Periodically remove by flushing. */
+ if (info->trim_cpumask)
+ return true;
+
+ return false;
+}
+
+static bool should_trim_cpumask(struct mm_struct *mm)
+{
+ if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
+ WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
+ return true;
+ }
+ return false;
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
@@ -934,7 +961,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
if (info->freed_tables)
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+ on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
(void *)info, 1, cpumask);
}
@@ -985,6 +1012,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
+ info->trim_cpumask = 0;
return info;
}
@@ -1027,6 +1055,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
* flush_tlb_func_local() directly in this case.
*/
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
--
2.39.5