The patch below does not apply to the v5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 3cea8af2d1a9ae5869b47c3dabe3b20f331f3bbd Mon Sep 17 00:00:00 2001
From: Gil Fine <gil.fine(a)linux.intel.com>
Date: Thu, 10 Oct 2024 17:29:42 +0300
Subject: [PATCH] thunderbolt: Honor TMU requirements in the domain when
setting TMU mode
Currently, when configuring TMU (Time Management Unit) mode of a given
router, we take into account only its own TMU requirements ignoring
other routers in the domain. This is problematic if the router we are
configuring has lower TMU requirements than what is already configured
in the domain.
In the scenario below, we have a host router with two USB4 ports: A and
B. Port A connected to device router #1 (which supports CL states) and
existing DisplayPort tunnel, thus, the TMU mode is HiFi uni-directional.
1. Initial topology
[Host]
A/
/
[Device #1]
/
Monitor
2. Plug in device #2 (that supports CL states) to downstream port B of
the host router
[Host]
A/ B\
/ \
[Device #1] [Device #2]
/
Monitor
The TMU mode on port B and port A will be configured to LowRes which is
not what we want and will cause monitor to start flickering.
To address this we first scan the domain and search for any router
configured to HiFi uni-directional mode, and if found, configure TMU
mode of the given router to HiFi uni-directional as well.
Cc: stable(a)vger.kernel.org
Signed-off-by: Gil Fine <gil.fine(a)linux.intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
---
drivers/thunderbolt/tb.c | 48 +++++++++++++++++++++++++++++++++++-----
1 file changed, 42 insertions(+), 6 deletions(-)
diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
index 10e719dd837ce..4f777788e9179 100644
--- a/drivers/thunderbolt/tb.c
+++ b/drivers/thunderbolt/tb.c
@@ -288,6 +288,24 @@ static void tb_increase_tmu_accuracy(struct tb_tunnel *tunnel)
device_for_each_child(&sw->dev, NULL, tb_increase_switch_tmu_accuracy);
}
+static int tb_switch_tmu_hifi_uni_required(struct device *dev, void *not_used)
+{
+ struct tb_switch *sw = tb_to_switch(dev);
+
+ if (sw && tb_switch_tmu_is_enabled(sw) &&
+ tb_switch_tmu_is_configured(sw, TB_SWITCH_TMU_MODE_HIFI_UNI))
+ return 1;
+
+ return device_for_each_child(dev, NULL,
+ tb_switch_tmu_hifi_uni_required);
+}
+
+static bool tb_tmu_hifi_uni_required(struct tb *tb)
+{
+ return device_for_each_child(&tb->dev, NULL,
+ tb_switch_tmu_hifi_uni_required) == 1;
+}
+
static int tb_enable_tmu(struct tb_switch *sw)
{
int ret;
@@ -302,12 +320,30 @@ static int tb_enable_tmu(struct tb_switch *sw)
ret = tb_switch_tmu_configure(sw,
TB_SWITCH_TMU_MODE_MEDRES_ENHANCED_UNI);
if (ret == -EOPNOTSUPP) {
- if (tb_switch_clx_is_enabled(sw, TB_CL1))
- ret = tb_switch_tmu_configure(sw,
- TB_SWITCH_TMU_MODE_LOWRES);
- else
- ret = tb_switch_tmu_configure(sw,
- TB_SWITCH_TMU_MODE_HIFI_BI);
+ if (tb_switch_clx_is_enabled(sw, TB_CL1)) {
+ /*
+ * Figure out uni-directional HiFi TMU requirements
+ * currently in the domain. If there are no
+ * uni-directional HiFi requirements we can put the TMU
+ * into LowRes mode.
+ *
+ * Deliberately skip bi-directional HiFi links
+ * as these work independently of other links
+ * (and they do not allow any CL states anyway).
+ */
+ if (tb_tmu_hifi_uni_required(sw->tb))
+ ret = tb_switch_tmu_configure(sw,
+ TB_SWITCH_TMU_MODE_HIFI_UNI);
+ else
+ ret = tb_switch_tmu_configure(sw,
+ TB_SWITCH_TMU_MODE_LOWRES);
+ } else {
+ ret = tb_switch_tmu_configure(sw, TB_SWITCH_TMU_MODE_HIFI_BI);
+ }
+
+ /* If not supported, fallback to bi-directional HiFi */
+ if (ret == -EOPNOTSUPP)
+ ret = tb_switch_tmu_configure(sw, TB_SWITCH_TMU_MODE_HIFI_BI);
}
if (ret)
return ret;
--
2.43.0
The patch below does not apply to the v5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 3e8b7238b427e05498034c240451af5f5495afda Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro(a)kernel.org>
Date: Mon, 28 Oct 2024 13:49:58 +0100
Subject: [PATCH] gpiolib: fix debugfs newline separators
The gpiolib debugfs interface exports a list of all gpio chips in a
system and the state of their pins.
The gpio chip sections are supposed to be separated by a newline
character, but a long-standing bug prevents the separator from
being included when output is generated in multiple sessions, making the
output inconsistent and hard to read.
Make sure to only suppress the newline separator at the beginning of the
file as intended.
Fixes: f9c4a31f6150 ("gpiolib: Use seq_file's iterator interface")
Cc: stable(a)vger.kernel.org # 3.7
Cc: Thierry Reding <treding(a)nvidia.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
Link: https://lore.kernel.org/r/20241028125000.24051-2-johan+linaro@kernel.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
---
drivers/gpio/gpiolib.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index d5952ab7752c2..e27488a90bc97 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -4926,6 +4926,8 @@ static void *gpiolib_seq_start(struct seq_file *s, loff_t *pos)
return NULL;
s->private = priv;
+ if (*pos > 0)
+ priv->newline = true;
priv->idx = srcu_read_lock(&gpio_devices_srcu);
list_for_each_entry_srcu(gdev, &gpio_devices, list,
--
2.43.0
The patch below does not apply to the v5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 4adf613e01bf99e1739f6ff3e162ad5b7d578d1a Mon Sep 17 00:00:00 2001
From: Alexander Usyskin <alexander.usyskin(a)intel.com>
Date: Tue, 15 Oct 2024 15:31:57 +0300
Subject: [PATCH] mei: use kvmalloc for read buffer
Read buffer is allocated according to max message size, reported by
the firmware and may reach 64K in systems with pxp client.
Contiguous 64k allocation may fail under memory pressure.
Read buffer is used as in-driver message storage and not required
to be contiguous.
Use kvmalloc to allow kernel to allocate non-contiguous memory.
Fixes: 3030dc056459 ("mei: add wrapper for queuing control commands.")
Cc: stable <stable(a)kernel.org>
Reported-by: Rohit Agarwal <rohiagar(a)chromium.org>
Closes: https://lore.kernel.org/all/20240813084542.2921300-1-rohiagar@chromium.org/
Tested-by: Brian Geffon <bgeffon(a)google.com>
Signed-off-by: Alexander Usyskin <alexander.usyskin(a)intel.com>
Acked-by: Tomas Winkler <tomasw(a)gmail.com>
Link: https://lore.kernel.org/r/20241015123157.2337026-1-alexander.usyskin@intel.…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/misc/mei/client.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c
index 9d090fa07516f..be011cef12e5d 100644
--- a/drivers/misc/mei/client.c
+++ b/drivers/misc/mei/client.c
@@ -321,7 +321,7 @@ void mei_io_cb_free(struct mei_cl_cb *cb)
return;
list_del(&cb->list);
- kfree(cb->buf.data);
+ kvfree(cb->buf.data);
kfree(cb->ext_hdr);
kfree(cb);
}
@@ -497,7 +497,7 @@ struct mei_cl_cb *mei_cl_alloc_cb(struct mei_cl *cl, size_t length,
if (length == 0)
return cb;
- cb->buf.data = kmalloc(roundup(length, MEI_SLOT_SIZE), GFP_KERNEL);
+ cb->buf.data = kvmalloc(roundup(length, MEI_SLOT_SIZE), GFP_KERNEL);
if (!cb->buf.data) {
mei_io_cb_free(cb);
return NULL;
--
2.43.0
The patch below does not apply to the v5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 41e192ad2779cae0102879612dfe46726e4396aa Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Fri, 18 Oct 2024 04:33:10 +0900
Subject: [PATCH] nilfs2: fix kernel bug due to missing clearing of checked
flag
Syzbot reported that in directory operations after nilfs2 detects
filesystem corruption and degrades to read-only,
__block_write_begin_int(), which is called to prepare block writes, may
fail the BUG_ON check for accesses exceeding the folio/page size,
triggering a kernel bug.
This was found to be because the "checked" flag of a page/folio was not
cleared when it was discarded by nilfs2's own routine, which causes the
sanity check of directory entries to be skipped when the directory
page/folio is reloaded. So, fix that.
This was necessary when the use of nilfs2's own page discard routine was
applied to more than just metadata files.
Link: https://lkml.kernel.org/r/20241017193359.5051-1-konishi.ryusuke@gmail.com
Fixes: 8c26c4e2694a ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+d6ca2daf692c7a82f959(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d6ca2daf692c7a82f959
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/page.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5436eb0424bd1..10def4b559956 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio)
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
+ folio_clear_checked(folio);
head = folio_buffers(folio);
if (head) {
--
2.43.0
The patch below does not apply to the v5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 281dd25c1a018261a04d1b8bf41a0674000bfe38 Mon Sep 17 00:00:00 2001
From: Matt Fleming <mfleming(a)cloudflare.com>
Date: Fri, 11 Oct 2024 13:07:37 +0100
Subject: [PATCH] mm/page_alloc: let GFP_ATOMIC order-0 allocs access
highatomic reserves
Under memory pressure it's possible for GFP_ATOMIC order-0 allocations to
fail even though free pages are available in the highatomic reserves.
GFP_ATOMIC allocations cannot trigger unreserve_highatomic_pageblock()
since it's only run from reclaim.
Given that such allocations will pass the watermarks in
__zone_watermark_unusable_free(), it makes sense to fallback to highatomic
reserves the same way that ALLOC_OOM can.
This fixes order-0 page allocation failures observed on Cloudflare's fleet
when handling network packets:
kswapd1: page allocation failure: order:0, mode:0x820(GFP_ATOMIC),
nodemask=(null),cpuset=/,mems_allowed=0-7
CPU: 10 PID: 696 Comm: kswapd1 Kdump: loaded Tainted: G O 6.6.43-CUSTOM #1
Hardware name: MACHINE
Call Trace:
<IRQ>
dump_stack_lvl+0x3c/0x50
warn_alloc+0x13a/0x1c0
__alloc_pages_slowpath.constprop.0+0xc9d/0xd10
__alloc_pages+0x327/0x340
__napi_alloc_skb+0x16d/0x1f0
bnxt_rx_page_skb+0x96/0x1b0 [bnxt_en]
bnxt_rx_pkt+0x201/0x15e0 [bnxt_en]
__bnxt_poll_work+0x156/0x2b0 [bnxt_en]
bnxt_poll+0xd9/0x1c0 [bnxt_en]
__napi_poll+0x2b/0x1b0
bpf_trampoline_6442524138+0x7d/0x1000
__napi_poll+0x5/0x1b0
net_rx_action+0x342/0x740
handle_softirqs+0xcf/0x2b0
irq_exit_rcu+0x6c/0x90
sysvec_apic_timer_interrupt+0x72/0x90
</IRQ>
[mfleming(a)cloudflare.com: update comment]
Link: https://lkml.kernel.org/r/20241015125158.3597702-1-matt@readmodwrite.com
Link: https://lkml.kernel.org/r/20241011120737.3300370-1-matt@readmodwrite.com
Link: https://lore.kernel.org/all/CAGis_TWzSu=P7QJmjD58WWiu3zjMTVKSzdOwWE8ORaGytz…
Fixes: 1d91df85f399 ("mm/page_alloc: handle a missing case for memalloc_nocma_{save/restore} APIs")
Signed-off-by: Matt Fleming <mfleming(a)cloudflare.com>
Suggested-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Mel Gorman <mgorman(a)techsingularity.net>
Cc: Michal Hocko <mhocko(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8afab64814dc4..94a2ffe280089 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2893,12 +2893,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
page = __rmqueue(zone, order, migratetype, alloc_flags);
/*
- * If the allocation fails, allow OOM handling access
- * to HIGHATOMIC reserves as failing now is worse than
- * failing a high-order atomic allocation in the
- * future.
+ * If the allocation fails, allow OOM handling and
+ * order-0 (atomic) allocs access to HIGHATOMIC
+ * reserves as failing now is worse than failing a
+ * high-order atomic allocation in the future.
*/
- if (!page && (alloc_flags & ALLOC_OOM))
+ if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
--
2.43.0
The patch below does not apply to the v5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 117932eea99b729ee5d12783601a4f7f5fd58a23 Mon Sep 17 00:00:00 2001
From: Chen Ridong <chenridong(a)huawei.com>
Date: Tue, 8 Oct 2024 11:24:56 +0000
Subject: [PATCH] cgroup/bpf: use a dedicated workqueue for cgroup bpf
destruction
A hung_task problem shown below was found:
INFO: task kworker/0:0:8 blocked for more than 327 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Workqueue: events cgroup_bpf_release
Call Trace:
<TASK>
__schedule+0x5a2/0x2050
? find_held_lock+0x33/0x100
? wq_worker_sleeping+0x9e/0xe0
schedule+0x9f/0x180
schedule_preempt_disabled+0x25/0x50
__mutex_lock+0x512/0x740
? cgroup_bpf_release+0x1e/0x4d0
? cgroup_bpf_release+0xcf/0x4d0
? process_scheduled_works+0x161/0x8a0
? cgroup_bpf_release+0x1e/0x4d0
? mutex_lock_nested+0x2b/0x40
? __pfx_delay_tsc+0x10/0x10
mutex_lock_nested+0x2b/0x40
cgroup_bpf_release+0xcf/0x4d0
? process_scheduled_works+0x161/0x8a0
? trace_event_raw_event_workqueue_execute_start+0x64/0xd0
? process_scheduled_works+0x161/0x8a0
process_scheduled_works+0x23a/0x8a0
worker_thread+0x231/0x5b0
? __pfx_worker_thread+0x10/0x10
kthread+0x14d/0x1c0
? __pfx_kthread+0x10/0x10
ret_from_fork+0x59/0x70
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1b/0x30
</TASK>
This issue can be reproduced by the following pressuse test:
1. A large number of cpuset cgroups are deleted.
2. Set cpu on and off repeatly.
3. Set watchdog_thresh repeatly.
The scripts can be obtained at LINK mentioned above the signature.
The reason for this issue is cgroup_mutex and cpu_hotplug_lock are
acquired in different tasks, which may lead to deadlock.
It can lead to a deadlock through the following steps:
1. A large number of cpusets are deleted asynchronously, which puts a
large number of cgroup_bpf_release works into system_wq. The max_active
of system_wq is WQ_DFL_ACTIVE(256). Consequently, all active works are
cgroup_bpf_release works, and many cgroup_bpf_release works will be put
into inactive queue. As illustrated in the diagram, there are 256 (in
the acvtive queue) + n (in the inactive queue) works.
2. Setting watchdog_thresh will hold cpu_hotplug_lock.read and put
smp_call_on_cpu work into system_wq. However step 1 has already filled
system_wq, 'sscs.work' is put into inactive queue. 'sscs.work' has
to wait until the works that were put into the inacvtive queue earlier
have executed (n cgroup_bpf_release), so it will be blocked for a while.
3. Cpu offline requires cpu_hotplug_lock.write, which is blocked by step 2.
4. Cpusets that were deleted at step 1 put cgroup_release works into
cgroup_destroy_wq. They are competing to get cgroup_mutex all the time.
When cgroup_metux is acqured by work at css_killed_work_fn, it will
call cpuset_css_offline, which needs to acqure cpu_hotplug_lock.read.
However, cpuset_css_offline will be blocked for step 3.
5. At this moment, there are 256 works in active queue that are
cgroup_bpf_release, they are attempting to acquire cgroup_mutex, and as
a result, all of them are blocked. Consequently, sscs.work can not be
executed. Ultimately, this situation leads to four processes being
blocked, forming a deadlock.
system_wq(step1) WatchDog(step2) cpu offline(step3) cgroup_destroy_wq(step4)
...
2000+ cgroups deleted asyn
256 actives + n inactives
__lockup_detector_reconfigure
P(cpu_hotplug_lock.read)
put sscs.work into system_wq
256 + n + 1(sscs.work)
sscs.work wait to be executed
warting sscs.work finish
percpu_down_write
P(cpu_hotplug_lock.write)
...blocking...
css_killed_work_fn
P(cgroup_mutex)
cpuset_css_offline
P(cpu_hotplug_lock.read)
...blocking...
256 cgroup_bpf_release
mutex_lock(&cgroup_mutex);
..blocking...
To fix the problem, place cgroup_bpf_release works on a dedicated
workqueue which can break the loop and solve the problem. System wqs are
for misc things which shouldn't create a large number of concurrent work
items. If something is going to generate >WQ_DFL_ACTIVE(256) concurrent
work items, it should use its own dedicated workqueue.
Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself")
Cc: stable(a)vger.kernel.org # v5.3+
Link: https://lore.kernel.org/cgroups/e90c32d2-2a85-4f28-9154-09c7d320cb60@huawei…
Tested-by: Vishal Chourasia <vishalc(a)linux.ibm.com>
Signed-off-by: Chen Ridong <chenridong(a)huawei.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
---
kernel/bpf/cgroup.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index e7113d700b878..025d7e2214aeb 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -24,6 +24,23 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+/*
+ * cgroup bpf destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup bpf
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_bpf_destroy_wq;
+
+static int __init cgroup_bpf_wq_init(void)
+{
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
+ if (!cgroup_bpf_destroy_wq)
+ panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
+ return 0;
+}
+core_initcall(cgroup_bpf_wq_init);
+
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
- queue_work(system_wq, &cgrp->bpf.release_work);
+ queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
--
2.43.0
The patch below does not apply to the v5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 4413665dd6c528b31284119e3571c25f371e1c36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Sch=C3=A4r?= <jan(a)jschaer.ch>
Date: Tue, 29 Oct 2024 23:12:49 +0100
Subject: [PATCH] ALSA: usb-audio: Add quirks for Dell WD19 dock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The WD19 family of docks has the same audio chipset as the WD15. This
change enables jack detection on the WD19.
We don't need the dell_dock_mixer_init quirk for the WD19. It is only
needed because of the dell_alc4020_map quirk for the WD15 in
mixer_maps.c, which disables the volume controls. Even for the WD15,
this quirk was apparently only needed when the dock firmware was not
updated.
Signed-off-by: Jan Schär <jan(a)jschaer.ch>
Cc: <stable(a)vger.kernel.org>
Link: https://patch.msgid.link/20241029221249.15661-1-jan@jschaer.ch
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
---
sound/usb/mixer_quirks.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c
index 2a9594f34dac6..6456e87e2f397 100644
--- a/sound/usb/mixer_quirks.c
+++ b/sound/usb/mixer_quirks.c
@@ -4042,6 +4042,9 @@ int snd_usb_mixer_apply_create_quirk(struct usb_mixer_interface *mixer)
break;
err = dell_dock_mixer_init(mixer);
break;
+ case USB_ID(0x0bda, 0x402e): /* Dell WD19 dock */
+ err = dell_dock_mixer_create(mixer);
+ break;
case USB_ID(0x2a39, 0x3fd2): /* RME ADI-2 Pro */
case USB_ID(0x2a39, 0x3fd3): /* RME ADI-2 DAC */
--
2.43.0
The patch below does not apply to the v5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From aec8e6bf839101784f3ef037dcdb9432c3f32343 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
Date: Mon, 21 Oct 2024 22:02:15 +0800
Subject: [PATCH] btrfs: fix use-after-free of block device file in
__btrfs_free_extra_devids()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Mounting btrfs from two images (which have the same one fsid and two
different dev_uuids) in certain executing order may trigger an UAF for
variable 'device->bdev_file' in __btrfs_free_extra_devids(). And
following are the details:
1. Attach image_1 to loop0, attach image_2 to loop1, and scan btrfs
devices by ioctl(BTRFS_IOC_SCAN_DEV):
/ btrfs_device_1 → loop0
fs_device
\ btrfs_device_2 → loop1
2. mount /dev/loop0 /mnt
btrfs_open_devices
btrfs_device_1->bdev_file = btrfs_get_bdev_and_sb(loop0)
btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
btrfs_fill_super
open_ctree
fail: btrfs_close_devices // -ENOMEM
btrfs_close_bdev(btrfs_device_1)
fput(btrfs_device_1->bdev_file)
// btrfs_device_1->bdev_file is freed
btrfs_close_bdev(btrfs_device_2)
fput(btrfs_device_2->bdev_file)
3. mount /dev/loop1 /mnt
btrfs_open_devices
btrfs_get_bdev_and_sb(&bdev_file)
// EIO, btrfs_device_1->bdev_file is not assigned,
// which points to a freed memory area
btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
btrfs_fill_super
open_ctree
btrfs_free_extra_devids
if (btrfs_device_1->bdev_file)
fput(btrfs_device_1->bdev_file) // UAF !
Fix it by setting 'device->bdev_file' as 'NULL' after closing the
btrfs_device in btrfs_close_one_device().
Fixes: 142388194191 ("btrfs: do not background blkdev_put()")
CC: stable(a)vger.kernel.org # 4.19+
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219408
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/volumes.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8f340ad1d9384..eb51b609190fb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1105,6 +1105,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
+ device->bdev_file = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
--
2.43.0
The patch below does not apply to the v5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From d949d1d14fa281ace388b1de978e8f2cd52875cf Mon Sep 17 00:00:00 2001
From: Jeongjun Park <aha310510(a)gmail.com>
Date: Mon, 9 Sep 2024 21:35:58 +0900
Subject: [PATCH] mm: shmem: fix data-race in shmem_getattr()
I got the following KCSAN report during syzbot testing:
==================================================================
BUG: KCSAN: data-race in generic_fillattr / inode_set_ctime_current
write to 0xffff888102eb3260 of 4 bytes by task 6565 on cpu 1:
inode_set_ctime_to_ts include/linux/fs.h:1638 [inline]
inode_set_ctime_current+0x169/0x1d0 fs/inode.c:2626
shmem_mknod+0x117/0x180 mm/shmem.c:3443
shmem_create+0x34/0x40 mm/shmem.c:3497
lookup_open fs/namei.c:3578 [inline]
open_last_lookups fs/namei.c:3647 [inline]
path_openat+0xdbc/0x1f00 fs/namei.c:3883
do_filp_open+0xf7/0x200 fs/namei.c:3913
do_sys_openat2+0xab/0x120 fs/open.c:1416
do_sys_open fs/open.c:1431 [inline]
__do_sys_openat fs/open.c:1447 [inline]
__se_sys_openat fs/open.c:1442 [inline]
__x64_sys_openat+0xf3/0x120 fs/open.c:1442
x64_sys_call+0x1025/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:258
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x76/0x7e
read to 0xffff888102eb3260 of 4 bytes by task 3498 on cpu 0:
inode_get_ctime_nsec include/linux/fs.h:1623 [inline]
inode_get_ctime include/linux/fs.h:1629 [inline]
generic_fillattr+0x1dd/0x2f0 fs/stat.c:62
shmem_getattr+0x17b/0x200 mm/shmem.c:1157
vfs_getattr_nosec fs/stat.c:166 [inline]
vfs_getattr+0x19b/0x1e0 fs/stat.c:207
vfs_statx_path fs/stat.c:251 [inline]
vfs_statx+0x134/0x2f0 fs/stat.c:315
vfs_fstatat+0xec/0x110 fs/stat.c:341
__do_sys_newfstatat fs/stat.c:505 [inline]
__se_sys_newfstatat+0x58/0x260 fs/stat.c:499
__x64_sys_newfstatat+0x55/0x70 fs/stat.c:499
x64_sys_call+0x141f/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:263
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x76/0x7e
value changed: 0x2755ae53 -> 0x27ee44d3
Reported by Kernel Concurrency Sanitizer on:
CPU: 0 UID: 0 PID: 3498 Comm: udevd Not tainted 6.11.0-rc6-syzkaller-00326-gd1f2d51b711a-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024
==================================================================
When calling generic_fillattr(), if you don't hold read lock, data-race
will occur in inode member variables, which can cause unexpected
behavior.
Since there is no special protection when shmem_getattr() calls
generic_fillattr(), data-race occurs by functions such as shmem_unlink()
or shmem_mknod(). This can cause unexpected results, so commenting it out
is not enough.
Therefore, when calling generic_fillattr() from shmem_getattr(), it is
appropriate to protect the inode using inode_lock_shared() and
inode_unlock_shared() to prevent data-race.
Link: https://lkml.kernel.org/r/20240909123558.70229-1-aha310510@gmail.com
Fixes: 44a30220bc0a ("shmem: recalculate file inode when fstat")
Signed-off-by: Jeongjun Park <aha310510(a)gmail.com>
Reported-by: syzbot <syzkaller(a)googlegroup.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mm/shmem.c b/mm/shmem.c
index c5adb987b23cf..4ba1d00fabdaa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
+ inode_lock_shared(inode);
generic_fillattr(idmap, request_mask, inode, stat);
+ inode_unlock_shared(inode);
if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 1b6063a57754eae5705753c01e78dc268b989038 Mon Sep 17 00:00:00 2001
From: Ovidiu Bunea <Ovidiu.Bunea(a)amd.com>
Date: Fri, 11 Oct 2024 11:12:19 -0400
Subject: [PATCH] Revert "drm/amd/display: update DML2 policy
EnhancedPrefetchScheduleAccelerationFinal DCN35"
This reverts
commit 9dad21f910fc ("drm/amd/display: update DML2 policy EnhancedPrefetchScheduleAccelerationFinal DCN35")
[why & how]
The offending commit exposes a hang with lid close/open behavior.
Both issues seem to be related to ODM 2:1 mode switching, so there
is another issue generic to that sequence that needs to be
investigated.
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Reviewed-by: Nicholas Kazlauskas <nicholas.kazlauskas(a)amd.com>
Signed-off-by: Ovidiu Bunea <Ovidiu.Bunea(a)amd.com>
Signed-off-by: Tom Chung <chiahsuan.chung(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 68bf95317ebf2cfa7105251e4279e951daceefb7)
Cc: stable(a)vger.kernel.org
---
drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c
index 11c904ae29586..c4c52173ef224 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c
@@ -303,6 +303,7 @@ void build_unoptimized_policy_settings(enum dml_project_id project, struct dml_m
if (project == dml_project_dcn35 ||
project == dml_project_dcn351) {
policy->DCCProgrammingAssumesScanDirectionUnknownFinal = false;
+ policy->EnhancedPrefetchScheduleAccelerationFinal = 0;
policy->AllowForPStateChangeOrStutterInVBlankFinal = dml_prefetch_support_uclk_fclk_and_stutter_if_possible; /*new*/
policy->UseOnlyMaxPrefetchModes = 1;
}
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 3cea8af2d1a9ae5869b47c3dabe3b20f331f3bbd Mon Sep 17 00:00:00 2001
From: Gil Fine <gil.fine(a)linux.intel.com>
Date: Thu, 10 Oct 2024 17:29:42 +0300
Subject: [PATCH] thunderbolt: Honor TMU requirements in the domain when
setting TMU mode
Currently, when configuring TMU (Time Management Unit) mode of a given
router, we take into account only its own TMU requirements ignoring
other routers in the domain. This is problematic if the router we are
configuring has lower TMU requirements than what is already configured
in the domain.
In the scenario below, we have a host router with two USB4 ports: A and
B. Port A connected to device router #1 (which supports CL states) and
existing DisplayPort tunnel, thus, the TMU mode is HiFi uni-directional.
1. Initial topology
[Host]
A/
/
[Device #1]
/
Monitor
2. Plug in device #2 (that supports CL states) to downstream port B of
the host router
[Host]
A/ B\
/ \
[Device #1] [Device #2]
/
Monitor
The TMU mode on port B and port A will be configured to LowRes which is
not what we want and will cause monitor to start flickering.
To address this we first scan the domain and search for any router
configured to HiFi uni-directional mode, and if found, configure TMU
mode of the given router to HiFi uni-directional as well.
Cc: stable(a)vger.kernel.org
Signed-off-by: Gil Fine <gil.fine(a)linux.intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
---
drivers/thunderbolt/tb.c | 48 +++++++++++++++++++++++++++++++++++-----
1 file changed, 42 insertions(+), 6 deletions(-)
diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
index 10e719dd837ce..4f777788e9179 100644
--- a/drivers/thunderbolt/tb.c
+++ b/drivers/thunderbolt/tb.c
@@ -288,6 +288,24 @@ static void tb_increase_tmu_accuracy(struct tb_tunnel *tunnel)
device_for_each_child(&sw->dev, NULL, tb_increase_switch_tmu_accuracy);
}
+static int tb_switch_tmu_hifi_uni_required(struct device *dev, void *not_used)
+{
+ struct tb_switch *sw = tb_to_switch(dev);
+
+ if (sw && tb_switch_tmu_is_enabled(sw) &&
+ tb_switch_tmu_is_configured(sw, TB_SWITCH_TMU_MODE_HIFI_UNI))
+ return 1;
+
+ return device_for_each_child(dev, NULL,
+ tb_switch_tmu_hifi_uni_required);
+}
+
+static bool tb_tmu_hifi_uni_required(struct tb *tb)
+{
+ return device_for_each_child(&tb->dev, NULL,
+ tb_switch_tmu_hifi_uni_required) == 1;
+}
+
static int tb_enable_tmu(struct tb_switch *sw)
{
int ret;
@@ -302,12 +320,30 @@ static int tb_enable_tmu(struct tb_switch *sw)
ret = tb_switch_tmu_configure(sw,
TB_SWITCH_TMU_MODE_MEDRES_ENHANCED_UNI);
if (ret == -EOPNOTSUPP) {
- if (tb_switch_clx_is_enabled(sw, TB_CL1))
- ret = tb_switch_tmu_configure(sw,
- TB_SWITCH_TMU_MODE_LOWRES);
- else
- ret = tb_switch_tmu_configure(sw,
- TB_SWITCH_TMU_MODE_HIFI_BI);
+ if (tb_switch_clx_is_enabled(sw, TB_CL1)) {
+ /*
+ * Figure out uni-directional HiFi TMU requirements
+ * currently in the domain. If there are no
+ * uni-directional HiFi requirements we can put the TMU
+ * into LowRes mode.
+ *
+ * Deliberately skip bi-directional HiFi links
+ * as these work independently of other links
+ * (and they do not allow any CL states anyway).
+ */
+ if (tb_tmu_hifi_uni_required(sw->tb))
+ ret = tb_switch_tmu_configure(sw,
+ TB_SWITCH_TMU_MODE_HIFI_UNI);
+ else
+ ret = tb_switch_tmu_configure(sw,
+ TB_SWITCH_TMU_MODE_LOWRES);
+ } else {
+ ret = tb_switch_tmu_configure(sw, TB_SWITCH_TMU_MODE_HIFI_BI);
+ }
+
+ /* If not supported, fallback to bi-directional HiFi */
+ if (ret == -EOPNOTSUPP)
+ ret = tb_switch_tmu_configure(sw, TB_SWITCH_TMU_MODE_HIFI_BI);
}
if (ret)
return ret;
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 3e8b7238b427e05498034c240451af5f5495afda Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro(a)kernel.org>
Date: Mon, 28 Oct 2024 13:49:58 +0100
Subject: [PATCH] gpiolib: fix debugfs newline separators
The gpiolib debugfs interface exports a list of all gpio chips in a
system and the state of their pins.
The gpio chip sections are supposed to be separated by a newline
character, but a long-standing bug prevents the separator from
being included when output is generated in multiple sessions, making the
output inconsistent and hard to read.
Make sure to only suppress the newline separator at the beginning of the
file as intended.
Fixes: f9c4a31f6150 ("gpiolib: Use seq_file's iterator interface")
Cc: stable(a)vger.kernel.org # 3.7
Cc: Thierry Reding <treding(a)nvidia.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
Link: https://lore.kernel.org/r/20241028125000.24051-2-johan+linaro@kernel.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
---
drivers/gpio/gpiolib.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index d5952ab7752c2..e27488a90bc97 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -4926,6 +4926,8 @@ static void *gpiolib_seq_start(struct seq_file *s, loff_t *pos)
return NULL;
s->private = priv;
+ if (*pos > 0)
+ priv->newline = true;
priv->idx = srcu_read_lock(&gpio_devices_srcu);
list_for_each_entry_srcu(gdev, &gpio_devices, list,
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 4adf613e01bf99e1739f6ff3e162ad5b7d578d1a Mon Sep 17 00:00:00 2001
From: Alexander Usyskin <alexander.usyskin(a)intel.com>
Date: Tue, 15 Oct 2024 15:31:57 +0300
Subject: [PATCH] mei: use kvmalloc for read buffer
Read buffer is allocated according to max message size, reported by
the firmware and may reach 64K in systems with pxp client.
Contiguous 64k allocation may fail under memory pressure.
Read buffer is used as in-driver message storage and not required
to be contiguous.
Use kvmalloc to allow kernel to allocate non-contiguous memory.
Fixes: 3030dc056459 ("mei: add wrapper for queuing control commands.")
Cc: stable <stable(a)kernel.org>
Reported-by: Rohit Agarwal <rohiagar(a)chromium.org>
Closes: https://lore.kernel.org/all/20240813084542.2921300-1-rohiagar@chromium.org/
Tested-by: Brian Geffon <bgeffon(a)google.com>
Signed-off-by: Alexander Usyskin <alexander.usyskin(a)intel.com>
Acked-by: Tomas Winkler <tomasw(a)gmail.com>
Link: https://lore.kernel.org/r/20241015123157.2337026-1-alexander.usyskin@intel.…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/misc/mei/client.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c
index 9d090fa07516f..be011cef12e5d 100644
--- a/drivers/misc/mei/client.c
+++ b/drivers/misc/mei/client.c
@@ -321,7 +321,7 @@ void mei_io_cb_free(struct mei_cl_cb *cb)
return;
list_del(&cb->list);
- kfree(cb->buf.data);
+ kvfree(cb->buf.data);
kfree(cb->ext_hdr);
kfree(cb);
}
@@ -497,7 +497,7 @@ struct mei_cl_cb *mei_cl_alloc_cb(struct mei_cl *cl, size_t length,
if (length == 0)
return cb;
- cb->buf.data = kmalloc(roundup(length, MEI_SLOT_SIZE), GFP_KERNEL);
+ cb->buf.data = kvmalloc(roundup(length, MEI_SLOT_SIZE), GFP_KERNEL);
if (!cb->buf.data) {
mei_io_cb_free(cb);
return NULL;
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 35e41024c4c2b02ef8207f61b9004f6956cf037b Mon Sep 17 00:00:00 2001
From: Gregory Price <gourry(a)gourry.net>
Date: Fri, 25 Oct 2024 10:17:24 -0400
Subject: [PATCH] vmscan,migrate: fix page count imbalance on node stats when
demoting pages
When numa balancing is enabled with demotion, vmscan will call
migrate_pages when shrinking LRUs. migrate_pages will decrement the
the node's isolated page count, leading to an imbalanced count when
invoked from (MG)LRU code.
The result is dmesg output like such:
$ cat /proc/sys/vm/stat_refresh
[77383.088417] vmstat_refresh: nr_isolated_anon -103212
[77383.088417] vmstat_refresh: nr_isolated_file -899642
This negative value may impact compaction and reclaim throttling.
The following path produces the decrement:
shrink_folio_list
demote_folio_list
migrate_pages
migrate_pages_batch
migrate_folio_move
migrate_folio_done
mod_node_page_state(-ve) <- decrement
This path happens for SUCCESSFUL migrations, not failures. Typically
callers to migrate_pages are required to handle putback/accounting for
failures, but this is already handled in the shrink code.
When accounting for migrations, instead do not decrement the count when
the migration reason is MR_DEMOTION. As of v6.11, this demotion logic
is the only source of MR_DEMOTION.
Link: https://lkml.kernel.org/r/20241025141724.17927-1-gourry@gourry.net
Fixes: 26aa2d199d6f ("mm/migrate: demote pages during reclaim")
Signed-off-by: Gregory Price <gourry(a)gourry.net>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Reviewed-by: Davidlohr Bueso <dave(a)stgolabs.net>
Reviewed-by: Shakeel Butt <shakeel.butt(a)linux.dev>
Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
Reviewed-by: Oscar Salvador <osalvador(a)suse.de>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Wei Xu <weixugc(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/migrate.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 7e520562d421a..fab84a7760889 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1178,7 +1178,7 @@ static void migrate_folio_done(struct folio *src,
* not accounted to NR_ISOLATED_*. They can be recognized
* as __folio_test_movable
*/
- if (likely(!__folio_test_movable(src)))
+ if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION)
mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
folio_is_file_lru(src), -folio_nr_pages(src));
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 41e192ad2779cae0102879612dfe46726e4396aa Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Fri, 18 Oct 2024 04:33:10 +0900
Subject: [PATCH] nilfs2: fix kernel bug due to missing clearing of checked
flag
Syzbot reported that in directory operations after nilfs2 detects
filesystem corruption and degrades to read-only,
__block_write_begin_int(), which is called to prepare block writes, may
fail the BUG_ON check for accesses exceeding the folio/page size,
triggering a kernel bug.
This was found to be because the "checked" flag of a page/folio was not
cleared when it was discarded by nilfs2's own routine, which causes the
sanity check of directory entries to be skipped when the directory
page/folio is reloaded. So, fix that.
This was necessary when the use of nilfs2's own page discard routine was
applied to more than just metadata files.
Link: https://lkml.kernel.org/r/20241017193359.5051-1-konishi.ryusuke@gmail.com
Fixes: 8c26c4e2694a ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+d6ca2daf692c7a82f959(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d6ca2daf692c7a82f959
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/page.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5436eb0424bd1..10def4b559956 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio)
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
+ folio_clear_checked(folio);
head = folio_buffers(folio);
if (head) {
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 117932eea99b729ee5d12783601a4f7f5fd58a23 Mon Sep 17 00:00:00 2001
From: Chen Ridong <chenridong(a)huawei.com>
Date: Tue, 8 Oct 2024 11:24:56 +0000
Subject: [PATCH] cgroup/bpf: use a dedicated workqueue for cgroup bpf
destruction
A hung_task problem shown below was found:
INFO: task kworker/0:0:8 blocked for more than 327 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Workqueue: events cgroup_bpf_release
Call Trace:
<TASK>
__schedule+0x5a2/0x2050
? find_held_lock+0x33/0x100
? wq_worker_sleeping+0x9e/0xe0
schedule+0x9f/0x180
schedule_preempt_disabled+0x25/0x50
__mutex_lock+0x512/0x740
? cgroup_bpf_release+0x1e/0x4d0
? cgroup_bpf_release+0xcf/0x4d0
? process_scheduled_works+0x161/0x8a0
? cgroup_bpf_release+0x1e/0x4d0
? mutex_lock_nested+0x2b/0x40
? __pfx_delay_tsc+0x10/0x10
mutex_lock_nested+0x2b/0x40
cgroup_bpf_release+0xcf/0x4d0
? process_scheduled_works+0x161/0x8a0
? trace_event_raw_event_workqueue_execute_start+0x64/0xd0
? process_scheduled_works+0x161/0x8a0
process_scheduled_works+0x23a/0x8a0
worker_thread+0x231/0x5b0
? __pfx_worker_thread+0x10/0x10
kthread+0x14d/0x1c0
? __pfx_kthread+0x10/0x10
ret_from_fork+0x59/0x70
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1b/0x30
</TASK>
This issue can be reproduced by the following pressuse test:
1. A large number of cpuset cgroups are deleted.
2. Set cpu on and off repeatly.
3. Set watchdog_thresh repeatly.
The scripts can be obtained at LINK mentioned above the signature.
The reason for this issue is cgroup_mutex and cpu_hotplug_lock are
acquired in different tasks, which may lead to deadlock.
It can lead to a deadlock through the following steps:
1. A large number of cpusets are deleted asynchronously, which puts a
large number of cgroup_bpf_release works into system_wq. The max_active
of system_wq is WQ_DFL_ACTIVE(256). Consequently, all active works are
cgroup_bpf_release works, and many cgroup_bpf_release works will be put
into inactive queue. As illustrated in the diagram, there are 256 (in
the acvtive queue) + n (in the inactive queue) works.
2. Setting watchdog_thresh will hold cpu_hotplug_lock.read and put
smp_call_on_cpu work into system_wq. However step 1 has already filled
system_wq, 'sscs.work' is put into inactive queue. 'sscs.work' has
to wait until the works that were put into the inacvtive queue earlier
have executed (n cgroup_bpf_release), so it will be blocked for a while.
3. Cpu offline requires cpu_hotplug_lock.write, which is blocked by step 2.
4. Cpusets that were deleted at step 1 put cgroup_release works into
cgroup_destroy_wq. They are competing to get cgroup_mutex all the time.
When cgroup_metux is acqured by work at css_killed_work_fn, it will
call cpuset_css_offline, which needs to acqure cpu_hotplug_lock.read.
However, cpuset_css_offline will be blocked for step 3.
5. At this moment, there are 256 works in active queue that are
cgroup_bpf_release, they are attempting to acquire cgroup_mutex, and as
a result, all of them are blocked. Consequently, sscs.work can not be
executed. Ultimately, this situation leads to four processes being
blocked, forming a deadlock.
system_wq(step1) WatchDog(step2) cpu offline(step3) cgroup_destroy_wq(step4)
...
2000+ cgroups deleted asyn
256 actives + n inactives
__lockup_detector_reconfigure
P(cpu_hotplug_lock.read)
put sscs.work into system_wq
256 + n + 1(sscs.work)
sscs.work wait to be executed
warting sscs.work finish
percpu_down_write
P(cpu_hotplug_lock.write)
...blocking...
css_killed_work_fn
P(cgroup_mutex)
cpuset_css_offline
P(cpu_hotplug_lock.read)
...blocking...
256 cgroup_bpf_release
mutex_lock(&cgroup_mutex);
..blocking...
To fix the problem, place cgroup_bpf_release works on a dedicated
workqueue which can break the loop and solve the problem. System wqs are
for misc things which shouldn't create a large number of concurrent work
items. If something is going to generate >WQ_DFL_ACTIVE(256) concurrent
work items, it should use its own dedicated workqueue.
Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself")
Cc: stable(a)vger.kernel.org # v5.3+
Link: https://lore.kernel.org/cgroups/e90c32d2-2a85-4f28-9154-09c7d320cb60@huawei…
Tested-by: Vishal Chourasia <vishalc(a)linux.ibm.com>
Signed-off-by: Chen Ridong <chenridong(a)huawei.com>
Signed-off-by: Tejun Heo <tj(a)kernel.org>
---
kernel/bpf/cgroup.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index e7113d700b878..025d7e2214aeb 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -24,6 +24,23 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+/*
+ * cgroup bpf destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup bpf
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_bpf_destroy_wq;
+
+static int __init cgroup_bpf_wq_init(void)
+{
+ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
+ if (!cgroup_bpf_destroy_wq)
+ panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
+ return 0;
+}
+core_initcall(cgroup_bpf_wq_init);
+
/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
@@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
- queue_work(system_wq, &cgrp->bpf.release_work);
+ queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
}
/* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 4413665dd6c528b31284119e3571c25f371e1c36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Sch=C3=A4r?= <jan(a)jschaer.ch>
Date: Tue, 29 Oct 2024 23:12:49 +0100
Subject: [PATCH] ALSA: usb-audio: Add quirks for Dell WD19 dock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The WD19 family of docks has the same audio chipset as the WD15. This
change enables jack detection on the WD19.
We don't need the dell_dock_mixer_init quirk for the WD19. It is only
needed because of the dell_alc4020_map quirk for the WD15 in
mixer_maps.c, which disables the volume controls. Even for the WD15,
this quirk was apparently only needed when the dock firmware was not
updated.
Signed-off-by: Jan Schär <jan(a)jschaer.ch>
Cc: <stable(a)vger.kernel.org>
Link: https://patch.msgid.link/20241029221249.15661-1-jan@jschaer.ch
Signed-off-by: Takashi Iwai <tiwai(a)suse.de>
---
sound/usb/mixer_quirks.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c
index 2a9594f34dac6..6456e87e2f397 100644
--- a/sound/usb/mixer_quirks.c
+++ b/sound/usb/mixer_quirks.c
@@ -4042,6 +4042,9 @@ int snd_usb_mixer_apply_create_quirk(struct usb_mixer_interface *mixer)
break;
err = dell_dock_mixer_init(mixer);
break;
+ case USB_ID(0x0bda, 0x402e): /* Dell WD19 dock */
+ err = dell_dock_mixer_create(mixer);
+ break;
case USB_ID(0x2a39, 0x3fd2): /* RME ADI-2 Pro */
case USB_ID(0x2a39, 0x3fd3): /* RME ADI-2 DAC */
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 7245012f0f496162dd95d888ed2ceb5a35170f1a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg(a)intel.com>
Date: Wed, 23 Oct 2024 09:17:44 +0200
Subject: [PATCH] wifi: iwlwifi: mvm: fix 6 GHz scan construction
If more than 255 colocated APs exist for the set of all
APs found during 2.4/5 GHz scanning, then the 6 GHz scan
construction will loop forever since the loop variable
has type u8, which can never reach the number found when
that's bigger than 255, and is stored in a u32 variable.
Also move it into the loops to have a smaller scope.
Using a u32 there is fine, we limit the number of APs in
the scan list and each has a limit on the number of RNR
entries due to the frame size. With a limit of 1000 scan
results, a frame size upper bound of 4096 (really it's
more like ~2300) and a TBTT entry size of at least 11,
we get an upper bound for the number of ~372k, well in
the bounds of a u32.
Cc: stable(a)vger.kernel.org
Fixes: eae94cf82d74 ("iwlwifi: mvm: add support for 6GHz")
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219375
Link: https://patch.msgid.link/20241023091744.f4baed5c08a1.I8b417148bbc8c5d11c101…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
---
drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index 3ce9150213a74..ddcbd80a49fb2 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -1774,7 +1774,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm,
&cp->channel_config[ch_cnt];
u32 s_ssid_bitmap = 0, bssid_bitmap = 0, flags = 0;
- u8 j, k, n_s_ssids = 0, n_bssids = 0;
+ u8 k, n_s_ssids = 0, n_bssids = 0;
u8 max_s_ssids, max_bssids;
bool force_passive = false, found = false, allow_passive = true,
unsolicited_probe_on_chan = false, psc_no_listen = false;
@@ -1799,7 +1799,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm,
cfg->v5.iter_count = 1;
cfg->v5.iter_interval = 0;
- for (j = 0; j < params->n_6ghz_params; j++) {
+ for (u32 j = 0; j < params->n_6ghz_params; j++) {
s8 tmp_psd_20;
if (!(scan_6ghz_params[j].channel_idx == i))
@@ -1873,7 +1873,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm,
* SSID.
* TODO: improve this logic
*/
- for (j = 0; j < params->n_6ghz_params; j++) {
+ for (u32 j = 0; j < params->n_6ghz_params; j++) {
if (!(scan_6ghz_params[j].channel_idx == i))
continue;
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From aec8e6bf839101784f3ef037dcdb9432c3f32343 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
Date: Mon, 21 Oct 2024 22:02:15 +0800
Subject: [PATCH] btrfs: fix use-after-free of block device file in
__btrfs_free_extra_devids()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Mounting btrfs from two images (which have the same one fsid and two
different dev_uuids) in certain executing order may trigger an UAF for
variable 'device->bdev_file' in __btrfs_free_extra_devids(). And
following are the details:
1. Attach image_1 to loop0, attach image_2 to loop1, and scan btrfs
devices by ioctl(BTRFS_IOC_SCAN_DEV):
/ btrfs_device_1 → loop0
fs_device
\ btrfs_device_2 → loop1
2. mount /dev/loop0 /mnt
btrfs_open_devices
btrfs_device_1->bdev_file = btrfs_get_bdev_and_sb(loop0)
btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
btrfs_fill_super
open_ctree
fail: btrfs_close_devices // -ENOMEM
btrfs_close_bdev(btrfs_device_1)
fput(btrfs_device_1->bdev_file)
// btrfs_device_1->bdev_file is freed
btrfs_close_bdev(btrfs_device_2)
fput(btrfs_device_2->bdev_file)
3. mount /dev/loop1 /mnt
btrfs_open_devices
btrfs_get_bdev_and_sb(&bdev_file)
// EIO, btrfs_device_1->bdev_file is not assigned,
// which points to a freed memory area
btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
btrfs_fill_super
open_ctree
btrfs_free_extra_devids
if (btrfs_device_1->bdev_file)
fput(btrfs_device_1->bdev_file) // UAF !
Fix it by setting 'device->bdev_file' as 'NULL' after closing the
btrfs_device in btrfs_close_one_device().
Fixes: 142388194191 ("btrfs: do not background blkdev_put()")
CC: stable(a)vger.kernel.org # 4.19+
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219408
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/volumes.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8f340ad1d9384..eb51b609190fb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1105,6 +1105,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
+ device->bdev_file = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
--
2.43.0
The patch below does not apply to the v5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From d949d1d14fa281ace388b1de978e8f2cd52875cf Mon Sep 17 00:00:00 2001
From: Jeongjun Park <aha310510(a)gmail.com>
Date: Mon, 9 Sep 2024 21:35:58 +0900
Subject: [PATCH] mm: shmem: fix data-race in shmem_getattr()
I got the following KCSAN report during syzbot testing:
==================================================================
BUG: KCSAN: data-race in generic_fillattr / inode_set_ctime_current
write to 0xffff888102eb3260 of 4 bytes by task 6565 on cpu 1:
inode_set_ctime_to_ts include/linux/fs.h:1638 [inline]
inode_set_ctime_current+0x169/0x1d0 fs/inode.c:2626
shmem_mknod+0x117/0x180 mm/shmem.c:3443
shmem_create+0x34/0x40 mm/shmem.c:3497
lookup_open fs/namei.c:3578 [inline]
open_last_lookups fs/namei.c:3647 [inline]
path_openat+0xdbc/0x1f00 fs/namei.c:3883
do_filp_open+0xf7/0x200 fs/namei.c:3913
do_sys_openat2+0xab/0x120 fs/open.c:1416
do_sys_open fs/open.c:1431 [inline]
__do_sys_openat fs/open.c:1447 [inline]
__se_sys_openat fs/open.c:1442 [inline]
__x64_sys_openat+0xf3/0x120 fs/open.c:1442
x64_sys_call+0x1025/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:258
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x76/0x7e
read to 0xffff888102eb3260 of 4 bytes by task 3498 on cpu 0:
inode_get_ctime_nsec include/linux/fs.h:1623 [inline]
inode_get_ctime include/linux/fs.h:1629 [inline]
generic_fillattr+0x1dd/0x2f0 fs/stat.c:62
shmem_getattr+0x17b/0x200 mm/shmem.c:1157
vfs_getattr_nosec fs/stat.c:166 [inline]
vfs_getattr+0x19b/0x1e0 fs/stat.c:207
vfs_statx_path fs/stat.c:251 [inline]
vfs_statx+0x134/0x2f0 fs/stat.c:315
vfs_fstatat+0xec/0x110 fs/stat.c:341
__do_sys_newfstatat fs/stat.c:505 [inline]
__se_sys_newfstatat+0x58/0x260 fs/stat.c:499
__x64_sys_newfstatat+0x55/0x70 fs/stat.c:499
x64_sys_call+0x141f/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:263
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x76/0x7e
value changed: 0x2755ae53 -> 0x27ee44d3
Reported by Kernel Concurrency Sanitizer on:
CPU: 0 UID: 0 PID: 3498 Comm: udevd Not tainted 6.11.0-rc6-syzkaller-00326-gd1f2d51b711a-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024
==================================================================
When calling generic_fillattr(), if you don't hold read lock, data-race
will occur in inode member variables, which can cause unexpected
behavior.
Since there is no special protection when shmem_getattr() calls
generic_fillattr(), data-race occurs by functions such as shmem_unlink()
or shmem_mknod(). This can cause unexpected results, so commenting it out
is not enough.
Therefore, when calling generic_fillattr() from shmem_getattr(), it is
appropriate to protect the inode using inode_lock_shared() and
inode_unlock_shared() to prevent data-race.
Link: https://lkml.kernel.org/r/20240909123558.70229-1-aha310510@gmail.com
Fixes: 44a30220bc0a ("shmem: recalculate file inode when fstat")
Signed-off-by: Jeongjun Park <aha310510(a)gmail.com>
Reported-by: syzbot <syzkaller(a)googlegroup.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mm/shmem.c b/mm/shmem.c
index c5adb987b23cf..4ba1d00fabdaa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
+ inode_lock_shared(inode);
generic_fillattr(idmap, request_mask, inode, stat);
+ inode_unlock_shared(inode);
if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 1b6063a57754eae5705753c01e78dc268b989038 Mon Sep 17 00:00:00 2001
From: Ovidiu Bunea <Ovidiu.Bunea(a)amd.com>
Date: Fri, 11 Oct 2024 11:12:19 -0400
Subject: [PATCH] Revert "drm/amd/display: update DML2 policy
EnhancedPrefetchScheduleAccelerationFinal DCN35"
This reverts
commit 9dad21f910fc ("drm/amd/display: update DML2 policy EnhancedPrefetchScheduleAccelerationFinal DCN35")
[why & how]
The offending commit exposes a hang with lid close/open behavior.
Both issues seem to be related to ODM 2:1 mode switching, so there
is another issue generic to that sequence that needs to be
investigated.
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Reviewed-by: Nicholas Kazlauskas <nicholas.kazlauskas(a)amd.com>
Signed-off-by: Ovidiu Bunea <Ovidiu.Bunea(a)amd.com>
Signed-off-by: Tom Chung <chiahsuan.chung(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 68bf95317ebf2cfa7105251e4279e951daceefb7)
Cc: stable(a)vger.kernel.org
---
drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c
index 11c904ae29586..c4c52173ef224 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c
@@ -303,6 +303,7 @@ void build_unoptimized_policy_settings(enum dml_project_id project, struct dml_m
if (project == dml_project_dcn35 ||
project == dml_project_dcn351) {
policy->DCCProgrammingAssumesScanDirectionUnknownFinal = false;
+ policy->EnhancedPrefetchScheduleAccelerationFinal = 0;
policy->AllowForPStateChangeOrStutterInVBlankFinal = dml_prefetch_support_uclk_fclk_and_stutter_if_possible; /*new*/
policy->UseOnlyMaxPrefetchModes = 1;
}
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 1db272864ff250b5e607283eaec819e1186c8e26 Mon Sep 17 00:00:00 2001
From: Sabyrzhan Tasbolatov <snovitoll(a)gmail.com>
Date: Wed, 16 Oct 2024 20:24:07 +0500
Subject: [PATCH] x86/traps: move kmsan check after instrumentation_begin
During x86_64 kernel build with CONFIG_KMSAN, the objtool warns following:
AR built-in.a
AR vmlinux.a
LD vmlinux.o
vmlinux.o: warning: objtool: handle_bug+0x4: call to
kmsan_unpoison_entry_regs() leaves .noinstr.text section
OBJCOPY modules.builtin.modinfo
GEN modules.builtin
MODPOST Module.symvers
CC .vmlinux.export.o
Moving kmsan_unpoison_entry_regs() _after_ instrumentation_begin() fixes
the warning.
There is decode_bug(regs->ip, &imm) is left before KMSAN unpoisoining, but
it has the return condition and if we include it after
instrumentation_begin() it results the warning "return with
instrumentation enabled", hence, I'm concerned that regs will not be KMSAN
unpoisoned if `ud_type == BUG_NONE` is true.
Link: https://lkml.kernel.org/r/20241016152407.3149001-1-snovitoll@gmail.com
Fixes: ba54d194f8da ("x86/traps: avoid KMSAN bugs originating from handle_bug()")
Signed-off-by: Sabyrzhan Tasbolatov <snovitoll(a)gmail.com>
Reviewed-by: Alexander Potapenko <glider(a)google.com>
Cc: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/x86/kernel/traps.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d05392db5d0fe..2dbadf347b5f4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -261,12 +261,6 @@ static noinstr bool handle_bug(struct pt_regs *regs)
int ud_type;
u32 imm;
- /*
- * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
- * is a rare case that uses @regs without passing them to
- * irqentry_enter().
- */
- kmsan_unpoison_entry_regs(regs);
ud_type = decode_bug(regs->ip, &imm);
if (ud_type == BUG_NONE)
return handled;
@@ -275,6 +269,12 @@ static noinstr bool handle_bug(struct pt_regs *regs)
* All lies, just get the WARN/BUG out.
*/
instrumentation_begin();
+ /*
+ * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug()
+ * is a rare case that uses @regs without passing them to
+ * irqentry_enter().
+ */
+ kmsan_unpoison_entry_regs(regs);
/*
* Since we're emulating a CALL with exceptions, restore the interrupt
* state to what it was at the exception site.
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 3cea8af2d1a9ae5869b47c3dabe3b20f331f3bbd Mon Sep 17 00:00:00 2001
From: Gil Fine <gil.fine(a)linux.intel.com>
Date: Thu, 10 Oct 2024 17:29:42 +0300
Subject: [PATCH] thunderbolt: Honor TMU requirements in the domain when
setting TMU mode
Currently, when configuring TMU (Time Management Unit) mode of a given
router, we take into account only its own TMU requirements ignoring
other routers in the domain. This is problematic if the router we are
configuring has lower TMU requirements than what is already configured
in the domain.
In the scenario below, we have a host router with two USB4 ports: A and
B. Port A connected to device router #1 (which supports CL states) and
existing DisplayPort tunnel, thus, the TMU mode is HiFi uni-directional.
1. Initial topology
[Host]
A/
/
[Device #1]
/
Monitor
2. Plug in device #2 (that supports CL states) to downstream port B of
the host router
[Host]
A/ B\
/ \
[Device #1] [Device #2]
/
Monitor
The TMU mode on port B and port A will be configured to LowRes which is
not what we want and will cause monitor to start flickering.
To address this we first scan the domain and search for any router
configured to HiFi uni-directional mode, and if found, configure TMU
mode of the given router to HiFi uni-directional as well.
Cc: stable(a)vger.kernel.org
Signed-off-by: Gil Fine <gil.fine(a)linux.intel.com>
Signed-off-by: Mika Westerberg <mika.westerberg(a)linux.intel.com>
---
drivers/thunderbolt/tb.c | 48 +++++++++++++++++++++++++++++++++++-----
1 file changed, 42 insertions(+), 6 deletions(-)
diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c
index 10e719dd837ce..4f777788e9179 100644
--- a/drivers/thunderbolt/tb.c
+++ b/drivers/thunderbolt/tb.c
@@ -288,6 +288,24 @@ static void tb_increase_tmu_accuracy(struct tb_tunnel *tunnel)
device_for_each_child(&sw->dev, NULL, tb_increase_switch_tmu_accuracy);
}
+static int tb_switch_tmu_hifi_uni_required(struct device *dev, void *not_used)
+{
+ struct tb_switch *sw = tb_to_switch(dev);
+
+ if (sw && tb_switch_tmu_is_enabled(sw) &&
+ tb_switch_tmu_is_configured(sw, TB_SWITCH_TMU_MODE_HIFI_UNI))
+ return 1;
+
+ return device_for_each_child(dev, NULL,
+ tb_switch_tmu_hifi_uni_required);
+}
+
+static bool tb_tmu_hifi_uni_required(struct tb *tb)
+{
+ return device_for_each_child(&tb->dev, NULL,
+ tb_switch_tmu_hifi_uni_required) == 1;
+}
+
static int tb_enable_tmu(struct tb_switch *sw)
{
int ret;
@@ -302,12 +320,30 @@ static int tb_enable_tmu(struct tb_switch *sw)
ret = tb_switch_tmu_configure(sw,
TB_SWITCH_TMU_MODE_MEDRES_ENHANCED_UNI);
if (ret == -EOPNOTSUPP) {
- if (tb_switch_clx_is_enabled(sw, TB_CL1))
- ret = tb_switch_tmu_configure(sw,
- TB_SWITCH_TMU_MODE_LOWRES);
- else
- ret = tb_switch_tmu_configure(sw,
- TB_SWITCH_TMU_MODE_HIFI_BI);
+ if (tb_switch_clx_is_enabled(sw, TB_CL1)) {
+ /*
+ * Figure out uni-directional HiFi TMU requirements
+ * currently in the domain. If there are no
+ * uni-directional HiFi requirements we can put the TMU
+ * into LowRes mode.
+ *
+ * Deliberately skip bi-directional HiFi links
+ * as these work independently of other links
+ * (and they do not allow any CL states anyway).
+ */
+ if (tb_tmu_hifi_uni_required(sw->tb))
+ ret = tb_switch_tmu_configure(sw,
+ TB_SWITCH_TMU_MODE_HIFI_UNI);
+ else
+ ret = tb_switch_tmu_configure(sw,
+ TB_SWITCH_TMU_MODE_LOWRES);
+ } else {
+ ret = tb_switch_tmu_configure(sw, TB_SWITCH_TMU_MODE_HIFI_BI);
+ }
+
+ /* If not supported, fallback to bi-directional HiFi */
+ if (ret == -EOPNOTSUPP)
+ ret = tb_switch_tmu_configure(sw, TB_SWITCH_TMU_MODE_HIFI_BI);
}
if (ret)
return ret;
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 3e8b7238b427e05498034c240451af5f5495afda Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro(a)kernel.org>
Date: Mon, 28 Oct 2024 13:49:58 +0100
Subject: [PATCH] gpiolib: fix debugfs newline separators
The gpiolib debugfs interface exports a list of all gpio chips in a
system and the state of their pins.
The gpio chip sections are supposed to be separated by a newline
character, but a long-standing bug prevents the separator from
being included when output is generated in multiple sessions, making the
output inconsistent and hard to read.
Make sure to only suppress the newline separator at the beginning of the
file as intended.
Fixes: f9c4a31f6150 ("gpiolib: Use seq_file's iterator interface")
Cc: stable(a)vger.kernel.org # 3.7
Cc: Thierry Reding <treding(a)nvidia.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
Link: https://lore.kernel.org/r/20241028125000.24051-2-johan+linaro@kernel.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
---
drivers/gpio/gpiolib.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index d5952ab7752c2..e27488a90bc97 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -4926,6 +4926,8 @@ static void *gpiolib_seq_start(struct seq_file *s, loff_t *pos)
return NULL;
s->private = priv;
+ if (*pos > 0)
+ priv->newline = true;
priv->idx = srcu_read_lock(&gpio_devices_srcu);
list_for_each_entry_srcu(gdev, &gpio_devices, list,
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 4adf613e01bf99e1739f6ff3e162ad5b7d578d1a Mon Sep 17 00:00:00 2001
From: Alexander Usyskin <alexander.usyskin(a)intel.com>
Date: Tue, 15 Oct 2024 15:31:57 +0300
Subject: [PATCH] mei: use kvmalloc for read buffer
Read buffer is allocated according to max message size, reported by
the firmware and may reach 64K in systems with pxp client.
Contiguous 64k allocation may fail under memory pressure.
Read buffer is used as in-driver message storage and not required
to be contiguous.
Use kvmalloc to allow kernel to allocate non-contiguous memory.
Fixes: 3030dc056459 ("mei: add wrapper for queuing control commands.")
Cc: stable <stable(a)kernel.org>
Reported-by: Rohit Agarwal <rohiagar(a)chromium.org>
Closes: https://lore.kernel.org/all/20240813084542.2921300-1-rohiagar@chromium.org/
Tested-by: Brian Geffon <bgeffon(a)google.com>
Signed-off-by: Alexander Usyskin <alexander.usyskin(a)intel.com>
Acked-by: Tomas Winkler <tomasw(a)gmail.com>
Link: https://lore.kernel.org/r/20241015123157.2337026-1-alexander.usyskin@intel.…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
drivers/misc/mei/client.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c
index 9d090fa07516f..be011cef12e5d 100644
--- a/drivers/misc/mei/client.c
+++ b/drivers/misc/mei/client.c
@@ -321,7 +321,7 @@ void mei_io_cb_free(struct mei_cl_cb *cb)
return;
list_del(&cb->list);
- kfree(cb->buf.data);
+ kvfree(cb->buf.data);
kfree(cb->ext_hdr);
kfree(cb);
}
@@ -497,7 +497,7 @@ struct mei_cl_cb *mei_cl_alloc_cb(struct mei_cl *cl, size_t length,
if (length == 0)
return cb;
- cb->buf.data = kmalloc(roundup(length, MEI_SLOT_SIZE), GFP_KERNEL);
+ cb->buf.data = kvmalloc(roundup(length, MEI_SLOT_SIZE), GFP_KERNEL);
if (!cb->buf.data) {
mei_io_cb_free(cb);
return NULL;
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 41e192ad2779cae0102879612dfe46726e4396aa Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Fri, 18 Oct 2024 04:33:10 +0900
Subject: [PATCH] nilfs2: fix kernel bug due to missing clearing of checked
flag
Syzbot reported that in directory operations after nilfs2 detects
filesystem corruption and degrades to read-only,
__block_write_begin_int(), which is called to prepare block writes, may
fail the BUG_ON check for accesses exceeding the folio/page size,
triggering a kernel bug.
This was found to be because the "checked" flag of a page/folio was not
cleared when it was discarded by nilfs2's own routine, which causes the
sanity check of directory entries to be skipped when the directory
page/folio is reloaded. So, fix that.
This was necessary when the use of nilfs2's own page discard routine was
applied to more than just metadata files.
Link: https://lkml.kernel.org/r/20241017193359.5051-1-konishi.ryusuke@gmail.com
Fixes: 8c26c4e2694a ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+d6ca2daf692c7a82f959(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d6ca2daf692c7a82f959
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/nilfs2/page.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5436eb0424bd1..10def4b559956 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio)
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
+ folio_clear_checked(folio);
head = folio_buffers(folio);
if (head) {
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 48f62d38a07d464a499fa834638afcfd2b68f852 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Tue, 22 Oct 2024 18:43:40 -0700
Subject: [PATCH] cxl/acpi: Ensure ports ready at cxl_acpi_probe() return
In order to ensure root CXL ports are enabled upon cxl_acpi_probe()
when the 'cxl_port' driver is built as a module, arrange for the
module to be pre-loaded or built-in.
The "Fixes:" but no "Cc: stable" on this patch reflects that the issue
is merely by inspection since the bug that triggered the discovery of
this potential problem [1] is fixed by other means. However, a stable
backport should do no harm.
Fixes: 8dd2bc0f8e02 ("cxl/mem: Add the cxl_mem driver")
Link: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net [1]
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Tested-by: Gregory Price <gourry(a)gourry.net>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Reviewed-by: Ira Weiny <ira.weiny(a)intel.com>
Link: https://patch.msgid.link/172964781969.81806.17276352414854540808.stgit@dwil…
Signed-off-by: Ira Weiny <ira.weiny(a)intel.com>
---
drivers/cxl/acpi.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index 82b78e331d8ed..432b7cfd12a8e 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -924,6 +924,13 @@ static void __exit cxl_acpi_exit(void)
/* load before dax_hmem sees 'Soft Reserved' CXL ranges */
subsys_initcall(cxl_acpi_init);
+
+/*
+ * Arrange for host-bridge ports to be active synchronous with
+ * cxl_acpi_probe() exit.
+ */
+MODULE_SOFTDEP("pre: cxl_port");
+
module_exit(cxl_acpi_exit);
MODULE_DESCRIPTION("CXL ACPI: Platform Support");
MODULE_LICENSE("GPL v2");
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 77b0d113eec49a7390ff1a08ca1923e89f5f86c6 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Tue, 29 Oct 2024 15:18:45 +0000
Subject: [PATCH] btrfs: fix defrag not merging contiguous extents due to
merged extent maps
When running defrag (manual defrag) against a file that has extents that
are contiguous and we already have the respective extent maps loaded and
merged, we end up not defragging the range covered by those contiguous
extents. This happens when we have an extent map that was the result of
merging multiple extent maps for contiguous extents and the length of the
merged extent map is greater than or equals to the defrag threshold
length.
The script below reproduces this scenario:
$ cat test.sh
#!/bin/bash
DEV=/dev/sdi
MNT=/mnt/sdi
mkfs.btrfs -f $DEV
mount $DEV $MNT
# Create a 256K file with 4 extents of 64K each.
xfs_io -f -c "falloc 0 64K" \
-c "pwrite 0 64K" \
-c "falloc 64K 64K" \
-c "pwrite 64K 64K" \
-c "falloc 128K 64K" \
-c "pwrite 128K 64K" \
-c "falloc 192K 64K" \
-c "pwrite 192K 64K" \
$MNT/foo
umount $MNT
echo -n "Initial number of file extent items: "
btrfs inspect-internal dump-tree -t 5 $DEV | grep EXTENT_DATA | wc -l
mount $DEV $MNT
# Read the whole file in order to load and merge extent maps.
cat $MNT/foo > /dev/null
btrfs filesystem defragment -t 128K $MNT/foo
umount $MNT
echo -n "Number of file extent items after defrag with 128K threshold: "
btrfs inspect-internal dump-tree -t 5 $DEV | grep EXTENT_DATA | wc -l
mount $DEV $MNT
# Read the whole file in order to load and merge extent maps.
cat $MNT/foo > /dev/null
btrfs filesystem defragment -t 256K $MNT/foo
umount $MNT
echo -n "Number of file extent items after defrag with 256K threshold: "
btrfs inspect-internal dump-tree -t 5 $DEV | grep EXTENT_DATA | wc -l
Running it:
$ ./test.sh
Initial number of file extent items: 4
Number of file extent items after defrag with 128K threshold: 4
Number of file extent items after defrag with 256K threshold: 4
The 4 extents don't get merged because we have an extent map with a size
of 256K that is the result of merging the individual extent maps for each
of the four 64K extents and at defrag_lookup_extent() we have a value of
zero for the generation threshold ('newer_than' argument) since this is a
manual defrag. As a consequence we don't call defrag_get_extent() to get
an extent map representing a single file extent item in the inode's
subvolume tree, so we end up using the merged extent map at
defrag_collect_targets() and decide not to defrag.
Fix this by updating defrag_lookup_extent() to always discard extent maps
that were merged and call defrag_get_extent() regardless of the minimum
generation threshold ('newer_than' argument).
A test case for fstests will be sent along soon.
CC: stable(a)vger.kernel.org # 6.1+
Fixes: 199257a78bb0 ("btrfs: defrag: don't use merged extent map for their generation check")
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/defrag.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index b95ef44c326bd..968dae9539482 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -763,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* We can get a merged extent, in that case, we need to re-search
* tree to get the original em for defrag.
*
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
+ * This is because even if we have adjacent extents that are contiguous
+ * and compatible (same type and flags), we still want to defrag them
+ * so that we use less metadata (extent items in the extent tree and
+ * file extent items in the inode's subvolume tree).
*/
- if (em && (em->flags & EXTENT_FLAG_MERGED) &&
- newer_than && em->generation >= newer_than) {
+ if (em && (em->flags & EXTENT_FLAG_MERGED)) {
free_extent_map(em);
em = NULL;
}
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 101c268bd2f37e965a5468353e62d154db38838e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Tue, 22 Oct 2024 18:43:49 -0700
Subject: [PATCH] cxl/port: Fix use-after-free, permit out-of-order decoder
shutdown
In support of investigating an initialization failure report [1],
cxl_test was updated to register mock memory-devices after the mock
root-port/bus device had been registered. That led to cxl_test crashing
with a use-after-free bug with the following signature:
cxl_port_attach_region: cxl region3: cxl_host_bridge.0:port3 decoder3.0 add: mem0:decoder7.0 @ 0 next: cxl_switch_uport.0 nr_eps: 1 nr_targets: 1
cxl_port_attach_region: cxl region3: cxl_host_bridge.0:port3 decoder3.0 add: mem4:decoder14.0 @ 1 next: cxl_switch_uport.0 nr_eps: 2 nr_targets: 1
cxl_port_setup_targets: cxl region3: cxl_switch_uport.0:port6 target[0] = cxl_switch_dport.0 for mem0:decoder7.0 @ 0
1) cxl_port_setup_targets: cxl region3: cxl_switch_uport.0:port6 target[1] = cxl_switch_dport.4 for mem4:decoder14.0 @ 1
[..]
cxld_unregister: cxl decoder14.0:
cxl_region_decode_reset: cxl_region region3:
mock_decoder_reset: cxl_port port3: decoder3.0 reset
2) mock_decoder_reset: cxl_port port3: decoder3.0: out of order reset, expected decoder3.1
cxl_endpoint_decoder_release: cxl decoder14.0:
[..]
cxld_unregister: cxl decoder7.0:
3) cxl_region_decode_reset: cxl_region region3:
Oops: general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6bc3: 0000 [#1] PREEMPT SMP PTI
[..]
RIP: 0010:to_cxl_port+0x8/0x60 [cxl_core]
[..]
Call Trace:
<TASK>
cxl_region_decode_reset+0x69/0x190 [cxl_core]
cxl_region_detach+0xe8/0x210 [cxl_core]
cxl_decoder_kill_region+0x27/0x40 [cxl_core]
cxld_unregister+0x5d/0x60 [cxl_core]
At 1) a region has been established with 2 endpoint decoders (7.0 and
14.0). Those endpoints share a common switch-decoder in the topology
(3.0). At teardown, 2), decoder14.0 is the first to be removed and hits
the "out of order reset case" in the switch decoder. The effect though
is that region3 cleanup is aborted leaving it in-tact and
referencing decoder14.0. At 3) the second attempt to teardown region3
trips over the stale decoder14.0 object which has long since been
deleted.
The fix here is to recognize that the CXL specification places no
mandate on in-order shutdown of switch-decoders, the driver enforces
in-order allocation, and hardware enforces in-order commit. So, rather
than fail and leave objects dangling, always remove them.
In support of making cxl_region_decode_reset() always succeed,
cxl_region_invalidate_memregion() failures are turned into warnings.
Crashing the kernel is ok there since system integrity is at risk if
caches cannot be managed around physical address mutation events like
CXL region destruction.
A new device_for_each_child_reverse_from() is added to cleanup
port->commit_end after all dependent decoders have been disabled. In
other words if decoders are allocated 0->1->2 and disabled 1->2->0 then
port->commit_end only decrements from 2 after 2 has been disabled, and
it decrements all the way to zero since 1 was disabled previously.
Link: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net [1]
Cc: stable(a)vger.kernel.org
Fixes: 176baefb2eb5 ("cxl/hdm: Commit decoder state to hardware")
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Alison Schofield <alison.schofield(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: Zijun Hu <quic_zijuhu(a)quicinc.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Ira Weiny <ira.weiny(a)intel.com>
Link: https://patch.msgid.link/172964782781.81806.17902885593105284330.stgit@dwil…
Signed-off-by: Ira Weiny <ira.weiny(a)intel.com>
---
drivers/base/core.c | 35 +++++++++++++++++++++++++
drivers/cxl/core/hdm.c | 50 ++++++++++++++++++++++++++++++------
drivers/cxl/core/region.c | 48 ++++++++++------------------------
drivers/cxl/cxl.h | 3 ++-
include/linux/device.h | 3 +++
tools/testing/cxl/test/cxl.c | 14 ++++------
6 files changed, 100 insertions(+), 53 deletions(-)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index a4c853411a6b2..e42f1ad730780 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4037,6 +4037,41 @@ int device_for_each_child_reverse(struct device *parent, void *data,
}
EXPORT_SYMBOL_GPL(device_for_each_child_reverse);
+/**
+ * device_for_each_child_reverse_from - device child iterator in reversed order.
+ * @parent: parent struct device.
+ * @from: optional starting point in child list
+ * @fn: function to be called for each device.
+ * @data: data for the callback.
+ *
+ * Iterate over @parent's child devices, starting at @from, and call @fn
+ * for each, passing it @data. This helper is identical to
+ * device_for_each_child_reverse() when @from is NULL.
+ *
+ * @fn is checked each iteration. If it returns anything other than 0,
+ * iteration stop and that value is returned to the caller of
+ * device_for_each_child_reverse_from();
+ */
+int device_for_each_child_reverse_from(struct device *parent,
+ struct device *from, const void *data,
+ int (*fn)(struct device *, const void *))
+{
+ struct klist_iter i;
+ struct device *child;
+ int error = 0;
+
+ if (!parent->p)
+ return 0;
+
+ klist_iter_init_node(&parent->p->klist_children, &i,
+ (from ? &from->p->knode_parent : NULL));
+ while ((child = prev_device(&i)) && !error)
+ error = fn(child, data);
+ klist_iter_exit(&i);
+ return error;
+}
+EXPORT_SYMBOL_GPL(device_for_each_child_reverse_from);
+
/**
* device_find_child - device iterator for locating a particular device.
* @parent: parent struct device
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 3df10517a3278..223c273c0cd17 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -712,7 +712,44 @@ static int cxl_decoder_commit(struct cxl_decoder *cxld)
return 0;
}
-static int cxl_decoder_reset(struct cxl_decoder *cxld)
+static int commit_reap(struct device *dev, const void *data)
+{
+ struct cxl_port *port = to_cxl_port(dev->parent);
+ struct cxl_decoder *cxld;
+
+ if (!is_switch_decoder(dev) && !is_endpoint_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+ if (port->commit_end == cxld->id &&
+ ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
+ port->commit_end--;
+ dev_dbg(&port->dev, "reap: %s commit_end: %d\n",
+ dev_name(&cxld->dev), port->commit_end);
+ }
+
+ return 0;
+}
+
+void cxl_port_commit_reap(struct cxl_decoder *cxld)
+{
+ struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+
+ lockdep_assert_held_write(&cxl_region_rwsem);
+
+ /*
+ * Once the highest committed decoder is disabled, free any other
+ * decoders that were pinned allocated by out-of-order release.
+ */
+ port->commit_end--;
+ dev_dbg(&port->dev, "reap: %s commit_end: %d\n", dev_name(&cxld->dev),
+ port->commit_end);
+ device_for_each_child_reverse_from(&port->dev, &cxld->dev, NULL,
+ commit_reap);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_commit_reap, CXL);
+
+static void cxl_decoder_reset(struct cxl_decoder *cxld)
{
struct cxl_port *port = to_cxl_port(cxld->dev.parent);
struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
@@ -721,14 +758,14 @@ static int cxl_decoder_reset(struct cxl_decoder *cxld)
u32 ctrl;
if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
- return 0;
+ return;
- if (port->commit_end != id) {
+ if (port->commit_end == id)
+ cxl_port_commit_reap(cxld);
+ else
dev_dbg(&port->dev,
"%s: out of order reset, expected decoder%d.%d\n",
dev_name(&cxld->dev), port->id, port->commit_end);
- return -EBUSY;
- }
down_read(&cxl_dpa_rwsem);
ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
@@ -741,7 +778,6 @@ static int cxl_decoder_reset(struct cxl_decoder *cxld)
writel(0, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id));
up_read(&cxl_dpa_rwsem);
- port->commit_end--;
cxld->flags &= ~CXL_DECODER_F_ENABLE;
/* Userspace is now responsible for reconfiguring this decoder */
@@ -751,8 +787,6 @@ static int cxl_decoder_reset(struct cxl_decoder *cxld)
cxled = to_cxl_endpoint_decoder(&cxld->dev);
cxled->state = CXL_DECODER_STATE_MANUAL;
}
-
- return 0;
}
static int cxl_setup_hdm_decoder_from_dvsec(
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index e701e4b040328..3478d20583035 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -232,8 +232,8 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
"Bypassing cpu_cache_invalidate_memregion() for testing!\n");
return 0;
} else {
- dev_err(&cxlr->dev,
- "Failed to synchronize CPU cache state\n");
+ dev_WARN(&cxlr->dev,
+ "Failed to synchronize CPU cache state\n");
return -ENXIO;
}
}
@@ -242,19 +242,17 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
return 0;
}
-static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
+static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
{
struct cxl_region_params *p = &cxlr->params;
- int i, rc = 0;
+ int i;
/*
- * Before region teardown attempt to flush, and if the flush
- * fails cancel the region teardown for data consistency
- * concerns
+ * Before region teardown attempt to flush, evict any data cached for
+ * this region, or scream loudly about missing arch / platform support
+ * for CXL teardown.
*/
- rc = cxl_region_invalidate_memregion(cxlr);
- if (rc)
- return rc;
+ cxl_region_invalidate_memregion(cxlr);
for (i = count - 1; i >= 0; i--) {
struct cxl_endpoint_decoder *cxled = p->targets[i];
@@ -277,23 +275,17 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
cxl_rr = cxl_rr_load(iter, cxlr);
cxld = cxl_rr->decoder;
if (cxld->reset)
- rc = cxld->reset(cxld);
- if (rc)
- return rc;
+ cxld->reset(cxld);
set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
}
endpoint_reset:
- rc = cxled->cxld.reset(&cxled->cxld);
- if (rc)
- return rc;
+ cxled->cxld.reset(&cxled->cxld);
set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
}
/* all decoders associated with this region have been torn down */
clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
-
- return 0;
}
static int commit_decoder(struct cxl_decoder *cxld)
@@ -409,16 +401,8 @@ static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
* still pending.
*/
if (p->state == CXL_CONFIG_RESET_PENDING) {
- rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
- /*
- * Revert to committed since there may still be active
- * decoders associated with this region, or move forward
- * to active to mark the reset successful
- */
- if (rc)
- p->state = CXL_CONFIG_COMMIT;
- else
- p->state = CXL_CONFIG_ACTIVE;
+ cxl_region_decode_reset(cxlr, p->interleave_ways);
+ p->state = CXL_CONFIG_ACTIVE;
}
}
@@ -2054,13 +2038,7 @@ static int cxl_region_detach(struct cxl_endpoint_decoder *cxled)
get_device(&cxlr->dev);
if (p->state > CXL_CONFIG_ACTIVE) {
- /*
- * TODO: tear down all impacted regions if a device is
- * removed out of order
- */
- rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
- if (rc)
- goto out;
+ cxl_region_decode_reset(cxlr, p->interleave_ways);
p->state = CXL_CONFIG_ACTIVE;
}
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 0d8b810a51f04..5406e3ab3d4a4 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -359,7 +359,7 @@ struct cxl_decoder {
struct cxl_region *region;
unsigned long flags;
int (*commit)(struct cxl_decoder *cxld);
- int (*reset)(struct cxl_decoder *cxld);
+ void (*reset)(struct cxl_decoder *cxld);
};
/*
@@ -730,6 +730,7 @@ static inline bool is_cxl_root(struct cxl_port *port)
int cxl_num_decoders_committed(struct cxl_port *port);
bool is_cxl_port(const struct device *dev);
struct cxl_port *to_cxl_port(const struct device *dev);
+void cxl_port_commit_reap(struct cxl_decoder *cxld);
struct pci_bus;
int devm_cxl_register_pci_bus(struct device *host, struct device *uport_dev,
struct pci_bus *bus);
diff --git a/include/linux/device.h b/include/linux/device.h
index b4bde8d226979..667cb6db90193 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1078,6 +1078,9 @@ int device_for_each_child(struct device *dev, void *data,
int (*fn)(struct device *dev, void *data));
int device_for_each_child_reverse(struct device *dev, void *data,
int (*fn)(struct device *dev, void *data));
+int device_for_each_child_reverse_from(struct device *parent,
+ struct device *from, const void *data,
+ int (*fn)(struct device *, const void *));
struct device *device_find_child(struct device *dev, void *data,
int (*match)(struct device *dev, void *data));
struct device *device_find_child_by_name(struct device *parent,
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 90d5afd52dd06..c5bbd89b31920 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -693,26 +693,22 @@ static int mock_decoder_commit(struct cxl_decoder *cxld)
return 0;
}
-static int mock_decoder_reset(struct cxl_decoder *cxld)
+static void mock_decoder_reset(struct cxl_decoder *cxld)
{
struct cxl_port *port = to_cxl_port(cxld->dev.parent);
int id = cxld->id;
if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
- return 0;
+ return;
dev_dbg(&port->dev, "%s reset\n", dev_name(&cxld->dev));
- if (port->commit_end != id) {
+ if (port->commit_end == id)
+ cxl_port_commit_reap(cxld);
+ else
dev_dbg(&port->dev,
"%s: out of order reset, expected decoder%d.%d\n",
dev_name(&cxld->dev), port->id, port->commit_end);
- return -EBUSY;
- }
-
- port->commit_end--;
cxld->flags &= ~CXL_DECODER_F_ENABLE;
-
- return 0;
}
static void default_mock_decoder(struct cxl_decoder *cxld)
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 01626a18230246efdcea322aa8f067e60ffe5ccd Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua(a)oppo.com>
Date: Fri, 27 Sep 2024 09:19:36 +1200
Subject: [PATCH] mm: avoid unconditional one-tick sleep when swapcache_prepare
fails
Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
introduced an unconditional one-tick sleep when `swapcache_prepare()`
fails, which has led to reports of UI stuttering on latency-sensitive
Android devices. To address this, we can use a waitqueue to wake up tasks
that fail `swapcache_prepare()` sooner, instead of always sleeping for a
full tick. While tasks may occasionally be woken by an unrelated
`do_swap_page()`, this method is preferable to two scenarios: rapid
re-entry into page faults, which can cause livelocks, and multiple
millisecond sleeps, which visibly degrade user experience.
Oven's testing shows that a single waitqueue resolves the UI stuttering
issue. If a 'thundering herd' problem becomes apparent later, a waitqueue
hash similar to `folio_wait_table[PAGE_WAIT_TABLE_SIZE]` for page bit
locks can be introduced.
[v-songbaohua(a)oppo.com: wake_up only when swapcache_wq waitqueue is active]
Link: https://lkml.kernel.org/r/20241008130807.40833-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240926211936.75373-1-21cnbao@gmail.com
Fixes: 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Reported-by: Oven Liyang <liyangouwen1(a)oppo.com>
Tested-by: Oven Liyang <liyangouwen1(a)oppo.com>
Cc: Kairui Song <kasong(a)tencent.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: Kalesh Singh <kaleshsingh(a)google.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 3ccee51adfbbd..bdf77a3ec47bc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL;
+ DECLARE_WAITQUEUE(wait, current);
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
@@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid
* repeated page faults.
*/
+ add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1);
+ remove_wait_queue(&swapcache_wq, &wait);
goto out_page;
}
need_clear_cache = true;
@@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
@@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 6575b268157f37929948a8d1f3bafb3d7c055bc1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Fri, 25 Oct 2024 12:32:55 -0700
Subject: [PATCH] cxl/port: Fix CXL port initialization order when the
subsystem is built-in
When the CXL subsystem is built-in the module init order is determined
by Makefile order. That order violates expectations. The expectation is
that cxl_acpi and cxl_mem can race to attach. If cxl_acpi wins the race,
cxl_mem will find the enabled CXL root ports it needs. If cxl_acpi loses
the race it will retrigger cxl_mem to attach via cxl_bus_rescan(). That
flow only works if cxl_acpi can assume ports are enabled immediately
upon cxl_acpi_probe() return. That in turn can only happen in the
CONFIG_CXL_ACPI=y case if the cxl_port driver is registered before
cxl_acpi_probe() runs.
Fix up the order to prevent initialization failures. Ensure that
cxl_port is built-in when cxl_acpi is also built-in, arrange for
Makefile order to resolve the subsys_initcall() order of cxl_port and
cxl_acpi, and arrange for Makefile order to resolve the
device_initcall() (module_init()) order of the remaining objects.
As for what contributed to this not being found earlier, the CXL
regression environment, cxl_test, builds all CXL functionality as a
module to allow to symbol mocking and other dynamic reload tests. As a
result there is no regression coverage for the built-in case.
Reported-by: Gregory Price <gourry(a)gourry.net>
Closes: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net
Tested-by: Gregory Price <gourry(a)gourry.net>
Fixes: 8dd2bc0f8e02 ("cxl/mem: Add the cxl_mem driver")
Cc: stable(a)vger.kernel.org
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron(a)huawei.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Alison Schofield <alison.schofield(a)intel.com>
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Reviewed-by: Ira Weiny <ira.weiny(a)intel.com>
Tested-by: Alejandro Lucero <alucerop(a)amd.com>
Reviewed-by: Alejandro Lucero <alucerop(a)amd.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Link: https://patch.msgid.link/172988474904.476062.7961350937442459266.stgit@dwil…
Signed-off-by: Ira Weiny <ira.weiny(a)intel.com>
---
drivers/cxl/Kconfig | 1 +
drivers/cxl/Makefile | 20 ++++++++++++++------
drivers/cxl/port.c | 17 ++++++++++++++++-
3 files changed, 31 insertions(+), 7 deletions(-)
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 29c192f20082c..876469e23f7a7 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -60,6 +60,7 @@ config CXL_ACPI
default CXL_BUS
select ACPI_TABLE_LIB
select ACPI_HMAT
+ select CXL_PORT
help
Enable support for host managed device memory (HDM) resources
published by a platform's ACPI CXL memory layout description. See
diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile
index db321f48ba52e..2caa90fa4bf25 100644
--- a/drivers/cxl/Makefile
+++ b/drivers/cxl/Makefile
@@ -1,13 +1,21 @@
# SPDX-License-Identifier: GPL-2.0
+
+# Order is important here for the built-in case:
+# - 'core' first for fundamental init
+# - 'port' before platform root drivers like 'acpi' so that CXL-root ports
+# are immediately enabled
+# - 'mem' and 'pmem' before endpoint drivers so that memdevs are
+# immediately enabled
+# - 'pci' last, also mirrors the hardware enumeration hierarchy
obj-y += core/
-obj-$(CONFIG_CXL_PCI) += cxl_pci.o
-obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_PORT) += cxl_port.o
obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o
obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o
-obj-$(CONFIG_CXL_PORT) += cxl_port.o
+obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_PCI) += cxl_pci.o
-cxl_mem-y := mem.o
-cxl_pci-y := pci.o
+cxl_port-y := port.o
cxl_acpi-y := acpi.o
cxl_pmem-y := pmem.o security.o
-cxl_port-y := port.o
+cxl_mem-y := mem.o
+cxl_pci-y := pci.o
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 861dde65768fe..9dc394295e1fc 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -208,7 +208,22 @@ static struct cxl_driver cxl_port_driver = {
},
};
-module_cxl_driver(cxl_port_driver);
+static int __init cxl_port_init(void)
+{
+ return cxl_driver_register(&cxl_port_driver);
+}
+/*
+ * Be ready to immediately enable ports emitted by the platform CXL root
+ * (e.g. cxl_acpi) when CONFIG_CXL_PORT=y.
+ */
+subsys_initcall(cxl_port_init);
+
+static void __exit cxl_port_exit(void)
+{
+ cxl_driver_unregister(&cxl_port_driver);
+}
+module_exit(cxl_port_exit);
+
MODULE_DESCRIPTION("CXL: Port enumeration and services");
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(CXL);
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 7245012f0f496162dd95d888ed2ceb5a35170f1a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg(a)intel.com>
Date: Wed, 23 Oct 2024 09:17:44 +0200
Subject: [PATCH] wifi: iwlwifi: mvm: fix 6 GHz scan construction
If more than 255 colocated APs exist for the set of all
APs found during 2.4/5 GHz scanning, then the 6 GHz scan
construction will loop forever since the loop variable
has type u8, which can never reach the number found when
that's bigger than 255, and is stored in a u32 variable.
Also move it into the loops to have a smaller scope.
Using a u32 there is fine, we limit the number of APs in
the scan list and each has a limit on the number of RNR
entries due to the frame size. With a limit of 1000 scan
results, a frame size upper bound of 4096 (really it's
more like ~2300) and a TBTT entry size of at least 11,
we get an upper bound for the number of ~372k, well in
the bounds of a u32.
Cc: stable(a)vger.kernel.org
Fixes: eae94cf82d74 ("iwlwifi: mvm: add support for 6GHz")
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219375
Link: https://patch.msgid.link/20241023091744.f4baed5c08a1.I8b417148bbc8c5d11c101…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
---
drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index 3ce9150213a74..ddcbd80a49fb2 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -1774,7 +1774,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm,
&cp->channel_config[ch_cnt];
u32 s_ssid_bitmap = 0, bssid_bitmap = 0, flags = 0;
- u8 j, k, n_s_ssids = 0, n_bssids = 0;
+ u8 k, n_s_ssids = 0, n_bssids = 0;
u8 max_s_ssids, max_bssids;
bool force_passive = false, found = false, allow_passive = true,
unsolicited_probe_on_chan = false, psc_no_listen = false;
@@ -1799,7 +1799,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm,
cfg->v5.iter_count = 1;
cfg->v5.iter_interval = 0;
- for (j = 0; j < params->n_6ghz_params; j++) {
+ for (u32 j = 0; j < params->n_6ghz_params; j++) {
s8 tmp_psd_20;
if (!(scan_6ghz_params[j].channel_idx == i))
@@ -1873,7 +1873,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm,
* SSID.
* TODO: improve this logic
*/
- for (j = 0; j < params->n_6ghz_params; j++) {
+ for (u32 j = 0; j < params->n_6ghz_params; j++) {
if (!(scan_6ghz_params[j].channel_idx == i))
continue;
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From a0f0625390858321525c2a8d04e174a546bd19b3 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Mon, 28 Oct 2024 16:23:00 +0000
Subject: [PATCH] btrfs: fix extent map merging not happening for adjacent
extents
If we have 3 or more adjacent extents in a file, that is, consecutive file
extent items pointing to adjacent extents, within a contiguous file range
and compatible flags, we end up not merging all the extents into a single
extent map.
For example:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -d -c "pwrite -b 64K 0 64K" \
-c "pwrite -b 64K 64K 64K" \
-c "pwrite -b 64K 128K 64K" \
-c "pwrite -b 64K 192K 64K" \
/mnt/sdc/foo
After all the ordered extents complete we unpin the extent maps and try
to merge them, but instead of getting a single extent map we get two
because:
1) When the first ordered extent completes (file range [0, 64K)) we
unpin its extent map and attempt to merge it with the extent map for
the range [64K, 128K), but we can't because that extent map is still
pinned;
2) When the second ordered extent completes (file range [64K, 128K)), we
unpin its extent map and merge it with the previous extent map, for
file range [0, 64K), but we can't merge with the next extent map, for
the file range [128K, 192K), because this one is still pinned.
The merged extent map for the file range [0, 128K) gets the flag
EXTENT_MAP_MERGED set;
3) When the third ordered extent completes (file range [128K, 192K)), we
unpin its extent map and attempt to merge it with the previous extent
map, for file range [0, 128K), but we can't because that extent map
has the flag EXTENT_MAP_MERGED set (mergeable_maps() returns false
due to different flags) while the extent map for the range [128K, 192K)
doesn't have that flag set.
We also can't merge it with the next extent map, for file range
[192K, 256K), because that one is still pinned.
At this moment we have 3 extent maps:
One for file range [0, 128K), with the flag EXTENT_MAP_MERGED set.
One for file range [128K, 192K).
One for file range [192K, 256K) which is still pinned;
4) When the fourth and final extent completes (file range [192K, 256K)),
we unpin its extent map and attempt to merge it with the previous
extent map, for file range [128K, 192K), which succeeds since none
of these extent maps have the EXTENT_MAP_MERGED flag set.
So we end up with 2 extent maps:
One for file range [0, 128K), with the flag EXTENT_MAP_MERGED set.
One for file range [128K, 256K), with the flag EXTENT_MAP_MERGED set.
Since after merging extent maps we don't attempt to merge again, that
is, merge the resulting extent map with the one that is now preceding
it (and the one following it), we end up with those two extent maps,
when we could have had a single extent map to represent the whole file.
Fix this by making mergeable_maps() ignore the EXTENT_MAP_MERGED flag.
While this doesn't present any functional issue, it prevents the merging
of extent maps which allows to save memory, and can make defrag not
merging extents too (that will be addressed in the next patch).
Fixes: 199257a78bb0 ("btrfs: defrag: don't use merged extent map for their generation check")
CC: stable(a)vger.kernel.org # 6.1+
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/extent_map.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 668c617444a50..1d93e1202c339 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -230,7 +230,12 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
if (extent_map_end(prev) != next->start)
return false;
- if (prev->flags != next->flags)
+ /*
+ * The merged flag is not an on-disk flag, it just indicates we had the
+ * extent maps of 2 (or more) adjacent extents merged, so factor it out.
+ */
+ if ((prev->flags & ~EXTENT_FLAG_MERGED) !=
+ (next->flags & ~EXTENT_FLAG_MERGED))
return false;
if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From aec8e6bf839101784f3ef037dcdb9432c3f32343 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
Date: Mon, 21 Oct 2024 22:02:15 +0800
Subject: [PATCH] btrfs: fix use-after-free of block device file in
__btrfs_free_extra_devids()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Mounting btrfs from two images (which have the same one fsid and two
different dev_uuids) in certain executing order may trigger an UAF for
variable 'device->bdev_file' in __btrfs_free_extra_devids(). And
following are the details:
1. Attach image_1 to loop0, attach image_2 to loop1, and scan btrfs
devices by ioctl(BTRFS_IOC_SCAN_DEV):
/ btrfs_device_1 → loop0
fs_device
\ btrfs_device_2 → loop1
2. mount /dev/loop0 /mnt
btrfs_open_devices
btrfs_device_1->bdev_file = btrfs_get_bdev_and_sb(loop0)
btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
btrfs_fill_super
open_ctree
fail: btrfs_close_devices // -ENOMEM
btrfs_close_bdev(btrfs_device_1)
fput(btrfs_device_1->bdev_file)
// btrfs_device_1->bdev_file is freed
btrfs_close_bdev(btrfs_device_2)
fput(btrfs_device_2->bdev_file)
3. mount /dev/loop1 /mnt
btrfs_open_devices
btrfs_get_bdev_and_sb(&bdev_file)
// EIO, btrfs_device_1->bdev_file is not assigned,
// which points to a freed memory area
btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
btrfs_fill_super
open_ctree
btrfs_free_extra_devids
if (btrfs_device_1->bdev_file)
fput(btrfs_device_1->bdev_file) // UAF !
Fix it by setting 'device->bdev_file' as 'NULL' after closing the
btrfs_device in btrfs_close_one_device().
Fixes: 142388194191 ("btrfs: do not background blkdev_put()")
CC: stable(a)vger.kernel.org # 4.19+
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219408
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/volumes.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8f340ad1d9384..eb51b609190fb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1105,6 +1105,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
+ device->bdev_file = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From d949d1d14fa281ace388b1de978e8f2cd52875cf Mon Sep 17 00:00:00 2001
From: Jeongjun Park <aha310510(a)gmail.com>
Date: Mon, 9 Sep 2024 21:35:58 +0900
Subject: [PATCH] mm: shmem: fix data-race in shmem_getattr()
I got the following KCSAN report during syzbot testing:
==================================================================
BUG: KCSAN: data-race in generic_fillattr / inode_set_ctime_current
write to 0xffff888102eb3260 of 4 bytes by task 6565 on cpu 1:
inode_set_ctime_to_ts include/linux/fs.h:1638 [inline]
inode_set_ctime_current+0x169/0x1d0 fs/inode.c:2626
shmem_mknod+0x117/0x180 mm/shmem.c:3443
shmem_create+0x34/0x40 mm/shmem.c:3497
lookup_open fs/namei.c:3578 [inline]
open_last_lookups fs/namei.c:3647 [inline]
path_openat+0xdbc/0x1f00 fs/namei.c:3883
do_filp_open+0xf7/0x200 fs/namei.c:3913
do_sys_openat2+0xab/0x120 fs/open.c:1416
do_sys_open fs/open.c:1431 [inline]
__do_sys_openat fs/open.c:1447 [inline]
__se_sys_openat fs/open.c:1442 [inline]
__x64_sys_openat+0xf3/0x120 fs/open.c:1442
x64_sys_call+0x1025/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:258
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x76/0x7e
read to 0xffff888102eb3260 of 4 bytes by task 3498 on cpu 0:
inode_get_ctime_nsec include/linux/fs.h:1623 [inline]
inode_get_ctime include/linux/fs.h:1629 [inline]
generic_fillattr+0x1dd/0x2f0 fs/stat.c:62
shmem_getattr+0x17b/0x200 mm/shmem.c:1157
vfs_getattr_nosec fs/stat.c:166 [inline]
vfs_getattr+0x19b/0x1e0 fs/stat.c:207
vfs_statx_path fs/stat.c:251 [inline]
vfs_statx+0x134/0x2f0 fs/stat.c:315
vfs_fstatat+0xec/0x110 fs/stat.c:341
__do_sys_newfstatat fs/stat.c:505 [inline]
__se_sys_newfstatat+0x58/0x260 fs/stat.c:499
__x64_sys_newfstatat+0x55/0x70 fs/stat.c:499
x64_sys_call+0x141f/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:263
do_syscall_x64 arch/x86/entry/common.c:52 [inline]
do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83
entry_SYSCALL_64_after_hwframe+0x76/0x7e
value changed: 0x2755ae53 -> 0x27ee44d3
Reported by Kernel Concurrency Sanitizer on:
CPU: 0 UID: 0 PID: 3498 Comm: udevd Not tainted 6.11.0-rc6-syzkaller-00326-gd1f2d51b711a-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024
==================================================================
When calling generic_fillattr(), if you don't hold read lock, data-race
will occur in inode member variables, which can cause unexpected
behavior.
Since there is no special protection when shmem_getattr() calls
generic_fillattr(), data-race occurs by functions such as shmem_unlink()
or shmem_mknod(). This can cause unexpected results, so commenting it out
is not enough.
Therefore, when calling generic_fillattr() from shmem_getattr(), it is
appropriate to protect the inode using inode_lock_shared() and
inode_unlock_shared() to prevent data-race.
Link: https://lkml.kernel.org/r/20240909123558.70229-1-aha310510@gmail.com
Fixes: 44a30220bc0a ("shmem: recalculate file inode when fstat")
Signed-off-by: Jeongjun Park <aha310510(a)gmail.com>
Reported-by: syzbot <syzkaller(a)googlegroup.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/shmem.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mm/shmem.c b/mm/shmem.c
index c5adb987b23cf..4ba1d00fabdaa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap,
stat->attributes_mask |= (STATX_ATTR_APPEND |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
+ inode_lock_shared(inode);
generic_fillattr(idmap, request_mask, inode, stat);
+ inode_unlock_shared(inode);
if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
--
2.43.0
The patch below does not apply to the v6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From ddd6d8e975b171ea3f63a011a75820883ff0d479 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao(a)google.com>
Date: Sat, 19 Oct 2024 01:29:38 +0000
Subject: [PATCH] mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL
stats
Patch series "mm: multi-gen LRU: Have secondary MMUs participate in
MM_WALK".
Today, the MM_WALK capability causes MGLRU to clear the young bit from
PMDs and PTEs during the page table walk before eviction, but MGLRU does
not call the clear_young() MMU notifier in this case. By not calling this
notifier, the MM walk takes less time/CPU, but it causes pages that are
accessed mostly through KVM / secondary MMUs to appear younger than they
should be.
We do call the clear_young() notifier today, but only when attempting to
evict the page, so we end up clearing young/accessed information less
frequently for secondary MMUs than for mm PTEs, and therefore they appear
younger and are less likely to be evicted. Therefore, memory that is
*not* being accessed mostly by KVM will be evicted *more* frequently,
worsening performance.
ChromeOS observed a tab-open latency regression when enabling MGLRU with a
setup that involved running a VM:
Tab-open latency histogram (ms)
Version p50 mean p95 p99 max
base 1315 1198 2347 3454 10319
mglru 2559 1311 7399 12060 43758
fix 1119 926 2470 4211 6947
This series replaces the final non-selftest patchs from this series[1],
which introduced a similar change (and a new MMU notifier) with KVM
optimizations. I'll send a separate series (to Sean and Paolo) for the
KVM optimizations.
This series also makes proactive reclaim with MGLRU possible for KVM
memory. I have verified that this functions correctly with the selftest
from [1], but given that that test is a KVM selftest, I'll send it with
the rest of the KVM optimizations later. Andrew, let me know if you'd
like to take the test now anyway.
[1]: https://lore.kernel.org/linux-mm/20240926013506.860253-18-jthoughton@google…
This patch (of 2):
The removed stats, MM_LEAF_OLD and MM_NONLEAF_TOTAL, are not very helpful
and become more complicated to properly compute when adding
test/clear_young() notifiers in MGLRU's mm walk.
Link: https://lkml.kernel.org/r/20241019012940.3656292-1-jthoughton@google.com
Link: https://lkml.kernel.org/r/20241019012940.3656292-2-jthoughton@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: James Houghton <jthoughton(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: David Matlack <dmatlack(a)google.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: David Stevens <stevensd(a)google.com>
Cc: Oliver Upton <oliver.upton(a)linux.dev>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Sean Christopherson <seanjc(a)google.com>
Cc: Wei Xu <weixugc(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mmzone.h | 2 --
mm/vmscan.c | 14 +++++---------
2 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 17506e4a28355..9342e5692dab6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,9 +458,7 @@ struct lru_gen_folio {
enum {
MM_LEAF_TOTAL, /* total leaf entries */
- MM_LEAF_OLD, /* old leaf entries */
MM_LEAF_YOUNG, /* young leaf entries */
- MM_NONLEAF_TOTAL, /* total non-leaf entries */
MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
NR_MM_STATS
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb4e8440c5071..4f1d33e4b3601 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3399,7 +3399,6 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
continue;
if (!pte_young(ptent)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3552,7 +3551,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
walk->mm_stats[MM_LEAF_TOTAL]++;
if (!pmd_young(val)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3564,8 +3562,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
continue;
}
- walk->mm_stats[MM_NONLEAF_TOTAL]++;
-
if (!walk->force_scan && should_clear_pmd_young()) {
if (!pmd_young(val))
continue;
@@ -5254,11 +5250,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
for (type = 0; type < ANON_AND_FILE; type++) {
- const char *s = " ";
+ const char *s = "xxx";
unsigned long n[3] = {};
if (seq == max_seq) {
- s = "RT ";
+ s = "RTx";
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
@@ -5280,14 +5276,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
- const char *s = " ";
+ const char *s = "xxxx";
unsigned long n = 0;
if (seq == max_seq && NR_HIST_GENS == 1) {
- s = "LOYNFA";
+ s = "TYFA";
n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
- s = "loynfa";
+ s = "tyfa";
n = READ_ONCE(mm_state->stats[hist][i]);
}
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 1b6063a57754eae5705753c01e78dc268b989038 Mon Sep 17 00:00:00 2001
From: Ovidiu Bunea <Ovidiu.Bunea(a)amd.com>
Date: Fri, 11 Oct 2024 11:12:19 -0400
Subject: [PATCH] Revert "drm/amd/display: update DML2 policy
EnhancedPrefetchScheduleAccelerationFinal DCN35"
This reverts
commit 9dad21f910fc ("drm/amd/display: update DML2 policy EnhancedPrefetchScheduleAccelerationFinal DCN35")
[why & how]
The offending commit exposes a hang with lid close/open behavior.
Both issues seem to be related to ODM 2:1 mode switching, so there
is another issue generic to that sequence that needs to be
investigated.
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Reviewed-by: Nicholas Kazlauskas <nicholas.kazlauskas(a)amd.com>
Signed-off-by: Ovidiu Bunea <Ovidiu.Bunea(a)amd.com>
Signed-off-by: Tom Chung <chiahsuan.chung(a)amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 68bf95317ebf2cfa7105251e4279e951daceefb7)
Cc: stable(a)vger.kernel.org
---
drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c
index 11c904ae29586..c4c52173ef224 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_policy.c
@@ -303,6 +303,7 @@ void build_unoptimized_policy_settings(enum dml_project_id project, struct dml_m
if (project == dml_project_dcn35 ||
project == dml_project_dcn351) {
policy->DCCProgrammingAssumesScanDirectionUnknownFinal = false;
+ policy->EnhancedPrefetchScheduleAccelerationFinal = 0;
policy->AllowForPStateChangeOrStutterInVBlankFinal = dml_prefetch_support_uclk_fclk_and_stutter_if_possible; /*new*/
policy->UseOnlyMaxPrefetchModes = 1;
}
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 3e8b7238b427e05498034c240451af5f5495afda Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro(a)kernel.org>
Date: Mon, 28 Oct 2024 13:49:58 +0100
Subject: [PATCH] gpiolib: fix debugfs newline separators
The gpiolib debugfs interface exports a list of all gpio chips in a
system and the state of their pins.
The gpio chip sections are supposed to be separated by a newline
character, but a long-standing bug prevents the separator from
being included when output is generated in multiple sessions, making the
output inconsistent and hard to read.
Make sure to only suppress the newline separator at the beginning of the
file as intended.
Fixes: f9c4a31f6150 ("gpiolib: Use seq_file's iterator interface")
Cc: stable(a)vger.kernel.org # 3.7
Cc: Thierry Reding <treding(a)nvidia.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
Link: https://lore.kernel.org/r/20241028125000.24051-2-johan+linaro@kernel.org
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
---
drivers/gpio/gpiolib.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index d5952ab7752c2..e27488a90bc97 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -4926,6 +4926,8 @@ static void *gpiolib_seq_start(struct seq_file *s, loff_t *pos)
return NULL;
s->private = priv;
+ if (*pos > 0)
+ priv->newline = true;
priv->idx = srcu_read_lock(&gpio_devices_srcu);
list_for_each_entry_srcu(gdev, &gpio_devices, list,
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 4aa923a6e6406b43566ef6ac35a3d9a3197fa3e8 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com>
Date: Fri, 25 Oct 2024 15:56:39 +0100
Subject: [PATCH] drm/amd/pm: Vangogh: Fix kernel memory out of bounds write
KASAN reports that the GPU metrics table allocated in
vangogh_tables_init() is not large enough for the memset done in
smu_cmn_init_soft_gpu_metrics(). Condensed report follows:
[ 33.861314] BUG: KASAN: slab-out-of-bounds in smu_cmn_init_soft_gpu_metrics+0x73/0x200 [amdgpu]
[ 33.861799] Write of size 168 at addr ffff888129f59500 by task mangoapp/1067
...
[ 33.861808] CPU: 6 UID: 1000 PID: 1067 Comm: mangoapp Tainted: G W 6.12.0-rc4 #356 1a56f59a8b5182eeaf67eb7cb8b13594dd23b544
[ 33.861816] Tainted: [W]=WARN
[ 33.861818] Hardware name: Valve Galileo/Galileo, BIOS F7G0107 12/01/2023
[ 33.861822] Call Trace:
[ 33.861826] <TASK>
[ 33.861829] dump_stack_lvl+0x66/0x90
[ 33.861838] print_report+0xce/0x620
[ 33.861853] kasan_report+0xda/0x110
[ 33.862794] kasan_check_range+0xfd/0x1a0
[ 33.862799] __asan_memset+0x23/0x40
[ 33.862803] smu_cmn_init_soft_gpu_metrics+0x73/0x200 [amdgpu 13b1bc364ec578808f676eba412c20eaab792779]
[ 33.863306] vangogh_get_gpu_metrics_v2_4+0x123/0xad0 [amdgpu 13b1bc364ec578808f676eba412c20eaab792779]
[ 33.864257] vangogh_common_get_gpu_metrics+0xb0c/0xbc0 [amdgpu 13b1bc364ec578808f676eba412c20eaab792779]
[ 33.865682] amdgpu_dpm_get_gpu_metrics+0xcc/0x110 [amdgpu 13b1bc364ec578808f676eba412c20eaab792779]
[ 33.866160] amdgpu_get_gpu_metrics+0x154/0x2d0 [amdgpu 13b1bc364ec578808f676eba412c20eaab792779]
[ 33.867135] dev_attr_show+0x43/0xc0
[ 33.867147] sysfs_kf_seq_show+0x1f1/0x3b0
[ 33.867155] seq_read_iter+0x3f8/0x1140
[ 33.867173] vfs_read+0x76c/0xc50
[ 33.867198] ksys_read+0xfb/0x1d0
[ 33.867214] do_syscall_64+0x90/0x160
...
[ 33.867353] Allocated by task 378 on cpu 7 at 22.794876s:
[ 33.867358] kasan_save_stack+0x33/0x50
[ 33.867364] kasan_save_track+0x17/0x60
[ 33.867367] __kasan_kmalloc+0x87/0x90
[ 33.867371] vangogh_init_smc_tables+0x3f9/0x840 [amdgpu]
[ 33.867835] smu_sw_init+0xa32/0x1850 [amdgpu]
[ 33.868299] amdgpu_device_init+0x467b/0x8d90 [amdgpu]
[ 33.868733] amdgpu_driver_load_kms+0x19/0xf0 [amdgpu]
[ 33.869167] amdgpu_pci_probe+0x2d6/0xcd0 [amdgpu]
[ 33.869608] local_pci_probe+0xda/0x180
[ 33.869614] pci_device_probe+0x43f/0x6b0
Empirically we can confirm that the former allocates 152 bytes for the
table, while the latter memsets the 168 large block.
Root cause appears that when GPU metrics tables for v2_4 parts were added
it was not considered to enlarge the table to fit.
The fix in this patch is rather "brute force" and perhaps later should be
done in a smarter way, by extracting and consolidating the part version to
size logic to a common helper, instead of brute forcing the largest
possible allocation. Nevertheless, for now this works and fixes the out of
bounds write.
v2:
* Drop impossible v3_0 case. (Mario)
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin(a)igalia.com>
Fixes: 41cec40bc9ba ("drm/amd/pm: Vangogh: Add new gpu_metrics_v2_4 to acquire gpu_metrics")
Cc: Mario Limonciello <mario.limonciello(a)amd.com>
Cc: Evan Quan <evan.quan(a)amd.com>
Cc: Wenyou Yang <WenYou.Yang(a)amd.com>
Cc: Alex Deucher <alexander.deucher(a)amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello(a)amd.com>
Link: https://lore.kernel.org/r/20241025145639.19124-1-tursulin@igalia.com
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
(cherry picked from commit 0880f58f9609f0200483a49429af0f050d281703)
Cc: stable(a)vger.kernel.org # v6.6+
---
drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
index 22737b11b1bfb..1fe020f1f4dbe 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c
@@ -242,7 +242,9 @@ static int vangogh_tables_init(struct smu_context *smu)
goto err0_out;
smu_table->metrics_time = 0;
- smu_table->gpu_metrics_table_size = max(sizeof(struct gpu_metrics_v2_3), sizeof(struct gpu_metrics_v2_2));
+ smu_table->gpu_metrics_table_size = sizeof(struct gpu_metrics_v2_2);
+ smu_table->gpu_metrics_table_size = max(smu_table->gpu_metrics_table_size, sizeof(struct gpu_metrics_v2_3));
+ smu_table->gpu_metrics_table_size = max(smu_table->gpu_metrics_table_size, sizeof(struct gpu_metrics_v2_4));
smu_table->gpu_metrics_table = kzalloc(smu_table->gpu_metrics_table_size, GFP_KERNEL);
if (!smu_table->gpu_metrics_table)
goto err1_out;
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 9d08ec41a0645283d79a2e642205d488feaceacf Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao(a)google.com>
Date: Sat, 19 Oct 2024 22:22:12 -0600
Subject: [PATCH] mm: allow set/clear page_type again
Some page flags (page->flags) were converted to page types
(page->page_types). A recent example is PG_hugetlb.
From the exclusive writer's perspective, e.g., a thread doing
__folio_set_hugetlb(), there is a difference between the page flag and
type APIs: the former allows the same non-atomic operation to be repeated
whereas the latter does not. For example, calling __folio_set_hugetlb()
twice triggers VM_BUG_ON_FOLIO(), since the second call expects the type
(PG_hugetlb) not to be set previously.
Using add_hugetlb_folio() as an example, it calls __folio_set_hugetlb() in
the following error-handling path. And when that happens, it triggers the
aforementioned VM_BUG_ON_FOLIO().
if (folio_test_hugetlb(folio)) {
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
add_hugetlb_folio(h, folio, false);
...
It is possible to make hugeTLB comply with the new requirements from the
page type API. However, a straightforward fix would be to just allow the
same page type to be set or cleared again inside the API, to avoid any
changes to its callers.
Link: https://lkml.kernel.org/r/20241020042212.296781-1-yuzhao@google.com
Fixes: d99e3140a4d3 ("mm: turn folio_test_hugetlb into a PageType")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/page-flags.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1b3a767104878..cc839e4365c18 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -975,12 +975,16 @@ static __always_inline bool folio_test_##fname(const struct folio *folio) \
} \
static __always_inline void __folio_set_##fname(struct folio *folio) \
{ \
+ if (folio_test_##fname(folio)) \
+ return; \
VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \
folio); \
folio->page.page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __folio_clear_##fname(struct folio *folio) \
{ \
+ if (folio->page.page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
folio->page.page_type = UINT_MAX; \
}
@@ -993,11 +997,15 @@ static __always_inline int Page##uname(const struct page *page) \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
+ if (Page##uname(page)) \
+ return; \
VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \
page->page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
+ if (page->page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type = UINT_MAX; \
}
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 77b0d113eec49a7390ff1a08ca1923e89f5f86c6 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Tue, 29 Oct 2024 15:18:45 +0000
Subject: [PATCH] btrfs: fix defrag not merging contiguous extents due to
merged extent maps
When running defrag (manual defrag) against a file that has extents that
are contiguous and we already have the respective extent maps loaded and
merged, we end up not defragging the range covered by those contiguous
extents. This happens when we have an extent map that was the result of
merging multiple extent maps for contiguous extents and the length of the
merged extent map is greater than or equals to the defrag threshold
length.
The script below reproduces this scenario:
$ cat test.sh
#!/bin/bash
DEV=/dev/sdi
MNT=/mnt/sdi
mkfs.btrfs -f $DEV
mount $DEV $MNT
# Create a 256K file with 4 extents of 64K each.
xfs_io -f -c "falloc 0 64K" \
-c "pwrite 0 64K" \
-c "falloc 64K 64K" \
-c "pwrite 64K 64K" \
-c "falloc 128K 64K" \
-c "pwrite 128K 64K" \
-c "falloc 192K 64K" \
-c "pwrite 192K 64K" \
$MNT/foo
umount $MNT
echo -n "Initial number of file extent items: "
btrfs inspect-internal dump-tree -t 5 $DEV | grep EXTENT_DATA | wc -l
mount $DEV $MNT
# Read the whole file in order to load and merge extent maps.
cat $MNT/foo > /dev/null
btrfs filesystem defragment -t 128K $MNT/foo
umount $MNT
echo -n "Number of file extent items after defrag with 128K threshold: "
btrfs inspect-internal dump-tree -t 5 $DEV | grep EXTENT_DATA | wc -l
mount $DEV $MNT
# Read the whole file in order to load and merge extent maps.
cat $MNT/foo > /dev/null
btrfs filesystem defragment -t 256K $MNT/foo
umount $MNT
echo -n "Number of file extent items after defrag with 256K threshold: "
btrfs inspect-internal dump-tree -t 5 $DEV | grep EXTENT_DATA | wc -l
Running it:
$ ./test.sh
Initial number of file extent items: 4
Number of file extent items after defrag with 128K threshold: 4
Number of file extent items after defrag with 256K threshold: 4
The 4 extents don't get merged because we have an extent map with a size
of 256K that is the result of merging the individual extent maps for each
of the four 64K extents and at defrag_lookup_extent() we have a value of
zero for the generation threshold ('newer_than' argument) since this is a
manual defrag. As a consequence we don't call defrag_get_extent() to get
an extent map representing a single file extent item in the inode's
subvolume tree, so we end up using the merged extent map at
defrag_collect_targets() and decide not to defrag.
Fix this by updating defrag_lookup_extent() to always discard extent maps
that were merged and call defrag_get_extent() regardless of the minimum
generation threshold ('newer_than' argument).
A test case for fstests will be sent along soon.
CC: stable(a)vger.kernel.org # 6.1+
Fixes: 199257a78bb0 ("btrfs: defrag: don't use merged extent map for their generation check")
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/defrag.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index b95ef44c326bd..968dae9539482 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -763,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* We can get a merged extent, in that case, we need to re-search
* tree to get the original em for defrag.
*
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
+ * This is because even if we have adjacent extents that are contiguous
+ * and compatible (same type and flags), we still want to defrag them
+ * so that we use less metadata (extent items in the extent tree and
+ * file extent items in the inode's subvolume tree).
*/
- if (em && (em->flags & EXTENT_FLAG_MERGED) &&
- newer_than && em->generation >= newer_than) {
+ if (em && (em->flags & EXTENT_FLAG_MERGED)) {
free_extent_map(em);
em = NULL;
}
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 01626a18230246efdcea322aa8f067e60ffe5ccd Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua(a)oppo.com>
Date: Fri, 27 Sep 2024 09:19:36 +1200
Subject: [PATCH] mm: avoid unconditional one-tick sleep when swapcache_prepare
fails
Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
introduced an unconditional one-tick sleep when `swapcache_prepare()`
fails, which has led to reports of UI stuttering on latency-sensitive
Android devices. To address this, we can use a waitqueue to wake up tasks
that fail `swapcache_prepare()` sooner, instead of always sleeping for a
full tick. While tasks may occasionally be woken by an unrelated
`do_swap_page()`, this method is preferable to two scenarios: rapid
re-entry into page faults, which can cause livelocks, and multiple
millisecond sleeps, which visibly degrade user experience.
Oven's testing shows that a single waitqueue resolves the UI stuttering
issue. If a 'thundering herd' problem becomes apparent later, a waitqueue
hash similar to `folio_wait_table[PAGE_WAIT_TABLE_SIZE]` for page bit
locks can be introduced.
[v-songbaohua(a)oppo.com: wake_up only when swapcache_wq waitqueue is active]
Link: https://lkml.kernel.org/r/20241008130807.40833-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240926211936.75373-1-21cnbao@gmail.com
Fixes: 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Reported-by: Oven Liyang <liyangouwen1(a)oppo.com>
Tested-by: Oven Liyang <liyangouwen1(a)oppo.com>
Cc: Kairui Song <kasong(a)tencent.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: Kalesh Singh <kaleshsingh(a)google.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 3ccee51adfbbd..bdf77a3ec47bc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL;
+ DECLARE_WAITQUEUE(wait, current);
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
@@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid
* repeated page faults.
*/
+ add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1);
+ remove_wait_queue(&swapcache_wq, &wait);
goto out_page;
}
need_clear_cache = true;
@@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
@@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 6575b268157f37929948a8d1f3bafb3d7c055bc1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Fri, 25 Oct 2024 12:32:55 -0700
Subject: [PATCH] cxl/port: Fix CXL port initialization order when the
subsystem is built-in
When the CXL subsystem is built-in the module init order is determined
by Makefile order. That order violates expectations. The expectation is
that cxl_acpi and cxl_mem can race to attach. If cxl_acpi wins the race,
cxl_mem will find the enabled CXL root ports it needs. If cxl_acpi loses
the race it will retrigger cxl_mem to attach via cxl_bus_rescan(). That
flow only works if cxl_acpi can assume ports are enabled immediately
upon cxl_acpi_probe() return. That in turn can only happen in the
CONFIG_CXL_ACPI=y case if the cxl_port driver is registered before
cxl_acpi_probe() runs.
Fix up the order to prevent initialization failures. Ensure that
cxl_port is built-in when cxl_acpi is also built-in, arrange for
Makefile order to resolve the subsys_initcall() order of cxl_port and
cxl_acpi, and arrange for Makefile order to resolve the
device_initcall() (module_init()) order of the remaining objects.
As for what contributed to this not being found earlier, the CXL
regression environment, cxl_test, builds all CXL functionality as a
module to allow to symbol mocking and other dynamic reload tests. As a
result there is no regression coverage for the built-in case.
Reported-by: Gregory Price <gourry(a)gourry.net>
Closes: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net
Tested-by: Gregory Price <gourry(a)gourry.net>
Fixes: 8dd2bc0f8e02 ("cxl/mem: Add the cxl_mem driver")
Cc: stable(a)vger.kernel.org
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron(a)huawei.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Alison Schofield <alison.schofield(a)intel.com>
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Reviewed-by: Ira Weiny <ira.weiny(a)intel.com>
Tested-by: Alejandro Lucero <alucerop(a)amd.com>
Reviewed-by: Alejandro Lucero <alucerop(a)amd.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Link: https://patch.msgid.link/172988474904.476062.7961350937442459266.stgit@dwil…
Signed-off-by: Ira Weiny <ira.weiny(a)intel.com>
---
drivers/cxl/Kconfig | 1 +
drivers/cxl/Makefile | 20 ++++++++++++++------
drivers/cxl/port.c | 17 ++++++++++++++++-
3 files changed, 31 insertions(+), 7 deletions(-)
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 29c192f20082c..876469e23f7a7 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -60,6 +60,7 @@ config CXL_ACPI
default CXL_BUS
select ACPI_TABLE_LIB
select ACPI_HMAT
+ select CXL_PORT
help
Enable support for host managed device memory (HDM) resources
published by a platform's ACPI CXL memory layout description. See
diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile
index db321f48ba52e..2caa90fa4bf25 100644
--- a/drivers/cxl/Makefile
+++ b/drivers/cxl/Makefile
@@ -1,13 +1,21 @@
# SPDX-License-Identifier: GPL-2.0
+
+# Order is important here for the built-in case:
+# - 'core' first for fundamental init
+# - 'port' before platform root drivers like 'acpi' so that CXL-root ports
+# are immediately enabled
+# - 'mem' and 'pmem' before endpoint drivers so that memdevs are
+# immediately enabled
+# - 'pci' last, also mirrors the hardware enumeration hierarchy
obj-y += core/
-obj-$(CONFIG_CXL_PCI) += cxl_pci.o
-obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_PORT) += cxl_port.o
obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o
obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o
-obj-$(CONFIG_CXL_PORT) += cxl_port.o
+obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_PCI) += cxl_pci.o
-cxl_mem-y := mem.o
-cxl_pci-y := pci.o
+cxl_port-y := port.o
cxl_acpi-y := acpi.o
cxl_pmem-y := pmem.o security.o
-cxl_port-y := port.o
+cxl_mem-y := mem.o
+cxl_pci-y := pci.o
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 861dde65768fe..9dc394295e1fc 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -208,7 +208,22 @@ static struct cxl_driver cxl_port_driver = {
},
};
-module_cxl_driver(cxl_port_driver);
+static int __init cxl_port_init(void)
+{
+ return cxl_driver_register(&cxl_port_driver);
+}
+/*
+ * Be ready to immediately enable ports emitted by the platform CXL root
+ * (e.g. cxl_acpi) when CONFIG_CXL_PORT=y.
+ */
+subsys_initcall(cxl_port_init);
+
+static void __exit cxl_port_exit(void)
+{
+ cxl_driver_unregister(&cxl_port_driver);
+}
+module_exit(cxl_port_exit);
+
MODULE_DESCRIPTION("CXL: Port enumeration and services");
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(CXL);
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 7245012f0f496162dd95d888ed2ceb5a35170f1a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg(a)intel.com>
Date: Wed, 23 Oct 2024 09:17:44 +0200
Subject: [PATCH] wifi: iwlwifi: mvm: fix 6 GHz scan construction
If more than 255 colocated APs exist for the set of all
APs found during 2.4/5 GHz scanning, then the 6 GHz scan
construction will loop forever since the loop variable
has type u8, which can never reach the number found when
that's bigger than 255, and is stored in a u32 variable.
Also move it into the loops to have a smaller scope.
Using a u32 there is fine, we limit the number of APs in
the scan list and each has a limit on the number of RNR
entries due to the frame size. With a limit of 1000 scan
results, a frame size upper bound of 4096 (really it's
more like ~2300) and a TBTT entry size of at least 11,
we get an upper bound for the number of ~372k, well in
the bounds of a u32.
Cc: stable(a)vger.kernel.org
Fixes: eae94cf82d74 ("iwlwifi: mvm: add support for 6GHz")
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219375
Link: https://patch.msgid.link/20241023091744.f4baed5c08a1.I8b417148bbc8c5d11c101…
Signed-off-by: Johannes Berg <johannes.berg(a)intel.com>
---
drivers/net/wireless/intel/iwlwifi/mvm/scan.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
index 3ce9150213a74..ddcbd80a49fb2 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c
@@ -1774,7 +1774,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm,
&cp->channel_config[ch_cnt];
u32 s_ssid_bitmap = 0, bssid_bitmap = 0, flags = 0;
- u8 j, k, n_s_ssids = 0, n_bssids = 0;
+ u8 k, n_s_ssids = 0, n_bssids = 0;
u8 max_s_ssids, max_bssids;
bool force_passive = false, found = false, allow_passive = true,
unsolicited_probe_on_chan = false, psc_no_listen = false;
@@ -1799,7 +1799,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm,
cfg->v5.iter_count = 1;
cfg->v5.iter_interval = 0;
- for (j = 0; j < params->n_6ghz_params; j++) {
+ for (u32 j = 0; j < params->n_6ghz_params; j++) {
s8 tmp_psd_20;
if (!(scan_6ghz_params[j].channel_idx == i))
@@ -1873,7 +1873,7 @@ iwl_mvm_umac_scan_cfg_channels_v7_6g(struct iwl_mvm *mvm,
* SSID.
* TODO: improve this logic
*/
- for (j = 0; j < params->n_6ghz_params; j++) {
+ for (u32 j = 0; j < params->n_6ghz_params; j++) {
if (!(scan_6ghz_params[j].channel_idx == i))
continue;
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From a0f0625390858321525c2a8d04e174a546bd19b3 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana(a)suse.com>
Date: Mon, 28 Oct 2024 16:23:00 +0000
Subject: [PATCH] btrfs: fix extent map merging not happening for adjacent
extents
If we have 3 or more adjacent extents in a file, that is, consecutive file
extent items pointing to adjacent extents, within a contiguous file range
and compatible flags, we end up not merging all the extents into a single
extent map.
For example:
$ mkfs.btrfs -f /dev/sdc
$ mount /dev/sdc /mnt/sdc
$ xfs_io -f -d -c "pwrite -b 64K 0 64K" \
-c "pwrite -b 64K 64K 64K" \
-c "pwrite -b 64K 128K 64K" \
-c "pwrite -b 64K 192K 64K" \
/mnt/sdc/foo
After all the ordered extents complete we unpin the extent maps and try
to merge them, but instead of getting a single extent map we get two
because:
1) When the first ordered extent completes (file range [0, 64K)) we
unpin its extent map and attempt to merge it with the extent map for
the range [64K, 128K), but we can't because that extent map is still
pinned;
2) When the second ordered extent completes (file range [64K, 128K)), we
unpin its extent map and merge it with the previous extent map, for
file range [0, 64K), but we can't merge with the next extent map, for
the file range [128K, 192K), because this one is still pinned.
The merged extent map for the file range [0, 128K) gets the flag
EXTENT_MAP_MERGED set;
3) When the third ordered extent completes (file range [128K, 192K)), we
unpin its extent map and attempt to merge it with the previous extent
map, for file range [0, 128K), but we can't because that extent map
has the flag EXTENT_MAP_MERGED set (mergeable_maps() returns false
due to different flags) while the extent map for the range [128K, 192K)
doesn't have that flag set.
We also can't merge it with the next extent map, for file range
[192K, 256K), because that one is still pinned.
At this moment we have 3 extent maps:
One for file range [0, 128K), with the flag EXTENT_MAP_MERGED set.
One for file range [128K, 192K).
One for file range [192K, 256K) which is still pinned;
4) When the fourth and final extent completes (file range [192K, 256K)),
we unpin its extent map and attempt to merge it with the previous
extent map, for file range [128K, 192K), which succeeds since none
of these extent maps have the EXTENT_MAP_MERGED flag set.
So we end up with 2 extent maps:
One for file range [0, 128K), with the flag EXTENT_MAP_MERGED set.
One for file range [128K, 256K), with the flag EXTENT_MAP_MERGED set.
Since after merging extent maps we don't attempt to merge again, that
is, merge the resulting extent map with the one that is now preceding
it (and the one following it), we end up with those two extent maps,
when we could have had a single extent map to represent the whole file.
Fix this by making mergeable_maps() ignore the EXTENT_MAP_MERGED flag.
While this doesn't present any functional issue, it prevents the merging
of extent maps which allows to save memory, and can make defrag not
merging extents too (that will be addressed in the next patch).
Fixes: 199257a78bb0 ("btrfs: defrag: don't use merged extent map for their generation check")
CC: stable(a)vger.kernel.org # 6.1+
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Signed-off-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/extent_map.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 668c617444a50..1d93e1202c339 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -230,7 +230,12 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
if (extent_map_end(prev) != next->start)
return false;
- if (prev->flags != next->flags)
+ /*
+ * The merged flag is not an on-disk flag, it just indicates we had the
+ * extent maps of 2 (or more) adjacent extents merged, so factor it out.
+ */
+ if ((prev->flags & ~EXTENT_FLAG_MERGED) !=
+ (next->flags & ~EXTENT_FLAG_MERGED))
return false;
if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From aec8e6bf839101784f3ef037dcdb9432c3f32343 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
Date: Mon, 21 Oct 2024 22:02:15 +0800
Subject: [PATCH] btrfs: fix use-after-free of block device file in
__btrfs_free_extra_devids()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Mounting btrfs from two images (which have the same one fsid and two
different dev_uuids) in certain executing order may trigger an UAF for
variable 'device->bdev_file' in __btrfs_free_extra_devids(). And
following are the details:
1. Attach image_1 to loop0, attach image_2 to loop1, and scan btrfs
devices by ioctl(BTRFS_IOC_SCAN_DEV):
/ btrfs_device_1 → loop0
fs_device
\ btrfs_device_2 → loop1
2. mount /dev/loop0 /mnt
btrfs_open_devices
btrfs_device_1->bdev_file = btrfs_get_bdev_and_sb(loop0)
btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
btrfs_fill_super
open_ctree
fail: btrfs_close_devices // -ENOMEM
btrfs_close_bdev(btrfs_device_1)
fput(btrfs_device_1->bdev_file)
// btrfs_device_1->bdev_file is freed
btrfs_close_bdev(btrfs_device_2)
fput(btrfs_device_2->bdev_file)
3. mount /dev/loop1 /mnt
btrfs_open_devices
btrfs_get_bdev_and_sb(&bdev_file)
// EIO, btrfs_device_1->bdev_file is not assigned,
// which points to a freed memory area
btrfs_device_2->bdev_file = btrfs_get_bdev_and_sb(loop1)
btrfs_fill_super
open_ctree
btrfs_free_extra_devids
if (btrfs_device_1->bdev_file)
fput(btrfs_device_1->bdev_file) // UAF !
Fix it by setting 'device->bdev_file' as 'NULL' after closing the
btrfs_device in btrfs_close_one_device().
Fixes: 142388194191 ("btrfs: do not background blkdev_put()")
CC: stable(a)vger.kernel.org # 4.19+
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219408
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/volumes.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8f340ad1d9384..eb51b609190fb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1105,6 +1105,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
+ device->bdev_file = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From d48e1dea3931de64c26717adc2b89743c7ab6594 Mon Sep 17 00:00:00 2001
From: Naohiro Aota <naohiro.aota(a)wdc.com>
Date: Wed, 9 Oct 2024 22:52:06 +0900
Subject: [PATCH] btrfs: fix error propagation of split bios
The purpose of btrfs_bbio_propagate_error() shall be propagating an error
of split bio to its original btrfs_bio, and tell the error to the upper
layer. However, it's not working well on some cases.
* Case 1. Immediate (or quick) end_bio with an error
When btrfs sends btrfs_bio to mirrored devices, btrfs calls
btrfs_bio_end_io() when all the mirroring bios are completed. If that
btrfs_bio was split, it is from btrfs_clone_bioset and its end_io function
is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error()
accesses the orig_bbio's bio context to increase the error count.
That works well in most cases. However, if the end_io is called enough
fast, orig_bbio's (remaining part after split) bio context may not be
properly set at that time. Since the bio context is set when the orig_bbio
(the last btrfs_bio) is sent to devices, that might be too late for earlier
split btrfs_bio's completion. That will result in NULL pointer
dereference.
That bug is easily reproducible by running btrfs/146 on zoned devices [1]
and it shows the following trace.
[1] You need raid-stripe-tree feature as it create "-d raid0 -m raid1" FS.
BUG: kernel NULL pointer dereference, address: 0000000000000020
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
Workqueue: writeback wb_workfn (flush-btrfs-5)
RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs]
BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0
RSP: 0018:ffffc9000006f248 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff888005a7f080 RCX: ffffc9000006f1dc
RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff888005a7f080
RBP: ffff888011dfc540 R08: 0000000000000000 R09: 0000000000000001
R10: ffffffff82e508e0 R11: 0000000000000005 R12: ffff88800ddfbe58
R13: ffff888005a7f080 R14: ffff888005a7f158 R15: ffff888005a7f158
FS: 0000000000000000(0000) GS:ffff88803ea80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000020 CR3: 0000000002e22006 CR4: 0000000000370ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? __die_body.cold+0x19/0x26
? page_fault_oops+0x13e/0x2b0
? _printk+0x58/0x73
? do_user_addr_fault+0x5f/0x750
? exc_page_fault+0x76/0x240
? asm_exc_page_fault+0x22/0x30
? btrfs_bio_end_io+0xae/0xc0 [btrfs]
? btrfs_log_dev_io_error+0x7f/0x90 [btrfs]
btrfs_orig_write_end_io+0x51/0x90 [btrfs]
dm_submit_bio+0x5c2/0xa50 [dm_mod]
? find_held_lock+0x2b/0x80
? blk_try_enter_queue+0x90/0x1e0
__submit_bio+0xe0/0x130
? ktime_get+0x10a/0x160
? lockdep_hardirqs_on+0x74/0x100
submit_bio_noacct_nocheck+0x199/0x410
btrfs_submit_bio+0x7d/0x150 [btrfs]
btrfs_submit_chunk+0x1a1/0x6d0 [btrfs]
? lockdep_hardirqs_on+0x74/0x100
? __folio_start_writeback+0x10/0x2c0
btrfs_submit_bbio+0x1c/0x40 [btrfs]
submit_one_bio+0x44/0x60 [btrfs]
submit_extent_folio+0x13f/0x330 [btrfs]
? btrfs_set_range_writeback+0xa3/0xd0 [btrfs]
extent_writepage_io+0x18b/0x360 [btrfs]
extent_write_locked_range+0x17c/0x340 [btrfs]
? __pfx_end_bbio_data_write+0x10/0x10 [btrfs]
run_delalloc_cow+0x71/0xd0 [btrfs]
btrfs_run_delalloc_range+0x176/0x500 [btrfs]
? find_lock_delalloc_range+0x119/0x260 [btrfs]
writepage_delalloc+0x2ab/0x480 [btrfs]
extent_write_cache_pages+0x236/0x7d0 [btrfs]
btrfs_writepages+0x72/0x130 [btrfs]
do_writepages+0xd4/0x240
? find_held_lock+0x2b/0x80
? wbc_attach_and_unlock_inode+0x12c/0x290
? wbc_attach_and_unlock_inode+0x12c/0x290
__writeback_single_inode+0x5c/0x4c0
? do_raw_spin_unlock+0x49/0xb0
writeback_sb_inodes+0x22c/0x560
__writeback_inodes_wb+0x4c/0xe0
wb_writeback+0x1d6/0x3f0
wb_workfn+0x334/0x520
process_one_work+0x1ee/0x570
? lock_is_held_type+0xc6/0x130
worker_thread+0x1d1/0x3b0
? __pfx_worker_thread+0x10/0x10
kthread+0xee/0x120
? __pfx_kthread+0x10/0x10
ret_from_fork+0x30/0x50
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>
Modules linked in: dm_mod btrfs blake2b_generic xor raid6_pq rapl
CR2: 0000000000000020
* Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios
btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is
called last among split bios. In that case, btrfs_orig_write_end_io() sets
the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [2].
Otherwise, the increased orig_bio's bioc->error is not checked by anyone
and return BLK_STS_OK to the upper layer.
[2] Actually, this is not true. Because we only increases orig_bioc->errors
by max_errors, the condition "atomic_read(&bioc->error) > bioc->max_errors"
is still not met if only one split btrfs_bio fails.
* Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios
In contrast to the above case, btrfs_bbio_propagate_error() is not working
well if un-mirrored orig_bbio is completed last. It sets
orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily
over-written by orig_bbio's completion status. If the status is BLK_STS_OK,
the upper layer would not know the failure.
* Solution
Considering the above cases, we can only save the error status in the
orig_bbio (remaining part after split) itself as it is always
available. Also, the saved error status should be propagated when all the
split btrfs_bios are finished (i.e, bbio->pending_ios == 0).
This commit introduces "status" to btrfs_bbio and saves the first error of
split bios to original btrfs_bio's "status" variable. When all the split
bios are finished, the saved status is loaded into original btrfs_bio's
status.
With this commit, btrfs/146 on zoned devices does not hit the NULL pointer
dereference anymore.
Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios")
CC: stable(a)vger.kernel.org # 6.6+
Reviewed-by: Qu Wenruo <wqu(a)suse.com>
Reviewed-by: Christoph Hellwig <hch(a)lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn(a)wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota(a)wdc.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
---
fs/btrfs/bio.c | 37 +++++++++++++------------------------
fs/btrfs/bio.h | 3 +++
2 files changed, 16 insertions(+), 24 deletions(-)
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index ce13416bc10f0..f83ec5a1baa60 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
bbio->end_io = end_io;
bbio->private = private;
atomic_set(&bbio->pending_ios, 1);
+ WRITE_ONCE(bbio->status, BLK_STS_OK);
}
/*
@@ -120,41 +121,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
}
}
-static void btrfs_orig_write_end_io(struct bio *bio);
-
-static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
- struct btrfs_bio *orig_bbio)
-{
- /*
- * For writes we tolerate nr_mirrors - 1 write failures, so we can't
- * just blindly propagate a write failure here. Instead increment the
- * error count in the original I/O context so that it is guaranteed to
- * be larger than the error tolerance.
- */
- if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
- struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
- struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
-
- atomic_add(orig_bioc->max_errors, &orig_bioc->error);
- } else {
- orig_bbio->bio.bi_status = bbio->bio.bi_status;
- }
-}
-
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
- if (bbio->bio.bi_status)
- btrfs_bbio_propagate_error(bbio, orig_bbio);
btrfs_cleanup_bio(bbio);
bbio = orig_bbio;
}
- if (atomic_dec_and_test(&bbio->pending_ios))
+ /*
+ * At this point, bbio always points to the original btrfs_bio. Save
+ * the first error in it.
+ */
+ if (status != BLK_STS_OK)
+ cmpxchg(&bbio->status, BLK_STS_OK, status);
+
+ if (atomic_dec_and_test(&bbio->pending_ios)) {
+ /* Load split bio's error which might be set above. */
+ if (status == BLK_STS_OK)
+ bbio->bio.bi_status = READ_ONCE(bbio->status);
__btrfs_bio_end_io(bbio);
+ }
}
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index e486123407458..e2fe16074ad65 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -79,6 +79,9 @@ struct btrfs_bio {
/* File system that this I/O operates on. */
struct btrfs_fs_info *fs_info;
+ /* Save the first error status of split bio. */
+ blk_status_t status;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
--
2.43.0
The patch below does not apply to the v6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From ddd6d8e975b171ea3f63a011a75820883ff0d479 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao(a)google.com>
Date: Sat, 19 Oct 2024 01:29:38 +0000
Subject: [PATCH] mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL
stats
Patch series "mm: multi-gen LRU: Have secondary MMUs participate in
MM_WALK".
Today, the MM_WALK capability causes MGLRU to clear the young bit from
PMDs and PTEs during the page table walk before eviction, but MGLRU does
not call the clear_young() MMU notifier in this case. By not calling this
notifier, the MM walk takes less time/CPU, but it causes pages that are
accessed mostly through KVM / secondary MMUs to appear younger than they
should be.
We do call the clear_young() notifier today, but only when attempting to
evict the page, so we end up clearing young/accessed information less
frequently for secondary MMUs than for mm PTEs, and therefore they appear
younger and are less likely to be evicted. Therefore, memory that is
*not* being accessed mostly by KVM will be evicted *more* frequently,
worsening performance.
ChromeOS observed a tab-open latency regression when enabling MGLRU with a
setup that involved running a VM:
Tab-open latency histogram (ms)
Version p50 mean p95 p99 max
base 1315 1198 2347 3454 10319
mglru 2559 1311 7399 12060 43758
fix 1119 926 2470 4211 6947
This series replaces the final non-selftest patchs from this series[1],
which introduced a similar change (and a new MMU notifier) with KVM
optimizations. I'll send a separate series (to Sean and Paolo) for the
KVM optimizations.
This series also makes proactive reclaim with MGLRU possible for KVM
memory. I have verified that this functions correctly with the selftest
from [1], but given that that test is a KVM selftest, I'll send it with
the rest of the KVM optimizations later. Andrew, let me know if you'd
like to take the test now anyway.
[1]: https://lore.kernel.org/linux-mm/20240926013506.860253-18-jthoughton@google…
This patch (of 2):
The removed stats, MM_LEAF_OLD and MM_NONLEAF_TOTAL, are not very helpful
and become more complicated to properly compute when adding
test/clear_young() notifiers in MGLRU's mm walk.
Link: https://lkml.kernel.org/r/20241019012940.3656292-1-jthoughton@google.com
Link: https://lkml.kernel.org/r/20241019012940.3656292-2-jthoughton@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: James Houghton <jthoughton(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: David Matlack <dmatlack(a)google.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: David Stevens <stevensd(a)google.com>
Cc: Oliver Upton <oliver.upton(a)linux.dev>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Sean Christopherson <seanjc(a)google.com>
Cc: Wei Xu <weixugc(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mmzone.h | 2 --
mm/vmscan.c | 14 +++++---------
2 files changed, 5 insertions(+), 11 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 17506e4a28355..9342e5692dab6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,9 +458,7 @@ struct lru_gen_folio {
enum {
MM_LEAF_TOTAL, /* total leaf entries */
- MM_LEAF_OLD, /* old leaf entries */
MM_LEAF_YOUNG, /* young leaf entries */
- MM_NONLEAF_TOTAL, /* total non-leaf entries */
MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
NR_MM_STATS
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb4e8440c5071..4f1d33e4b3601 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3399,7 +3399,6 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
continue;
if (!pte_young(ptent)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3552,7 +3551,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
walk->mm_stats[MM_LEAF_TOTAL]++;
if (!pmd_young(val)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3564,8 +3562,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
continue;
}
- walk->mm_stats[MM_NONLEAF_TOTAL]++;
-
if (!walk->force_scan && should_clear_pmd_young()) {
if (!pmd_young(val))
continue;
@@ -5254,11 +5250,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
for (type = 0; type < ANON_AND_FILE; type++) {
- const char *s = " ";
+ const char *s = "xxx";
unsigned long n[3] = {};
if (seq == max_seq) {
- s = "RT ";
+ s = "RTx";
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
@@ -5280,14 +5276,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
- const char *s = " ";
+ const char *s = "xxxx";
unsigned long n = 0;
if (seq == max_seq && NR_HIST_GENS == 1) {
- s = "LOYNFA";
+ s = "TYFA";
n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
- s = "loynfa";
+ s = "tyfa";
n = READ_ONCE(mm_state->stats[hist][i]);
}
--
2.43.0
The patch below does not apply to the v6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 9d08ec41a0645283d79a2e642205d488feaceacf Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao(a)google.com>
Date: Sat, 19 Oct 2024 22:22:12 -0600
Subject: [PATCH] mm: allow set/clear page_type again
Some page flags (page->flags) were converted to page types
(page->page_types). A recent example is PG_hugetlb.
From the exclusive writer's perspective, e.g., a thread doing
__folio_set_hugetlb(), there is a difference between the page flag and
type APIs: the former allows the same non-atomic operation to be repeated
whereas the latter does not. For example, calling __folio_set_hugetlb()
twice triggers VM_BUG_ON_FOLIO(), since the second call expects the type
(PG_hugetlb) not to be set previously.
Using add_hugetlb_folio() as an example, it calls __folio_set_hugetlb() in
the following error-handling path. And when that happens, it triggers the
aforementioned VM_BUG_ON_FOLIO().
if (folio_test_hugetlb(folio)) {
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
add_hugetlb_folio(h, folio, false);
...
It is possible to make hugeTLB comply with the new requirements from the
page type API. However, a straightforward fix would be to just allow the
same page type to be set or cleared again inside the API, to avoid any
changes to its callers.
Link: https://lkml.kernel.org/r/20241020042212.296781-1-yuzhao@google.com
Fixes: d99e3140a4d3 ("mm: turn folio_test_hugetlb into a PageType")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/page-flags.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1b3a767104878..cc839e4365c18 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -975,12 +975,16 @@ static __always_inline bool folio_test_##fname(const struct folio *folio) \
} \
static __always_inline void __folio_set_##fname(struct folio *folio) \
{ \
+ if (folio_test_##fname(folio)) \
+ return; \
VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \
folio); \
folio->page.page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __folio_clear_##fname(struct folio *folio) \
{ \
+ if (folio->page.page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
folio->page.page_type = UINT_MAX; \
}
@@ -993,11 +997,15 @@ static __always_inline int Page##uname(const struct page *page) \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
+ if (Page##uname(page)) \
+ return; \
VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \
page->page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
+ if (page->page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type = UINT_MAX; \
}
--
2.43.0
The patch below does not apply to the v6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
Thanks,
Sasha
------------------ original commit in Linus's tree ------------------
From 01626a18230246efdcea322aa8f067e60ffe5ccd Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua(a)oppo.com>
Date: Fri, 27 Sep 2024 09:19:36 +1200
Subject: [PATCH] mm: avoid unconditional one-tick sleep when swapcache_prepare
fails
Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
introduced an unconditional one-tick sleep when `swapcache_prepare()`
fails, which has led to reports of UI stuttering on latency-sensitive
Android devices. To address this, we can use a waitqueue to wake up tasks
that fail `swapcache_prepare()` sooner, instead of always sleeping for a
full tick. While tasks may occasionally be woken by an unrelated
`do_swap_page()`, this method is preferable to two scenarios: rapid
re-entry into page faults, which can cause livelocks, and multiple
millisecond sleeps, which visibly degrade user experience.
Oven's testing shows that a single waitqueue resolves the UI stuttering
issue. If a 'thundering herd' problem becomes apparent later, a waitqueue
hash similar to `folio_wait_table[PAGE_WAIT_TABLE_SIZE]` for page bit
locks can be introduced.
[v-songbaohua(a)oppo.com: wake_up only when swapcache_wq waitqueue is active]
Link: https://lkml.kernel.org/r/20241008130807.40833-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240926211936.75373-1-21cnbao@gmail.com
Fixes: 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Reported-by: Oven Liyang <liyangouwen1(a)oppo.com>
Tested-by: Oven Liyang <liyangouwen1(a)oppo.com>
Cc: Kairui Song <kasong(a)tencent.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: Kalesh Singh <kaleshsingh(a)google.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/memory.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 3ccee51adfbbd..bdf77a3ec47bc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL;
+ DECLARE_WAITQUEUE(wait, current);
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
@@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid
* repeated page faults.
*/
+ add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1);
+ remove_wait_queue(&swapcache_wq, &wait);
goto out_page;
}
need_clear_cache = true;
@@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
@@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
--
2.43.0
The quilt patch titled
Subject: ipc: fix memleak if msg_init_ns failed in create_ipc_ns
has been removed from the -mm tree. Its filename was
ipc-fix-memleak-if-msg_init_ns-failed-in-create_ipc_ns.patch
This patch was dropped because it was merged into the mm-nonmm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Wupeng Ma <mawupeng1(a)huawei.com>
Subject: ipc: fix memleak if msg_init_ns failed in create_ipc_ns
Date: Wed, 23 Oct 2024 17:31:29 +0800
From: Ma Wupeng <mawupeng1(a)huawei.com>
Percpu memory allocation may failed during create_ipc_ns however this
fail is not handled properly since ipc sysctls and mq sysctls is not
released properly. Fix this by release these two resource when failure.
Here is the kmemleak stack when percpu failed:
unreferenced object 0xffff88819de2a600 (size 512):
comm "shmem_2nstest", pid 120711, jiffies 4300542254
hex dump (first 32 bytes):
60 aa 9d 84 ff ff ff ff fc 18 48 b2 84 88 ff ff `.........H.....
04 00 00 00 a4 01 00 00 20 e4 56 81 ff ff ff ff ........ .V.....
backtrace (crc be7cba35):
[<ffffffff81b43f83>] __kmalloc_node_track_caller_noprof+0x333/0x420
[<ffffffff81a52e56>] kmemdup_noprof+0x26/0x50
[<ffffffff821b2f37>] setup_mq_sysctls+0x57/0x1d0
[<ffffffff821b29cc>] copy_ipcs+0x29c/0x3b0
[<ffffffff815d6a10>] create_new_namespaces+0x1d0/0x920
[<ffffffff815d7449>] copy_namespaces+0x2e9/0x3e0
[<ffffffff815458f3>] copy_process+0x29f3/0x7ff0
[<ffffffff8154b080>] kernel_clone+0xc0/0x650
[<ffffffff8154b6b1>] __do_sys_clone+0xa1/0xe0
[<ffffffff843df8ff>] do_syscall_64+0xbf/0x1c0
[<ffffffff846000b0>] entry_SYSCALL_64_after_hwframe+0x4b/0x53
Link: https://lkml.kernel.org/r/20241023093129.3074301-1-mawupeng1@huawei.com
Fixes: 72d1e611082e ("ipc/msg: mitigate the lock contention with percpu counter")
Signed-off-by: Ma Wupeng <mawupeng1(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
ipc/namespace.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/ipc/namespace.c~ipc-fix-memleak-if-msg_init_ns-failed-in-create_ipc_ns
+++ a/ipc/namespace.c
@@ -83,13 +83,15 @@ static struct ipc_namespace *create_ipc_
err = msg_init_ns(ns);
if (err)
- goto fail_put;
+ goto fail_ipc;
sem_init_ns(ns);
shm_init_ns(ns);
return ns;
+fail_ipc:
+ retire_ipc_sysctls(ns);
fail_mq:
retire_mq_sysctls(ns);
_
Patches currently in -mm which might be from mawupeng1(a)huawei.com are
The quilt patch titled
Subject: mm/damon/vaddr: fix issue in damon_va_evenly_split_region()
has been removed from the -mm tree. Its filename was
mm-damon-vaddr-fix-issue-in-damon_va_evenly_split_region.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zheng Yejian <zhengyejian(a)huaweicloud.com>
Subject: mm/damon/vaddr: fix issue in damon_va_evenly_split_region()
Date: Tue, 22 Oct 2024 16:39:26 +0800
Patch series "mm/damon/vaddr: Fix issue in
damon_va_evenly_split_region()". v2.
According to the logic of damon_va_evenly_split_region(), currently
following split case would not meet the expectation:
Suppose DAMON_MIN_REGION=0x1000,
Case: Split [0x0, 0x3000) into 2 pieces, then the result would be
acutually 3 regions:
[0x0, 0x1000), [0x1000, 0x2000), [0x2000, 0x3000)
but NOT the expected 2 regions:
[0x0, 0x1000), [0x1000, 0x3000) !!!
The root cause is that when calculating size of each split piece in
damon_va_evenly_split_region():
`sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);`
both the dividing and the ALIGN_DOWN may cause loss of precision, then
each time split one piece of size 'sz_piece' from origin 'start' to 'end'
would cause more pieces are split out than expected!!!
To fix it, count for each piece split and make sure no more than
'nr_pieces'. In addition, add above case into damon_test_split_evenly().
And add 'nr_piece == 1' check in damon_va_evenly_split_region() for better
code readability and add a corresponding kunit testcase.
This patch (of 2):
According to the logic of damon_va_evenly_split_region(), currently
following split case would not meet the expectation:
Suppose DAMON_MIN_REGION=0x1000,
Case: Split [0x0, 0x3000) into 2 pieces, then the result would be
acutually 3 regions:
[0x0, 0x1000), [0x1000, 0x2000), [0x2000, 0x3000)
but NOT the expected 2 regions:
[0x0, 0x1000), [0x1000, 0x3000) !!!
The root cause is that when calculating size of each split piece in
damon_va_evenly_split_region():
`sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);`
both the dividing and the ALIGN_DOWN may cause loss of precision,
then each time split one piece of size 'sz_piece' from origin 'start' to
'end' would cause more pieces are split out than expected!!!
To fix it, count for each piece split and make sure no more than
'nr_pieces'. In addition, add above case into damon_test_split_evenly().
After this patch, damon-operations test passed:
# ./tools/testing/kunit/kunit.py run damon-operations
[...]
============== damon-operations (6 subtests) ===============
[PASSED] damon_test_three_regions_in_vmas
[PASSED] damon_test_apply_three_regions1
[PASSED] damon_test_apply_three_regions2
[PASSED] damon_test_apply_three_regions3
[PASSED] damon_test_apply_three_regions4
[PASSED] damon_test_split_evenly
================ [PASSED] damon-operations =================
Link: https://lkml.kernel.org/r/20241022083927.3592237-1-zhengyejian@huaweicloud.…
Link: https://lkml.kernel.org/r/20241022083927.3592237-2-zhengyejian@huaweicloud.…
Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces")
Signed-off-by: Zheng Yejian <zhengyejian(a)huaweicloud.com>
Reviewed-by: SeongJae Park <sj(a)kernel.org>
Cc: Fernand Sieber <sieberf(a)amazon.com>
Cc: Leonard Foerster <foersleo(a)amazon.de>
Cc: Shakeel Butt <shakeel.butt(a)linux.dev>
Cc: Ye Weihua <yeweihua4(a)huawei.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/damon/tests/vaddr-kunit.h | 1 +
mm/damon/vaddr.c | 4 ++--
2 files changed, 3 insertions(+), 2 deletions(-)
--- a/mm/damon/tests/vaddr-kunit.h~mm-damon-vaddr-fix-issue-in-damon_va_evenly_split_region
+++ a/mm/damon/tests/vaddr-kunit.h
@@ -300,6 +300,7 @@ static void damon_test_split_evenly(stru
damon_test_split_evenly_fail(test, 0, 100, 0);
damon_test_split_evenly_succ(test, 0, 100, 10);
damon_test_split_evenly_succ(test, 5, 59, 5);
+ damon_test_split_evenly_succ(test, 0, 3, 2);
damon_test_split_evenly_fail(test, 5, 6, 2);
}
--- a/mm/damon/vaddr.c~mm-damon-vaddr-fix-issue-in-damon_va_evenly_split_region
+++ a/mm/damon/vaddr.c
@@ -67,6 +67,7 @@ static int damon_va_evenly_split_region(
unsigned long sz_orig, sz_piece, orig_end;
struct damon_region *n = NULL, *next;
unsigned long start;
+ unsigned int i;
if (!r || !nr_pieces)
return -EINVAL;
@@ -80,8 +81,7 @@ static int damon_va_evenly_split_region(
r->ar.end = r->ar.start + sz_piece;
next = damon_next_region(r);
- for (start = r->ar.end; start + sz_piece <= orig_end;
- start += sz_piece) {
+ for (start = r->ar.end, i = 1; i < nr_pieces; start += sz_piece, i++) {
n = damon_new_region(start, start + sz_piece);
if (!n)
return -ENOMEM;
_
Patches currently in -mm which might be from zhengyejian(a)huaweicloud.com are
The quilt patch titled
Subject: mm: resolve faulty mmap_region() error path behaviour
has been removed from the -mm tree. Its filename was
mm-resolve-faulty-mmap_region-error-path-behaviour.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Subject: mm: resolve faulty mmap_region() error path behaviour
Date: Tue, 29 Oct 2024 18:11:48 +0000
The mmap_region() function is somewhat terrifying, with spaghetti-like
control flow and numerous means by which issues can arise and incomplete
state, memory leaks and other unpleasantness can occur.
A large amount of the complexity arises from trying to handle errors late
in the process of mapping a VMA, which forms the basis of recently
observed issues with resource leaks and observable inconsistent state.
Taking advantage of previous patches in this series we move a number of
checks earlier in the code, simplifying things by moving the core of the
logic into a static internal function __mmap_region().
Doing this allows us to perform a number of checks up front before we do
any real work, and allows us to unwind the writable unmap check
unconditionally as required and to perform a CONFIG_DEBUG_VM_MAPLE_TREE
validation unconditionally also.
We move a number of things here:
1. We preallocate memory for the iterator before we call the file-backed
memory hook, allowing us to exit early and avoid having to perform
complicated and error-prone close/free logic. We carefully free
iterator state on both success and error paths.
2. The enclosing mmap_region() function handles the mapping_map_writable()
logic early. Previously the logic had the mapping_map_writable() at the
point of mapping a newly allocated file-backed VMA, and a matching
mapping_unmap_writable() on success and error paths.
We now do this unconditionally if this is a file-backed, shared writable
mapping. If a driver changes the flags to eliminate VM_MAYWRITE, however
doing so does not invalidate the seal check we just performed, and we in
any case always decrement the counter in the wrapper.
We perform a debug assert to ensure a driver does not attempt to do the
opposite.
3. We also move arch_validate_flags() up into the mmap_region()
function. This is only relevant on arm64 and sparc64, and the check is
only meaningful for SPARC with ADI enabled. We explicitly add a warning
for this arch if a driver invalidates this check, though the code ought
eventually to be fixed to eliminate the need for this.
With all of these measures in place, we no longer need to explicitly close
the VMA on error paths, as we place all checks which might fail prior to a
call to any driver mmap hook.
This eliminates an entire class of errors, makes the code easier to reason
about and more robust.
Link: https://lkml.kernel.org/r/6e0becb36d2f5472053ac5d544c0edfe9b899e25.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Tested-by: Mark Brown <broonie(a)kernel.org>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mmap.c | 121 ++++++++++++++++++++++++++++------------------------
1 file changed, 66 insertions(+), 55 deletions(-)
--- a/mm/mmap.c~mm-resolve-faulty-mmap_region-error-path-behaviour
+++ a/mm/mmap.c
@@ -1358,20 +1358,18 @@ int do_munmap(struct mm_struct *mm, unsi
return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
+static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
pgoff_t pglen = PHYS_PFN(len);
- struct vm_area_struct *merge;
unsigned long charged = 0;
struct vma_munmap_struct vms;
struct ma_state mas_detach;
struct maple_tree mt_detach;
unsigned long end = addr + len;
- bool writable_file_mapping = false;
int error;
VMA_ITERATOR(vmi, mm, addr);
VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
@@ -1445,28 +1443,26 @@ unsigned long mmap_region(struct file *f
vm_flags_init(vma, vm_flags);
vma->vm_page_prot = vm_get_page_prot(vm_flags);
+ if (vma_iter_prealloc(&vmi, vma)) {
+ error = -ENOMEM;
+ goto free_vma;
+ }
+
if (file) {
vma->vm_file = get_file(file);
error = mmap_file(file, vma);
if (error)
- goto unmap_and_free_vma;
-
- if (vma_is_shared_maywrite(vma)) {
- error = mapping_map_writable(file->f_mapping);
- if (error)
- goto close_and_free_vma;
-
- writable_file_mapping = true;
- }
+ goto unmap_and_free_file_vma;
+ /* Drivers cannot alter the address of the VMA. */
+ WARN_ON_ONCE(addr != vma->vm_start);
/*
- * Expansion is handled above, merging is handled below.
- * Drivers should not alter the address of the VMA.
+ * Drivers should not permit writability when previously it was
+ * disallowed.
*/
- if (WARN_ON((addr != vma->vm_start))) {
- error = -EINVAL;
- goto close_and_free_vma;
- }
+ VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
+ !(vm_flags & VM_MAYWRITE) &&
+ (vma->vm_flags & VM_MAYWRITE));
vma_iter_config(&vmi, addr, end);
/*
@@ -1474,6 +1470,8 @@ unsigned long mmap_region(struct file *f
* vma again as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
+ struct vm_area_struct *merge;
+
vmg.flags = vma->vm_flags;
/* If this fails, state is reset ready for a reattempt. */
merge = vma_merge_new_range(&vmg);
@@ -1491,7 +1489,7 @@ unsigned long mmap_region(struct file *f
vma = merge;
/* Update vm_flags to pick up the change. */
vm_flags = vma->vm_flags;
- goto unmap_writable;
+ goto file_expanded;
}
vma_iter_config(&vmi, addr, end);
}
@@ -1500,26 +1498,15 @@ unsigned long mmap_region(struct file *f
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
if (error)
- goto free_vma;
+ goto free_iter_vma;
} else {
vma_set_anonymous(vma);
}
- if (map_deny_write_exec(vma->vm_flags, vma->vm_flags)) {
- error = -EACCES;
- goto close_and_free_vma;
- }
-
- /* Allow architectures to sanity-check the vm_flags */
- if (!arch_validate_flags(vma->vm_flags)) {
- error = -EINVAL;
- goto close_and_free_vma;
- }
-
- if (vma_iter_prealloc(&vmi, vma)) {
- error = -ENOMEM;
- goto close_and_free_vma;
- }
+#ifdef CONFIG_SPARC64
+ /* TODO: Fix SPARC ADI! */
+ WARN_ON_ONCE(!arch_validate_flags(vm_flags));
+#endif
/* Lock the VMA since it is modified after insertion into VMA tree */
vma_start_write(vma);
@@ -1533,10 +1520,7 @@ unsigned long mmap_region(struct file *f
*/
khugepaged_enter_vma(vma, vma->vm_flags);
- /* Once vma denies write, undo our temporary denial count */
-unmap_writable:
- if (writable_file_mapping)
- mapping_unmap_writable(file->f_mapping);
+file_expanded:
file = vma->vm_file;
ksm_add_vma(vma);
expanded:
@@ -1569,23 +1553,17 @@ expanded:
vma_set_page_prot(vma);
- validate_mm(mm);
return addr;
-close_and_free_vma:
- vma_close(vma);
-
- if (file || vma->vm_file) {
-unmap_and_free_vma:
- fput(vma->vm_file);
- vma->vm_file = NULL;
-
- vma_iter_set(&vmi, vma->vm_end);
- /* Undo any partial mapping done by a device driver. */
- unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
- }
- if (writable_file_mapping)
- mapping_unmap_writable(file->f_mapping);
+unmap_and_free_file_vma:
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
+
+ vma_iter_set(&vmi, vma->vm_end);
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
+free_iter_vma:
+ vma_iter_free(&vmi);
free_vma:
vm_area_free(vma);
unacct_error:
@@ -1595,10 +1573,43 @@ unacct_error:
abort_munmap:
vms_abort_munmap_vmas(&vms, &mas_detach);
gather_failed:
- validate_mm(mm);
return error;
}
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ unsigned long ret;
+ bool writable_file_mapping = false;
+
+ /* Check to see if MDWE is applicable. */
+ if (map_deny_write_exec(vm_flags, vm_flags))
+ return -EACCES;
+
+ /* Allow architectures to sanity-check the vm_flags. */
+ if (!arch_validate_flags(vm_flags))
+ return -EINVAL;
+
+ /* Map writable and ensure this isn't a sealed memfd. */
+ if (file && is_shared_maywrite(vm_flags)) {
+ int error = mapping_map_writable(file->f_mapping);
+
+ if (error)
+ return error;
+ writable_file_mapping = true;
+ }
+
+ ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
+
+ /* Clear our write mapping regardless of error. */
+ if (writable_file_mapping)
+ mapping_unmap_writable(file->f_mapping);
+
+ validate_mm(current->mm);
+ return ret;
+}
+
static int __vm_munmap(unsigned long start, size_t len, bool unlock)
{
int ret;
_
Patches currently in -mm which might be from lorenzo.stoakes(a)oracle.com are
selftests-mm-add-pkey_sighandler_xx-hugetlb_dio-to-gitignore.patch
mm-refactor-mm_access-to-not-return-null.patch
mm-madvise-unrestrict-process_madvise-for-current-process.patch
maple_tree-do-not-hash-pointers-on-dump-in-debug-mode.patch
tools-testing-fix-phys_addr_t-size-on-64-bit-systems.patch
tools-testing-add-additional-vma_internalh-stubs.patch
mm-isolate-mmap-internal-logic-to-mm-vmac.patch
mm-refactor-__mmap_region.patch
mm-remove-unnecessary-reset-state-logic-on-merge-new-vma.patch
mm-defer-second-attempt-at-merge-on-mmap.patch
mm-pagewalk-add-the-ability-to-install-ptes.patch
mm-add-pte_marker_guard-pte-marker.patch
mm-madvise-implement-lightweight-guard-page-mechanism.patch
tools-testing-update-tools-uapi-header-for-mman-commonh.patch
selftests-mm-add-self-tests-for-guard-page-feature.patch
mm-remove-unnecessary-page_table_lock-on-stack-expansion.patch
The quilt patch titled
Subject: mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling
has been removed from the -mm tree. Its filename was
mm-refactor-arch_calc_vm_flag_bits-and-arm64-mte-handling.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Subject: mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling
Date: Tue, 29 Oct 2024 18:11:47 +0000
Currently MTE is permitted in two circumstances (desiring to use MTE
having been specified by the VM_MTE flag) - where MAP_ANONYMOUS is
specified, as checked by arch_calc_vm_flag_bits() and actualised by
setting the VM_MTE_ALLOWED flag, or if the file backing the mapping is
shmem, in which case we set VM_MTE_ALLOWED in shmem_mmap() when the mmap
hook is activated in mmap_region().
The function that checks that, if VM_MTE is set, VM_MTE_ALLOWED is also
set is the arm64 implementation of arch_validate_flags().
Unfortunately, we intend to refactor mmap_region() to perform this check
earlier, meaning that in the case of a shmem backing we will not have
invoked shmem_mmap() yet, causing the mapping to fail spuriously.
It is inappropriate to set this architecture-specific flag in general mm
code anyway, so a sensible resolution of this issue is to instead move the
check somewhere else.
We resolve this by setting VM_MTE_ALLOWED much earlier in do_mmap(), via
the arch_calc_vm_flag_bits() call.
This is an appropriate place to do this as we already check for the
MAP_ANONYMOUS case here, and the shmem file case is simply a variant of
the same idea - we permit RAM-backed memory.
This requires a modification to the arch_calc_vm_flag_bits() signature to
pass in a pointer to the struct file associated with the mapping, however
this is not too egregious as this is only used by two architectures anyway
- arm64 and parisc.
So this patch performs this adjustment and removes the unnecessary
assignment of VM_MTE_ALLOWED in shmem_mmap().
[akpm(a)linux-foundation.org: fix whitespace, per Catalin]
Link: https://lkml.kernel.org/r/ec251b20ba1964fb64cf1607d2ad80c47f3873df.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Suggested-by: Catalin Marinas <catalin.marinas(a)arm.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Catalin Marinas <catalin.marinas(a)arm.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/arm64/include/asm/mman.h | 10 +++++++---
arch/parisc/include/asm/mman.h | 5 +++--
include/linux/mman.h | 7 ++++---
mm/mmap.c | 2 +-
mm/nommu.c | 2 +-
mm/shmem.c | 3 ---
6 files changed, 16 insertions(+), 13 deletions(-)
--- a/arch/arm64/include/asm/mman.h~mm-refactor-arch_calc_vm_flag_bits-and-arm64-mte-handling
+++ a/arch/arm64/include/asm/mman.h
@@ -6,6 +6,8 @@
#ifndef BUILD_VDSO
#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/shmem_fs.h>
#include <linux/types.h>
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
@@ -31,19 +33,21 @@ static inline unsigned long arch_calc_vm
}
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
+ unsigned long flags)
{
/*
* Only allow MTE on anonymous mappings as these are guaranteed to be
* backed by tags-capable memory. The vm_flags may be overridden by a
* filesystem supporting MTE (RAM-based).
*/
- if (system_supports_mte() && (flags & MAP_ANONYMOUS))
+ if (system_supports_mte() &&
+ ((flags & MAP_ANONYMOUS) || shmem_file(file)))
return VM_MTE_ALLOWED;
return 0;
}
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
static inline bool arch_validate_prot(unsigned long prot,
unsigned long addr __always_unused)
--- a/arch/parisc/include/asm/mman.h~mm-refactor-arch_calc_vm_flag_bits-and-arm64-mte-handling
+++ a/arch/parisc/include/asm/mman.h
@@ -2,6 +2,7 @@
#ifndef __ASM_MMAN_H__
#define __ASM_MMAN_H__
+#include <linux/fs.h>
#include <uapi/asm/mman.h>
/* PARISC cannot allow mdwe as it needs writable stacks */
@@ -11,7 +12,7 @@ static inline bool arch_memory_deny_writ
}
#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file, unsigned long flags)
{
/*
* The stack on parisc grows upwards, so if userspace requests memory
@@ -23,6 +24,6 @@ static inline unsigned long arch_calc_vm
return 0;
}
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
#endif /* __ASM_MMAN_H__ */
--- a/include/linux/mman.h~mm-refactor-arch_calc_vm_flag_bits-and-arm64-mte-handling
+++ a/include/linux/mman.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H
+#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu_counter.h>
@@ -94,7 +95,7 @@ static inline void vm_unacct_memory(long
#endif
#ifndef arch_calc_vm_flag_bits
-#define arch_calc_vm_flag_bits(flags) 0
+#define arch_calc_vm_flag_bits(file, flags) 0
#endif
#ifndef arch_validate_prot
@@ -151,13 +152,13 @@ calc_vm_prot_bits(unsigned long prot, un
* Combine the mmap "flags" argument into "vm_flags" used internally.
*/
static inline unsigned long
-calc_vm_flag_bits(unsigned long flags)
+calc_vm_flag_bits(struct file *file, unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
_calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) |
- arch_calc_vm_flag_bits(flags);
+ arch_calc_vm_flag_bits(file, flags);
}
unsigned long vm_commit_limit(void);
--- a/mm/mmap.c~mm-refactor-arch_calc_vm_flag_bits-and-arm64-mte-handling
+++ a/mm/mmap.c
@@ -344,7 +344,7 @@ unsigned long do_mmap(struct file *file,
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* Obtain the address to map to. we verify (or select) it and ensure
--- a/mm/nommu.c~mm-refactor-arch_calc_vm_flag_bits-and-arm64-mte-handling
+++ a/mm/nommu.c
@@ -842,7 +842,7 @@ static unsigned long determine_vm_flags(
{
unsigned long vm_flags;
- vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
+ vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
if (!file) {
/*
--- a/mm/shmem.c~mm-refactor-arch_calc_vm_flag_bits-and-arm64-mte-handling
+++ a/mm/shmem.c
@@ -2733,9 +2733,6 @@ static int shmem_mmap(struct file *file,
if (ret)
return ret;
- /* arm64 - allow memory tagging on RAM-based files */
- vm_flags_set(vma, VM_MTE_ALLOWED);
-
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
if (inode->i_nlink)
_
Patches currently in -mm which might be from lorenzo.stoakes(a)oracle.com are
selftests-mm-add-pkey_sighandler_xx-hugetlb_dio-to-gitignore.patch
mm-refactor-mm_access-to-not-return-null.patch
mm-madvise-unrestrict-process_madvise-for-current-process.patch
maple_tree-do-not-hash-pointers-on-dump-in-debug-mode.patch
tools-testing-fix-phys_addr_t-size-on-64-bit-systems.patch
tools-testing-add-additional-vma_internalh-stubs.patch
mm-isolate-mmap-internal-logic-to-mm-vmac.patch
mm-refactor-__mmap_region.patch
mm-remove-unnecessary-reset-state-logic-on-merge-new-vma.patch
mm-defer-second-attempt-at-merge-on-mmap.patch
mm-pagewalk-add-the-ability-to-install-ptes.patch
mm-add-pte_marker_guard-pte-marker.patch
mm-madvise-implement-lightweight-guard-page-mechanism.patch
tools-testing-update-tools-uapi-header-for-mman-commonh.patch
selftests-mm-add-self-tests-for-guard-page-feature.patch
mm-remove-unnecessary-page_table_lock-on-stack-expansion.patch
The quilt patch titled
Subject: mm: refactor map_deny_write_exec()
has been removed from the -mm tree. Its filename was
mm-refactor-map_deny_write_exec.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Subject: mm: refactor map_deny_write_exec()
Date: Tue, 29 Oct 2024 18:11:46 +0000
Refactor the map_deny_write_exec() to not unnecessarily require a VMA
parameter but rather to accept VMA flags parameters, which allows us to
use this function early in mmap_region() in a subsequent commit.
While we're here, we refactor the function to be more readable and add
some additional documentation.
Link: https://lkml.kernel.org/r/6be8bb59cd7c68006ebb006eb9d8dc27104b1f70.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/mman.h | 21 ++++++++++++++++++---
mm/mmap.c | 2 +-
mm/mprotect.c | 2 +-
mm/vma.h | 2 +-
4 files changed, 21 insertions(+), 6 deletions(-)
--- a/include/linux/mman.h~mm-refactor-map_deny_write_exec
+++ a/include/linux/mman.h
@@ -188,16 +188,31 @@ static inline bool arch_memory_deny_writ
*
* d) mmap(PROT_READ | PROT_EXEC)
* mmap(PROT_READ | PROT_EXEC | PROT_BTI)
+ *
+ * This is only applicable if the user has set the Memory-Deny-Write-Execute
+ * (MDWE) protection mask for the current process.
+ *
+ * @old specifies the VMA flags the VMA originally possessed, and @new the ones
+ * we propose to set.
+ *
+ * Return: false if proposed change is OK, true if not ok and should be denied.
*/
-static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
+ /* If MDWE is disabled, we have nothing to deny. */
if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags))
return false;
- if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+ /* If the new VMA is not executable, we have nothing to deny. */
+ if (!(new & VM_EXEC))
+ return false;
+
+ /* Under MDWE we do not accept newly writably executable VMAs... */
+ if (new & VM_WRITE)
return true;
- if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+ /* ...nor previously non-executable VMAs becoming executable. */
+ if (!(old & VM_EXEC))
return true;
return false;
--- a/mm/mmap.c~mm-refactor-map_deny_write_exec
+++ a/mm/mmap.c
@@ -1505,7 +1505,7 @@ unsigned long mmap_region(struct file *f
vma_set_anonymous(vma);
}
- if (map_deny_write_exec(vma, vma->vm_flags)) {
+ if (map_deny_write_exec(vma->vm_flags, vma->vm_flags)) {
error = -EACCES;
goto close_and_free_vma;
}
--- a/mm/mprotect.c~mm-refactor-map_deny_write_exec
+++ a/mm/mprotect.c
@@ -810,7 +810,7 @@ static int do_mprotect_pkey(unsigned lon
break;
}
- if (map_deny_write_exec(vma, newflags)) {
+ if (map_deny_write_exec(vma->vm_flags, newflags)) {
error = -EACCES;
break;
}
--- a/mm/vma.h~mm-refactor-map_deny_write_exec
+++ a/mm/vma.h
@@ -42,7 +42,7 @@ struct vma_munmap_struct {
int vma_count; /* Number of vmas that will be removed */
bool unlock; /* Unlock after the munmap */
bool clear_ptes; /* If there are outstanding PTE to be cleared */
- /* 1 byte hole */
+ /* 2 byte hole */
unsigned long nr_pages; /* Number of pages being removed */
unsigned long locked_vm; /* Number of locked pages */
unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */
_
Patches currently in -mm which might be from lorenzo.stoakes(a)oracle.com are
selftests-mm-add-pkey_sighandler_xx-hugetlb_dio-to-gitignore.patch
mm-refactor-mm_access-to-not-return-null.patch
mm-madvise-unrestrict-process_madvise-for-current-process.patch
maple_tree-do-not-hash-pointers-on-dump-in-debug-mode.patch
tools-testing-fix-phys_addr_t-size-on-64-bit-systems.patch
tools-testing-add-additional-vma_internalh-stubs.patch
mm-isolate-mmap-internal-logic-to-mm-vmac.patch
mm-refactor-__mmap_region.patch
mm-remove-unnecessary-reset-state-logic-on-merge-new-vma.patch
mm-defer-second-attempt-at-merge-on-mmap.patch
mm-pagewalk-add-the-ability-to-install-ptes.patch
mm-add-pte_marker_guard-pte-marker.patch
mm-madvise-implement-lightweight-guard-page-mechanism.patch
tools-testing-update-tools-uapi-header-for-mman-commonh.patch
selftests-mm-add-self-tests-for-guard-page-feature.patch
mm-remove-unnecessary-page_table_lock-on-stack-expansion.patch
The quilt patch titled
Subject: mm: unconditionally close VMAs on error
has been removed from the -mm tree. Its filename was
mm-unconditionally-close-vmas-on-error.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Subject: mm: unconditionally close VMAs on error
Date: Tue, 29 Oct 2024 18:11:45 +0000
Incorrect invocation of VMA callbacks when the VMA is no longer in a
consistent state is bug prone and risky to perform.
With regards to the important vm_ops->close() callback We have gone to
great lengths to try to track whether or not we ought to close VMAs.
Rather than doing so and risking making a mistake somewhere, instead
unconditionally close and reset vma->vm_ops to an empty dummy operations
set with a NULL .close operator.
We introduce a new function to do so - vma_close() - and simplify existing
vms logic which tracked whether we needed to close or not.
This simplifies the logic, avoids incorrect double-calling of the .close()
callback and allows us to update error paths to simply call vma_close()
unconditionally - making VMA closure idempotent.
Link: https://lkml.kernel.org/r/28e89dda96f68c505cb6f8e9fc9b57c3e9f74b42.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/internal.h | 18 ++++++++++++++++++
mm/mmap.c | 5 ++---
mm/nommu.c | 3 +--
mm/vma.c | 14 +++++---------
mm/vma.h | 4 +---
5 files changed, 27 insertions(+), 17 deletions(-)
--- a/mm/internal.h~mm-unconditionally-close-vmas-on-error
+++ a/mm/internal.h
@@ -135,6 +135,24 @@ static inline int mmap_file(struct file
return err;
}
+/*
+ * If the VMA has a close hook then close it, and since closing it might leave
+ * it in an inconsistent state which makes the use of any hooks suspect, clear
+ * them down by installing dummy empty hooks.
+ */
+static inline void vma_close(struct vm_area_struct *vma)
+{
+ if (vma->vm_ops && vma->vm_ops->close) {
+ vma->vm_ops->close(vma);
+
+ /*
+ * The mapping is in an inconsistent state, and no further hooks
+ * may be invoked upon it.
+ */
+ vma->vm_ops = &vma_dummy_vm_ops;
+ }
+}
+
#ifdef CONFIG_MMU
/* Flags for folio_pte_batch(). */
--- a/mm/mmap.c~mm-unconditionally-close-vmas-on-error
+++ a/mm/mmap.c
@@ -1573,8 +1573,7 @@ expanded:
return addr;
close_and_free_vma:
- if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
+ vma_close(vma);
if (file || vma->vm_file) {
unmap_and_free_vma:
@@ -1934,7 +1933,7 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
+ remove_vma(vma, /* unreachable = */ true);
count++;
cond_resched();
vma = vma_next(&vmi);
--- a/mm/nommu.c~mm-unconditionally-close-vmas-on-error
+++ a/mm/nommu.c
@@ -589,8 +589,7 @@ static int delete_vma_from_mm(struct vm_
*/
static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
- if (vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
+ vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
put_nommu_region(vma->vm_region);
--- a/mm/vma.c~mm-unconditionally-close-vmas-on-error
+++ a/mm/vma.c
@@ -323,11 +323,10 @@ static bool can_vma_merge_right(struct v
/*
* Close a vm structure and free it.
*/
-void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
+void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
might_sleep();
- if (!closed && vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
+ vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
@@ -1115,9 +1114,7 @@ void vms_clean_up_area(struct vma_munmap
vms_clear_ptes(vms, mas_detach, true);
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- if (vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
- vms->closed_vm_ops = true;
+ vma_close(vma);
}
/*
@@ -1160,7 +1157,7 @@ void vms_complete_munmap_vmas(struct vma
/* Remove and clean up vmas */
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- remove_vma(vma, /* = */ false, vms->closed_vm_ops);
+ remove_vma(vma, /* unreachable = */ false);
vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
@@ -1684,8 +1681,7 @@ struct vm_area_struct *copy_vma(struct v
return new_vma;
out_vma_link:
- if (new_vma->vm_ops && new_vma->vm_ops->close)
- new_vma->vm_ops->close(new_vma);
+ vma_close(new_vma);
if (new_vma->vm_file)
fput(new_vma->vm_file);
--- a/mm/vma.h~mm-unconditionally-close-vmas-on-error
+++ a/mm/vma.h
@@ -42,7 +42,6 @@ struct vma_munmap_struct {
int vma_count; /* Number of vmas that will be removed */
bool unlock; /* Unlock after the munmap */
bool clear_ptes; /* If there are outstanding PTE to be cleared */
- bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */
/* 1 byte hole */
unsigned long nr_pages; /* Number of pages being removed */
unsigned long locked_vm; /* Number of locked pages */
@@ -198,7 +197,6 @@ static inline void init_vma_munmap(struc
vms->unmap_start = FIRST_USER_ADDRESS;
vms->unmap_end = USER_PGTABLES_CEILING;
vms->clear_ptes = false;
- vms->closed_vm_ops = false;
}
#endif
@@ -269,7 +267,7 @@ int do_vmi_munmap(struct vma_iterator *v
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
-void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);
+void remove_vma(struct vm_area_struct *vma, bool unreachable);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
_
Patches currently in -mm which might be from lorenzo.stoakes(a)oracle.com are
selftests-mm-add-pkey_sighandler_xx-hugetlb_dio-to-gitignore.patch
mm-refactor-mm_access-to-not-return-null.patch
mm-madvise-unrestrict-process_madvise-for-current-process.patch
maple_tree-do-not-hash-pointers-on-dump-in-debug-mode.patch
tools-testing-fix-phys_addr_t-size-on-64-bit-systems.patch
tools-testing-add-additional-vma_internalh-stubs.patch
mm-isolate-mmap-internal-logic-to-mm-vmac.patch
mm-refactor-__mmap_region.patch
mm-remove-unnecessary-reset-state-logic-on-merge-new-vma.patch
mm-defer-second-attempt-at-merge-on-mmap.patch
mm-pagewalk-add-the-ability-to-install-ptes.patch
mm-add-pte_marker_guard-pte-marker.patch
mm-madvise-implement-lightweight-guard-page-mechanism.patch
tools-testing-update-tools-uapi-header-for-mman-commonh.patch
selftests-mm-add-self-tests-for-guard-page-feature.patch
mm-remove-unnecessary-page_table_lock-on-stack-expansion.patch
The quilt patch titled
Subject: mm: avoid unsafe VMA hook invocation when error arises on mmap hook
has been removed from the -mm tree. Its filename was
mm-avoid-unsafe-vma-hook-invocation-when-error-arises-on-mmap-hook.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Subject: mm: avoid unsafe VMA hook invocation when error arises on mmap hook
Date: Tue, 29 Oct 2024 18:11:44 +0000
Patch series "fix error handling in mmap_region() and refactor
(hotfixes)", v4.
mmap_region() is somewhat terrifying, with spaghetti-like control flow and
numerous means by which issues can arise and incomplete state, memory
leaks and other unpleasantness can occur.
A large amount of the complexity arises from trying to handle errors late
in the process of mapping a VMA, which forms the basis of recently
observed issues with resource leaks and observable inconsistent state.
This series goes to great lengths to simplify how mmap_region() works and
to avoid unwinding errors late on in the process of setting up the VMA for
the new mapping, and equally avoids such operations occurring while the
VMA is in an inconsistent state.
The patches in this series comprise the minimal changes required to
resolve existing issues in mmap_region() error handling, in order that
they can be hotfixed and backported. There is additionally a follow up
series which goes further, separated out from the v1 series and sent and
updated separately.
This patch (of 5):
After an attempted mmap() fails, we are no longer in a situation where we
can safely interact with VMA hooks. This is currently not enforced,
meaning that we need complicated handling to ensure we do not incorrectly
call these hooks.
We can avoid the whole issue by treating the VMA as suspect the moment
that the file->f_ops->mmap() function reports an error by replacing
whatever VMA operations were installed with a dummy empty set of VMA
operations.
We do so through a new helper function internal to mm - mmap_file() -
which is both more logically named than the existing call_mmap() function
and correctly isolates handling of the vm_op reassignment to mm.
All the existing invocations of call_mmap() outside of mm are ultimately
nested within the call_mmap() from mm, which we now replace.
It is therefore safe to leave call_mmap() in place as a convenience
function (and to avoid churn). The invokers are:
ovl_file_operations -> mmap -> ovl_mmap() -> backing_file_mmap()
coda_file_operations -> mmap -> coda_file_mmap()
shm_file_operations -> shm_mmap()
shm_file_operations_huge -> shm_mmap()
dma_buf_fops -> dma_buf_mmap_internal -> i915_dmabuf_ops
-> i915_gem_dmabuf_mmap()
None of these callers interact with vm_ops or mappings in a problematic
way on error, quickly exiting out.
Link: https://lkml.kernel.org/r/cover.1730224667.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/d41fd763496fd0048a962f3fd9407dc72dd4fd86.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/internal.h | 27 +++++++++++++++++++++++++++
mm/mmap.c | 6 +++---
mm/nommu.c | 4 ++--
3 files changed, 32 insertions(+), 5 deletions(-)
--- a/mm/internal.h~mm-avoid-unsafe-vma-hook-invocation-when-error-arises-on-mmap-hook
+++ a/mm/internal.h
@@ -108,6 +108,33 @@ static inline void *folio_raw_mapping(co
return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
}
+/*
+ * This is a file-backed mapping, and is about to be memory mapped - invoke its
+ * mmap hook and safely handle error conditions. On error, VMA hooks will be
+ * mutated.
+ *
+ * @file: File which backs the mapping.
+ * @vma: VMA which we are mapping.
+ *
+ * Returns: 0 if success, error otherwise.
+ */
+static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
+{
+ int err = call_mmap(file, vma);
+
+ if (likely(!err))
+ return 0;
+
+ /*
+ * OK, we tried to call the file hook for mmap(), but an error
+ * arose. The mapping is in an inconsistent state and we most not invoke
+ * any further hooks on it.
+ */
+ vma->vm_ops = &vma_dummy_vm_ops;
+
+ return err;
+}
+
#ifdef CONFIG_MMU
/* Flags for folio_pte_batch(). */
--- a/mm/mmap.c~mm-avoid-unsafe-vma-hook-invocation-when-error-arises-on-mmap-hook
+++ a/mm/mmap.c
@@ -1422,7 +1422,7 @@ unsigned long mmap_region(struct file *f
/*
* clear PTEs while the vma is still in the tree so that rmap
* cannot race with the freeing later in the truncate scenario.
- * This is also needed for call_mmap(), which is why vm_ops
+ * This is also needed for mmap_file(), which is why vm_ops
* close function is called.
*/
vms_clean_up_area(&vms, &mas_detach);
@@ -1447,7 +1447,7 @@ unsigned long mmap_region(struct file *f
if (file) {
vma->vm_file = get_file(file);
- error = call_mmap(file, vma);
+ error = mmap_file(file, vma);
if (error)
goto unmap_and_free_vma;
@@ -1470,7 +1470,7 @@ unsigned long mmap_region(struct file *f
vma_iter_config(&vmi, addr, end);
/*
- * If vm_flags changed after call_mmap(), we should try merge
+ * If vm_flags changed after mmap_file(), we should try merge
* vma again as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
--- a/mm/nommu.c~mm-avoid-unsafe-vma-hook-invocation-when-error-arises-on-mmap-hook
+++ a/mm/nommu.c
@@ -885,7 +885,7 @@ static int do_mmap_shared_file(struct vm
{
int ret;
- ret = call_mmap(vma->vm_file, vma);
+ ret = mmap_file(vma->vm_file, vma);
if (ret == 0) {
vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0;
@@ -918,7 +918,7 @@ static int do_mmap_private(struct vm_are
* happy.
*/
if (capabilities & NOMMU_MAP_DIRECT) {
- ret = call_mmap(vma->vm_file, vma);
+ ret = mmap_file(vma->vm_file, vma);
/* shouldn't return success if we're not sharing */
if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
ret = -ENOSYS;
_
Patches currently in -mm which might be from lorenzo.stoakes(a)oracle.com are
selftests-mm-add-pkey_sighandler_xx-hugetlb_dio-to-gitignore.patch
mm-refactor-mm_access-to-not-return-null.patch
mm-madvise-unrestrict-process_madvise-for-current-process.patch
maple_tree-do-not-hash-pointers-on-dump-in-debug-mode.patch
tools-testing-fix-phys_addr_t-size-on-64-bit-systems.patch
tools-testing-add-additional-vma_internalh-stubs.patch
mm-isolate-mmap-internal-logic-to-mm-vmac.patch
mm-refactor-__mmap_region.patch
mm-remove-unnecessary-reset-state-logic-on-merge-new-vma.patch
mm-defer-second-attempt-at-merge-on-mmap.patch
mm-pagewalk-add-the-ability-to-install-ptes.patch
mm-add-pte_marker_guard-pte-marker.patch
mm-madvise-implement-lightweight-guard-page-mechanism.patch
tools-testing-update-tools-uapi-header-for-mman-commonh.patch
selftests-mm-add-self-tests-for-guard-page-feature.patch
mm-remove-unnecessary-page_table_lock-on-stack-expansion.patch
The quilt patch titled
Subject: mm/thp: fix deferred split unqueue naming and locking
has been removed from the -mm tree. Its filename was
mm-thp-fix-deferred-split-unqueue-naming-and-locking.patch
This patch was dropped because it was merged into the mm-hotfixes-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Hugh Dickins <hughd(a)google.com>
Subject: mm/thp: fix deferred split unqueue naming and locking
Date: Sun, 27 Oct 2024 13:02:13 -0700 (PDT)
Recent changes are putting more pressure on THP deferred split queues:
under load revealing long-standing races, causing list_del corruptions,
"Bad page state"s and worse (I keep BUGs in both of those, so usually
don't get to see how badly they end up without). The relevant recent
changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
improved swap allocation, and underused THP splitting.
Before fixing locking: rename misleading folio_undo_large_rmappable(),
which does not undo large_rmappable, to folio_unqueue_deferred_split(),
which is what it does. But that and its out-of-line __callee are mm
internals of very limited usability: add comment and WARN_ON_ONCEs to
check usage; and return a bool to say if a deferred split was unqueued,
which can then be used in WARN_ON_ONCEs around safety checks (sparing
callers the arcane conditionals in __folio_unqueue_deferred_split()).
Just omit the folio_unqueue_deferred_split() from free_unref_folios(), all
of whose callers now call it beforehand (and if any forget then bad_page()
will tell) - except for its caller put_pages_list(), which itself no
longer has any callers (and will be deleted separately).
Swapout: mem_cgroup_swapout() has been resetting folio->memcg_data 0
without checking and unqueueing a THP folio from deferred split list;
which is unfortunate, since the split_queue_lock depends on the memcg
(when memcg is enabled); so swapout has been unqueueing such THPs later,
when freeing the folio, using the pgdat's lock instead: potentially
corrupting the memcg's list. __remove_mapping() has frozen refcount to 0
here, so no problem with calling folio_unqueue_deferred_split() before
resetting memcg_data.
That goes back to 5.4 commit 87eaceb3faa5 ("mm: thp: make deferred split
shrinker memcg aware"): which included a check on swapcache before adding
to deferred queue, but no check on deferred queue before adding THP to
swapcache. That worked fine with the usual sequence of events in reclaim
(though there were a couple of rare ways in which a THP on deferred queue
could have been swapped out), but 6.12 commit dafff3f4c850 ("mm: split
underused THPs") avoids splitting underused THPs in reclaim, which makes
swapcache THPs on deferred queue commonplace.
Keep the check on swapcache before adding to deferred queue? Yes: it is
no longer essential, but preserves the existing behaviour, and is likely
to be a worthwhile optimization (vmstat showed much more traffic on the
queue under swapping load if the check was removed); update its comment.
Memcg-v1 move (deprecated): mem_cgroup_move_account() has been changing
folio->memcg_data without checking and unqueueing a THP folio from the
deferred list, sometimes corrupting "from" memcg's list, like swapout.
Refcount is non-zero here, so folio_unqueue_deferred_split() can only be
used in a WARN_ON_ONCE to validate the fix, which must be done earlier:
mem_cgroup_move_charge_pte_range() first try to split the THP (splitting
of course unqueues), or skip it if that fails. Not ideal, but moving
charge has been requested, and khugepaged should repair the THP later:
nobody wants new custom unqueueing code just for this deprecated case.
The 87eaceb3faa5 commit did have the code to move from one deferred list
to another (but was not conscious of its unsafety while refcount non-0);
but that was removed by 5.6 commit fac0516b5534 ("mm: thp: don't need care
deferred split queue in memcg charge move path"), which argued that the
existence of a PMD mapping guarantees that the THP cannot be on a deferred
list. As above, false in rare cases, and now commonly false.
Backport to 6.11 should be straightforward. Earlier backports must take
care that other _deferred_list fixes and dependencies are included. There
is not a strong case for backports, but they can fix cornercases.
Link: https://lkml.kernel.org/r/8dc111ae-f6db-2da7-b25c-7a20b1effe3b@google.com
Fixes: 87eaceb3faa5 ("mm: thp: make deferred split shrinker memcg aware")
Fixes: dafff3f4c850 ("mm: split underused THPs")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Shakeel Butt <shakeel.butt(a)linux.dev>
Cc: Usama Arif <usamaarif642(a)gmail.com>
Cc: Wei Yang <richard.weiyang(a)gmail.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 35 ++++++++++++++++++++++++++---------
mm/internal.h | 10 +++++-----
mm/memcontrol-v1.c | 25 +++++++++++++++++++++++++
mm/memcontrol.c | 8 +++++---
mm/migrate.c | 4 ++--
mm/page_alloc.c | 1 -
mm/swap.c | 4 ++--
mm/vmscan.c | 4 ++--
8 files changed, 67 insertions(+), 24 deletions(-)
--- a/mm/huge_memory.c~mm-thp-fix-deferred-split-unqueue-naming-and-locking
+++ a/mm/huge_memory.c
@@ -3588,10 +3588,27 @@ int split_folio_to_list(struct folio *fo
return split_huge_page_to_list_to_order(&folio->page, list, ret);
}
-void __folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
+ bool unqueued = false;
+
+ WARN_ON_ONCE(folio_ref_count(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -3603,8 +3620,11 @@ void __folio_undo_large_rmappable(struct
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_del_init(&folio->_deferred_list);
+ unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ return unqueued; /* useful for debug warnings */
}
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
@@ -3627,14 +3647,11 @@ void deferred_split_folio(struct folio *
return;
/*
- * The try_to_unmap() in page reclaim path might reach here too,
- * this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same folio, it is
- * unnecessary to handle it again in shrinker.
- *
- * Check the swapcache flag to determine if the folio is being
- * handled by page reclaim since THP swap would add the folio into
- * swap cache before calling try_to_unmap().
+ * Exclude swapcache: originally to avoid a corrupt deferred split
+ * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * but if page reclaim is already handling the same folio, it is
+ * unnecessary to handle it again in the shrinker, so excluding
+ * swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
--- a/mm/internal.h~mm-thp-fix-deferred-split-unqueue-naming-and-locking
+++ a/mm/internal.h
@@ -639,11 +639,11 @@ static inline void folio_set_order(struc
#endif
}
-void __folio_undo_large_rmappable(struct folio *folio);
-static inline void folio_undo_large_rmappable(struct folio *folio)
+bool __folio_unqueue_deferred_split(struct folio *folio);
+static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
- return;
+ return false;
/*
* At this point, there is no one trying to add the folio to
@@ -651,9 +651,9 @@ static inline void folio_undo_large_rmap
* to check without acquiring the split_queue_lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
- return;
+ return false;
- __folio_undo_large_rmappable(folio);
+ return __folio_unqueue_deferred_split(folio);
}
static inline struct folio *page_rmappable_folio(struct page *page)
--- a/mm/memcontrol.c~mm-thp-fix-deferred-split-unqueue-naming-and-locking
+++ a/mm/memcontrol.c
@@ -4629,9 +4629,6 @@ static void uncharge_folio(struct folio
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
- !folio_test_hugetlb(folio) &&
- !list_empty(&folio->_deferred_list), folio);
/*
* Nobody should be changing or seriously looking at
@@ -4678,6 +4675,7 @@ static void uncharge_folio(struct folio
ug->nr_memory += nr_pages;
ug->pgpgout++;
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = 0;
}
@@ -4789,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *ol
/* Transfer the charge and the css ref */
commit_charge(new, memcg);
+
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(old));
old->memcg_data = 0;
}
@@ -4975,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *fo
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+ folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
--- a/mm/memcontrol-v1.c~mm-thp-fix-deferred-split-unqueue-naming-and-locking
+++ a/mm/memcontrol-v1.c
@@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struc
css_get(&to->css);
css_put(&from->css);
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to;
__folio_memcg_unlock(from);
@@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_ra
enum mc_target_type target_type;
union mc_target target;
struct folio *folio;
+ bool tried_split_before = false;
+retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
@@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_ra
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
folio = target.folio;
+ /*
+ * Deferred split queue locking depends on memcg,
+ * and unqueue is unsafe unless folio refcount is 0:
+ * split or skip if on the queue? first try to split.
+ */
+ if (!list_empty(&folio->_deferred_list)) {
+ spin_unlock(ptl);
+ if (!tried_split_before)
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (tried_split_before)
+ return 0;
+ tried_split_before = true;
+ goto retry_pmd;
+ }
+ /*
+ * So long as that pmd lock is held, the folio cannot
+ * be racily added to the _deferred_list, because
+ * __folio_remove_rmap() will find !partially_mapped.
+ */
if (folio_isolate_lru(folio)) {
if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
--- a/mm/migrate.c~mm-thp-fix-deferred-split-unqueue-naming-and-locking
+++ a/mm/migrate.c
@@ -490,7 +490,7 @@ static int __folio_migrate_mapping(struc
folio_test_large_rmappable(folio)) {
if (!folio_ref_freeze(folio, expected_count))
return -EAGAIN;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
folio_ref_unfreeze(folio, expected_count);
}
@@ -515,7 +515,7 @@ static int __folio_migrate_mapping(struc
}
/* Take off deferred split queue while frozen and memcg set */
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
/*
* Now we know that no one else is looking at the folio:
--- a/mm/page_alloc.c~mm-thp-fix-deferred-split-unqueue-naming-and-locking
+++ a/mm/page_alloc.c
@@ -2681,7 +2681,6 @@ void free_unref_folios(struct folio_batc
unsigned long pfn = folio_pfn(folio);
unsigned int order = folio_order(folio);
- folio_undo_large_rmappable(folio);
if (!free_pages_prepare(&folio->page, order))
continue;
/*
--- a/mm/swap.c~mm-thp-fix-deferred-split-unqueue-naming-and-locking
+++ a/mm/swap.c
@@ -121,7 +121,7 @@ void __folio_put(struct folio *folio)
}
page_cache_release(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
free_unref_page(&folio->page, folio_order(folio));
}
@@ -988,7 +988,7 @@ void folios_put_refs(struct folio_batch
free_huge_folio(folio);
continue;
}
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
__page_cache_release(folio, &lruvec, &flags);
if (j != i)
--- a/mm/vmscan.c~mm-thp-fix-deferred-split-unqueue-naming-and-locking
+++ a/mm/vmscan.c
@@ -1476,7 +1476,7 @@ free_it:
*/
nr_reclaimed += nr_pages;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
mem_cgroup_uncharge_folios(&free_folios);
try_to_unmap_flush();
@@ -1864,7 +1864,7 @@ static unsigned int move_folios_to_lru(s
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
_
Patches currently in -mm which might be from hughd(a)google.com are
mm-delete-the-unused-put_pages_list.patch
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 41e192ad2779cae0102879612dfe46726e4396aa
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110529-clapper-deferred-1146@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 41e192ad2779cae0102879612dfe46726e4396aa Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Fri, 18 Oct 2024 04:33:10 +0900
Subject: [PATCH] nilfs2: fix kernel bug due to missing clearing of checked
flag
Syzbot reported that in directory operations after nilfs2 detects
filesystem corruption and degrades to read-only,
__block_write_begin_int(), which is called to prepare block writes, may
fail the BUG_ON check for accesses exceeding the folio/page size,
triggering a kernel bug.
This was found to be because the "checked" flag of a page/folio was not
cleared when it was discarded by nilfs2's own routine, which causes the
sanity check of directory entries to be skipped when the directory
page/folio is reloaded. So, fix that.
This was necessary when the use of nilfs2's own page discard routine was
applied to more than just metadata files.
Link: https://lkml.kernel.org/r/20241017193359.5051-1-konishi.ryusuke@gmail.com
Fixes: 8c26c4e2694a ("nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption")
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+d6ca2daf692c7a82f959(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d6ca2daf692c7a82f959
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5436eb0424bd..10def4b55995 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -401,6 +401,7 @@ void nilfs_clear_folio_dirty(struct folio *folio)
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
+ folio_clear_checked(folio);
head = folio_buffers(folio);
if (head) {
Hi Carlos,
Please pull this branch with changes for xfs for 6.13-rc1.
As usual, I did a test-merge with the main upstream branch as of a few
minutes ago, and didn't see any conflicts. Please let me know if you
encounter any problems.
--D
The following changes since commit 59b723cd2adbac2a34fc8e12c74ae26ae45bf230:
Linux 6.12-rc6 (2024-11-03 14:05:52 -1000)
are available in the Git repository at:
https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux.git tags/perag-xarray-6.13_2024-11-05
for you to fetch changes up to d66496578b2a099ea453f56782f1cd2bf63a8029:
xfs: insert the pag structures into the xarray later (2024-11-05 13:38:27 -0800)
----------------------------------------------------------------
xfs: convert perag to use xarrays [v5.5 01/10]
Convert the xfs_mount perag tree to use an xarray instead of a radix
tree. There should be no functional changes here.
With a bit of luck, this should all go splendidly.
Signed-off-by: Darrick J. Wong <djwong(a)kernel.org>
----------------------------------------------------------------
Christoph Hellwig (22):
xfs: fix superfluous clearing of info->low in __xfs_getfsmap_datadev
xfs: remove the unused pagb_count field in struct xfs_perag
xfs: remove the unused pag_active_wq field in struct xfs_perag
xfs: pass a pag to xfs_difree_inode_chunk
xfs: remove the agno argument to xfs_free_ag_extent
xfs: add xfs_agbno_to_fsb and xfs_agbno_to_daddr helpers
xfs: add a xfs_agino_to_ino helper
xfs: pass a pag to xfs_extent_busy_{search,reuse}
xfs: keep a reference to the pag for busy extents
xfs: remove the mount field from struct xfs_busy_extents
xfs: remove the unused trace_xfs_iwalk_ag trace point
xfs: remove the unused xrep_bmap_walk_rmap trace point
xfs: constify pag arguments to trace points
xfs: pass a perag structure to the xfs_ag_resv_init_error trace point
xfs: pass objects to the xfs_irec_merge_{pre,post} trace points
xfs: pass the iunlink item to the xfs_iunlink_update_dinode trace point
xfs: pass objects to the xrep_ibt_walk_rmap tracepoint
xfs: pass the pag to the trace_xrep_calc_ag_resblks{,_btsize} trace points
xfs: pass the pag to the xrep_newbt_extent_class tracepoints
xfs: convert remaining trace points to pass pag structures
xfs: split xfs_initialize_perag
xfs: insert the pag structures into the xarray later
Darrick J. Wong (1):
xfs: fix simplify extent lookup in xfs_can_free_eofblocks
fs/xfs/libxfs/xfs_ag.c | 135 ++++++++++++++------------
fs/xfs/libxfs/xfs_ag.h | 30 +++++-
fs/xfs/libxfs/xfs_ag_resv.c | 3 +-
fs/xfs/libxfs/xfs_alloc.c | 32 +++----
fs/xfs/libxfs/xfs_alloc.h | 5 +-
fs/xfs/libxfs/xfs_alloc_btree.c | 2 +-
fs/xfs/libxfs/xfs_btree.c | 7 +-
fs/xfs/libxfs/xfs_ialloc.c | 67 ++++++-------
fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +-
fs/xfs/libxfs/xfs_inode_util.c | 4 +-
fs/xfs/libxfs/xfs_refcount.c | 11 +--
fs/xfs/libxfs/xfs_refcount_btree.c | 3 +-
fs/xfs/libxfs/xfs_rmap_btree.c | 2 +-
fs/xfs/scrub/agheader_repair.c | 16 +---
fs/xfs/scrub/alloc_repair.c | 10 +-
fs/xfs/scrub/bmap.c | 5 +-
fs/xfs/scrub/bmap_repair.c | 4 +-
fs/xfs/scrub/common.c | 2 +-
fs/xfs/scrub/cow_repair.c | 18 ++--
fs/xfs/scrub/ialloc.c | 8 +-
fs/xfs/scrub/ialloc_repair.c | 25 ++---
fs/xfs/scrub/newbt.c | 46 ++++-----
fs/xfs/scrub/reap.c | 8 +-
fs/xfs/scrub/refcount_repair.c | 5 +-
fs/xfs/scrub/repair.c | 13 ++-
fs/xfs/scrub/rmap_repair.c | 9 +-
fs/xfs/scrub/trace.h | 161 +++++++++++++++----------------
fs/xfs/xfs_bmap_util.c | 8 +-
fs/xfs/xfs_buf_item_recover.c | 5 +-
fs/xfs/xfs_discard.c | 20 ++--
fs/xfs/xfs_extent_busy.c | 31 +++---
fs/xfs/xfs_extent_busy.h | 14 ++-
fs/xfs/xfs_extfree_item.c | 4 +-
fs/xfs/xfs_filestream.c | 5 +-
fs/xfs/xfs_fsmap.c | 25 ++---
fs/xfs/xfs_health.c | 8 +-
fs/xfs/xfs_inode.c | 5 +-
fs/xfs/xfs_iunlink_item.c | 13 ++-
fs/xfs/xfs_iwalk.c | 17 ++--
fs/xfs/xfs_log_cil.c | 3 +-
fs/xfs/xfs_log_recover.c | 5 +-
fs/xfs/xfs_trace.c | 1 +
fs/xfs/xfs_trace.h | 191 ++++++++++++++++---------------------
fs/xfs/xfs_trans.c | 2 +-
44 files changed, 459 insertions(+), 531 deletions(-)
The patch titled
Subject: util_macros.h: fix/rework find_closest() macros
has been added to the -mm mm-nonmm-unstable branch. Its filename is
util_macrosh-fix-rework-find_closest-macros.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-nonmm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Alexandru Ardelean <aardelean(a)baylibre.com>
Subject: util_macros.h: fix/rework find_closest() macros
Date: Tue, 5 Nov 2024 16:54:05 +0200
A bug was found in the find_closest() (find_closest_descending() is also
affected after some testing), where for certain values with small
progressions, the rounding (done by averaging 2 values) causes an
incorrect index to be returned. The rounding issues occur for
progressions of 1, 2 and 3. It goes away when the progression/interval
between two values is 4 or larger.
It's particularly bad for progressions of 1. For example if there's an
array of 'a = { 1, 2, 3 }', using 'find_closest(2, a ...)' would return 0
(the index of '1'), rather than returning 1 (the index of '2'). This
means that for exact values (with a progression of 1), find_closest() will
misbehave and return the index of the value smaller than the one we're
searching for.
For progressions of 2 and 3, the exact values are obtained correctly; but
values aren't approximated correctly (as one would expect). Starting with
progressions of 4, all seems to be good (one gets what one would expect).
While one could argue that 'find_closest()' should not be used for arrays
with progressions of 1 (i.e. '{1, 2, 3, ...}', the macro should still
behave correctly.
The bug was found while testing the 'drivers/iio/adc/ad7606.c',
specifically the oversampling feature.
For reference, the oversampling values are listed as:
static const unsigned int ad7606_oversampling_avail[7] = {
1, 2, 4, 8, 16, 32, 64,
};
When doing:
1. $ echo 1 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
$ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
1 # this is fine
2. $ echo 2 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
$ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
1 # this is wrong; 2 should be returned here
3. $ echo 3 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
$ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
2 # this is fine
4. $ echo 4 > /sys/bus/iio/devices/iio\:device0/oversampling_ratio
$ cat /sys/bus/iio/devices/iio\:device0/oversampling_ratio
4 # this is fine
And from here-on, the values are as correct (one gets what one would
expect.)
While writing a kunit test for this bug, a peculiar issue was found for the
array in the 'drivers/hwmon/ina2xx.c' & 'drivers/iio/adc/ina2xx-adc.c'
drivers. While running the kunit test (for 'ina226_avg_tab' from these
drivers):
* idx = find_closest([-1 to 2], ina226_avg_tab, ARRAY_SIZE(ina226_avg_tab));
This returns idx == 0, so value.
* idx = find_closest(3, ina226_avg_tab, ARRAY_SIZE(ina226_avg_tab));
This returns idx == 0, value 1; and now one could argue whether 3 is
closer to 4 or to 1. This quirk only appears for value '3' in this
array, but it seems to be a another rounding issue.
* And from 4 onwards the 'find_closest'() works fine (one gets what one
would expect).
This change reworks the find_closest() macros to also check the difference
between the left and right elements when 'x'. If the distance to the right
is smaller (than the distance to the left), the index is incremented by 1.
This also makes redundant the need for using the DIV_ROUND_CLOSEST() macro.
In order to accommodate for any mix of negative + positive values, the
internal variables '__fc_x', '__fc_mid_x', '__fc_left' & '__fc_right' are
forced to 'long' type. This also addresses any potential bugs/issues with
'x' being of an unsigned type. In those situations any comparison between
signed & unsigned would be promoted to a comparison between 2 unsigned
numbers; this is especially annoying when '__fc_left' & '__fc_right'
underflow.
The find_closest_descending() macro was also reworked and duplicated from
the find_closest(), and it is being iterated in reverse. The main reason
for this is to get the same indices as 'find_closest()' (but in reverse).
The comparison for '__fc_right < __fc_left' favors going the array in
ascending order.
For example for array '{ 1024, 512, 256, 128, 64, 16, 4, 1 }' and x = 3, we
get:
__fc_mid_x = 2
__fc_left = -1
__fc_right = -2
Then '__fc_right < __fc_left' evaluates to true and '__fc_i++' becomes 7
which is not quite incorrect, but 3 is closer to 4 than to 1.
This change has been validated with the kunit from the next patch.
Link: https://lkml.kernel.org/r/20241105145406.554365-1-aardelean@baylibre.com
Fixes: 95d119528b0b ("util_macros.h: add find_closest() macro")
Signed-off-by: Alexandru Ardelean <aardelean(a)baylibre.com>
Cc: Bartosz Golaszewski <bartosz.golaszewski(a)linaro.org>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
include/linux/util_macros.h | 56 ++++++++++++++++++++++++----------
1 file changed, 40 insertions(+), 16 deletions(-)
--- a/include/linux/util_macros.h~util_macrosh-fix-rework-find_closest-macros
+++ a/include/linux/util_macros.h
@@ -4,19 +4,6 @@
#include <linux/math.h>
-#define __find_closest(x, a, as, op) \
-({ \
- typeof(as) __fc_i, __fc_as = (as) - 1; \
- typeof(x) __fc_x = (x); \
- typeof(*a) const *__fc_a = (a); \
- for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) { \
- if (__fc_x op DIV_ROUND_CLOSEST(__fc_a[__fc_i] + \
- __fc_a[__fc_i + 1], 2)) \
- break; \
- } \
- (__fc_i); \
-})
-
/**
* find_closest - locate the closest element in a sorted array
* @x: The reference value.
@@ -25,8 +12,27 @@
* @as: Size of 'a'.
*
* Returns the index of the element closest to 'x'.
+ * Note: If using an array of negative numbers (or mixed positive numbers),
+ * then be sure that 'x' is of a signed-type to get good results.
*/
-#define find_closest(x, a, as) __find_closest(x, a, as, <=)
+#define find_closest(x, a, as) \
+({ \
+ typeof(as) __fc_i, __fc_as = (as) - 1; \
+ long __fc_mid_x, __fc_x = (x); \
+ long __fc_left, __fc_right; \
+ typeof(*a) const *__fc_a = (a); \
+ for (__fc_i = 0; __fc_i < __fc_as; __fc_i++) { \
+ __fc_mid_x = (__fc_a[__fc_i] + __fc_a[__fc_i + 1]) / 2; \
+ if (__fc_x <= __fc_mid_x) { \
+ __fc_left = __fc_x - __fc_a[__fc_i]; \
+ __fc_right = __fc_a[__fc_i + 1] - __fc_x; \
+ if (__fc_right < __fc_left) \
+ __fc_i++; \
+ break; \
+ } \
+ } \
+ (__fc_i); \
+})
/**
* find_closest_descending - locate the closest element in a sorted array
@@ -36,9 +42,27 @@
* @as: Size of 'a'.
*
* Similar to find_closest() but 'a' is expected to be sorted in descending
- * order.
+ * order. The iteration is done in reverse order, so that the comparison
+ * of '__fc_right' & '__fc_left' also works for unsigned numbers.
*/
-#define find_closest_descending(x, a, as) __find_closest(x, a, as, >=)
+#define find_closest_descending(x, a, as) \
+({ \
+ typeof(as) __fc_i, __fc_as = (as) - 1; \
+ long __fc_mid_x, __fc_x = (x); \
+ long __fc_left, __fc_right; \
+ typeof(*a) const *__fc_a = (a); \
+ for (__fc_i = __fc_as; __fc_i >= 1; __fc_i--) { \
+ __fc_mid_x = (__fc_a[__fc_i] + __fc_a[__fc_i - 1]) / 2; \
+ if (__fc_x <= __fc_mid_x) { \
+ __fc_left = __fc_x - __fc_a[__fc_i]; \
+ __fc_right = __fc_a[__fc_i - 1] - __fc_x; \
+ if (__fc_right < __fc_left) \
+ __fc_i--; \
+ break; \
+ } \
+ } \
+ (__fc_i); \
+})
/**
* is_insidevar - check if the @ptr points inside the @var memory range.
_
Patches currently in -mm which might be from aardelean(a)baylibre.com are
util_macrosh-fix-rework-find_closest-macros.patch
lib-util_macros_kunit-add-kunit-test-for-util_macrosh.patch
Hi,
I recently installed Arch Linux on an old laptop (Fujitsu-Siemens AMILO Xi 2550) and noticed that:
- when booting Linux from the Arch ISO (kernel version 6.10.10) WIFI is working fine
- after installing Arch Linux from the ISO and booting (kernel version 6.11.5) WIFI was not working properly
By "not working properly" I mean:
downloading small files or installing a few small packages was working ok, but when downloading larger files or installing larger packages with lots of dependencies, the connection would gradually slow down and eventually die.
I reported this on the Arch Linux forum (https://bbs.archlinux.org/viewtopic.php?pid=2206757)
and some helpful memeber suggested that this might be the commit that broke things:
https://github.com/torvalds/linux/commit/02b682d54598f61cbb7dbb14d98ec18011…
An Arch Linux packet manager (gromit) helped me debug this issue by building a couple of kernels that I tested.
- https://pkgbuild.com/\~gromit/linux-bisection-kernels/linux-mainline-6.12rc…
- https://pkgbuild.com/\~gromit/linux-bisection-kernels/linux-mainline-6.12rc…
The first one didn't work, but the second (in which he reverted the commit linked above) did fix my problem.
So, I guess this commit should be investigated by those in the know.
Thats why I also added Andrii and Kalle to CC as they are listed in the commit message.
My network controller: Intel corporation PRO/Wireless 4965 AG or AGN [Kedron] Network Connection (rev 61)
Kernel driver in use: iwl4965
This is my first kernel bug report, hope I did everything right :)
I'm ofc willing to help provide more info and debug locally here to help solve this issue.
Thanks and good night
Alf :)
--
"The generation of random numbers is too important to be left to chance."
This patch addresses a reference count handling issue in the
lpfc_bsg_hba_get_event() function. In the branch
if (evt->reg_id == event_req->ev_reg_id), the function calls
lpfc_bsg_event_ref(), which increments the reference count of the relevant
resources. However, in the branch if (evt_dat == NULL), a goto statement
directly jumps to the function’s final goto block, skipping the release
operations at the end of the function. This means that, if the condition
if (evt_dat == NULL) is met, the function fails to correctly release the
resources acquired by lpfc_bsg_event_ref(), leading to a reference count
leak.
To fix this issue, we added a new block job_error_unref before the
job_error block. When the condition if (evt_dat == NULL) is met, the
function will enter the job_error_unref block, ensuring that the previously
allocated resources are properly released, thereby preventing the reference
count leak.
This bug was identified by an experimental static analysis tool developed
by our team. The tool specializes in analyzing reference count operations
and detecting potential issues where resources are not properly managed.
In this case, the tool flagged the missing release operation as a
potential problem, which led to the development of this patch.
Fixes: 4cc0e56e977f ("[SCSI] lpfc 8.3.8: (BSG3) Modify BSG commands to operate asynchronously")
Cc: stable(a)vger.kernel.org
Signed-off-by: Qiu-ji Chen <chenqiuji666(a)gmail.com>
---
drivers/scsi/lpfc/lpfc_bsg.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c
index 85059b83ea6b..832a5a6dd85f 100644
--- a/drivers/scsi/lpfc/lpfc_bsg.c
+++ b/drivers/scsi/lpfc/lpfc_bsg.c
@@ -1294,7 +1294,7 @@ lpfc_bsg_hba_get_event(struct bsg_job *job)
if (evt_dat == NULL) {
bsg_reply->reply_payload_rcv_len = 0;
rc = -ENOENT;
- goto job_error;
+ goto job_error_unref;
}
if (evt_dat->len > job->request_payload.payload_len) {
@@ -1329,6 +1329,10 @@ lpfc_bsg_hba_get_event(struct bsg_job *job)
bsg_reply->reply_payload_rcv_len);
return 0;
+job_err_unref:
+ spin_lock_irqsave(&phba->ct_ev_lock, flags);
+ lpfc_bsg_event_unref(evt);
+ spin_unlock_irqrestore(&phba->ct_ev_lock, flags);
job_error:
job->dd_data = NULL;
bsg_reply->result = rc;
--
2.34.1
From: Yihan Zhu <Yihan.Zhu(a)amd.com>
[Why]
No check on head pipe during the dml to dc hw mapping will allow illegal
pipe usage. This will result in a wrong pipe topology to cause mpcc tree
totally mess up then cause a display hang.
[How]
Avoid to use the pipe is head in all check and avoid ODM slice during
preferred pipe check.
Cc: stable(a)vger.kernel.org
Reviewed-by: Nicholas Kazlauskas <nicholas.kazlauskas(a)amd.com>
Signed-off-by: Yihan Zhu <Yihan.Zhu(a)amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
---
.../display/dc/dml2/dml2_dc_resource_mgmt.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_dc_resource_mgmt.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_dc_resource_mgmt.c
index 6eccf0241d85..9be9ed7e01d3 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_dc_resource_mgmt.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_dc_resource_mgmt.c
@@ -258,12 +258,23 @@ static unsigned int find_preferred_pipe_candidates(const struct dc_state *existi
* However this condition comes with a caveat. We need to ignore pipes that will
* require a change in OPP but still have the same stream id. For example during
* an MPC to ODM transiton.
+ *
+ * Adding check to avoid pipe select on the head pipe by utilizing dc resource
+ * helper function resource_get_primary_dpp_pipe and comparing the pipe index.
*/
if (existing_state) {
for (i = 0; i < pipe_count; i++) {
if (existing_state->res_ctx.pipe_ctx[i].stream && existing_state->res_ctx.pipe_ctx[i].stream->stream_id == stream_id) {
+ struct pipe_ctx *head_pipe =
+ resource_get_primary_dpp_pipe(&existing_state->res_ctx.pipe_ctx[i]);
+
+ // we should always respect the head pipe from selection
+ if (head_pipe && head_pipe->pipe_idx == i)
+ continue;
if (existing_state->res_ctx.pipe_ctx[i].plane_res.hubp &&
- existing_state->res_ctx.pipe_ctx[i].plane_res.hubp->opp_id != i)
+ existing_state->res_ctx.pipe_ctx[i].plane_res.hubp->opp_id != i &&
+ (existing_state->res_ctx.pipe_ctx[i].prev_odm_pipe ||
+ existing_state->res_ctx.pipe_ctx[i].next_odm_pipe))
continue;
preferred_pipe_candidates[num_preferred_candidates++] = i;
@@ -292,6 +303,12 @@ static unsigned int find_last_resort_pipe_candidates(const struct dc_state *exis
*/
if (existing_state) {
for (i = 0; i < pipe_count; i++) {
+ struct pipe_ctx *head_pipe =
+ resource_get_primary_dpp_pipe(&existing_state->res_ctx.pipe_ctx[i]);
+
+ // we should always respect the head pipe from selection
+ if (head_pipe && head_pipe->pipe_idx == i)
+ continue;
if ((existing_state->res_ctx.pipe_ctx[i].plane_res.hubp &&
existing_state->res_ctx.pipe_ctx[i].plane_res.hubp->opp_id != i) ||
existing_state->res_ctx.pipe_ctx[i].stream_res.tg)
--
2.46.1
From: Ryan Seto <ryanseto(a)amd.com>
[Why]
In the case where a dml allocation fails for any reason, the
current state's dml contexts would no longer be valid. Then
subsequent calls dc_state_copy_internal would shallow copy
invalid memory and if the new state was released, a double
free would occur.
[How]
Reset dml pointers in new_state to NULL and avoid invalid
pointer
Cc: stable(a)vger.kernel.org
Reviewed-by: Dillon Varone <dillon.varone(a)amd.com>
Signed-off-by: Ryan Seto <ryanseto(a)amd.com>
Signed-off-by: Hamza Mahfooz <hamza.mahfooz(a)amd.com>
---
drivers/gpu/drm/amd/display/dc/core/dc_state.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_state.c b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
index 2597e3fd562b..e006f816ff2f 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_state.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_state.c
@@ -265,6 +265,9 @@ struct dc_state *dc_state_create_copy(struct dc_state *src_state)
dc_state_copy_internal(new_state, src_state);
#ifdef CONFIG_DRM_AMD_DC_FP
+ new_state->bw_ctx.dml2 = NULL;
+ new_state->bw_ctx.dml2_dc_power_source = NULL;
+
if (src_state->bw_ctx.dml2 &&
!dml2_create_copy(&new_state->bw_ctx.dml2, src_state->bw_ctx.dml2)) {
dc_state_release(new_state);
--
2.46.1
This patch fixes a reference count handling issue in the function
lpfc_bsg_hba_set_event(). In the branch
if (evt->reg_id == event_req->ev_reg_id), the function calls
lpfc_bsg_event_ref(), which increments the reference count of the
associated resource. However, in the subsequent branch
if (&evt->node == &phba->ct_ev_waiters), a new evt is allocated, but the
old evt should be released at this point. Failing to do so could lead to
issues.
To resolve this issue, we added a release instruction at the beginning of
the next branch if (&evt->node == &phba->ct_ev_waiters), ensuring that the
resources allocated in the previous branch are properly released, thereby
preventing a reference count leak.
This bug was identified by an experimental static analysis tool developed
by our team. The tool specializes in analyzing reference count operations
and detecting potential issues where resources are not properly managed.
In this case, the tool flagged the missing release operation as a
potential problem, which led to the development of this patch.
Fixes: 4cc0e56e977f ("[SCSI] lpfc 8.3.8: (BSG3) Modify BSG commands to operate asynchronously")
Cc: stable(a)vger.kernel.org
Signed-off-by: Qiu-ji Chen <chenqiuji666(a)gmail.com>
---
drivers/scsi/lpfc/lpfc_bsg.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c
index 85059b83ea6b..3a65270c5584 100644
--- a/drivers/scsi/lpfc/lpfc_bsg.c
+++ b/drivers/scsi/lpfc/lpfc_bsg.c
@@ -1200,6 +1200,9 @@ lpfc_bsg_hba_set_event(struct bsg_job *job)
spin_unlock_irqrestore(&phba->ct_ev_lock, flags);
if (&evt->node == &phba->ct_ev_waiters) {
+ spin_lock_irqsave(&phba->ct_ev_lock, flags);
+ lpfc_bsg_event_unref(evt);
+ spin_unlock_irqrestore(&phba->ct_ev_lock, flags);
/* no event waiting struct yet - first call */
dd_data = kmalloc(sizeof(struct bsg_job_data), GFP_KERNEL);
if (dd_data == NULL) {
--
2.34.1
The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: a5ca1dc46a6b610dd4627d8b633d6c84f9724ef0
Gitweb: https://git.kernel.org/tip/a5ca1dc46a6b610dd4627d8b633d6c84f9724ef0
Author: Mario Limonciello <mario.limonciello(a)amd.com>
AuthorDate: Tue, 05 Nov 2024 10:02:34 -06:00
Committer: Borislav Petkov (AMD) <bp(a)alien8.de>
CommitterDate: Tue, 05 Nov 2024 17:48:32 +01:00
x86/CPU/AMD: Clear virtualized VMLOAD/VMSAVE on Zen4 client
A number of Zen4 client SoCs advertise the ability to use virtualized
VMLOAD/VMSAVE, but using these instructions is reported to be a cause
of a random host reboot.
These instructions aren't intended to be advertised on Zen4 client
so clear the capability.
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp(a)alien8.de>
Cc: stable(a)vger.kernel.org
Link: https://bugzilla.kernel.org/show_bug.cgi?id=219009
---
arch/x86/kernel/cpu/amd.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fab5cae..823f44f 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -924,6 +924,17 @@ static void init_amd_zen4(struct cpuinfo_x86 *c)
{
if (!cpu_has(c, X86_FEATURE_HYPERVISOR))
msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
+
+ /*
+ * These Zen4 SoCs advertise support for virtualized VMLOAD/VMSAVE
+ * in some BIOS versions but they can lead to random host reboots.
+ */
+ switch (c->x86_model) {
+ case 0x18 ... 0x1f:
+ case 0x60 ... 0x7f:
+ clear_cpu_cap(c, X86_FEATURE_V_VMSAVE_VMLOAD);
+ break;
+ }
}
static void init_amd_zen5(struct cpuinfo_x86 *c)
When support for PCC Opregion was added, validation of field_obj
was missed.
Based on the acpi_ev_address_space_dispatch function description,
field_obj can be NULL, and also when acpi_ev_address_space_dispatch
is called in the acpi_ex_region_read() NULL is passed as field_obj.
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Fixes: 0acf24ad7e10 ("ACPICA: Add support for PCC Opregion special context data")
Cc: stable(a)vger.kernel.org
Signed-off-by: George Rurikov <grurikov(a)gmail.com>
---
drivers/acpi/acpica/evregion.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
diff --git a/drivers/acpi/acpica/evregion.c b/drivers/acpi/acpica/evregion.c
index cf53b9535f18..03e8b6f186af 100644
--- a/drivers/acpi/acpica/evregion.c
+++ b/drivers/acpi/acpica/evregion.c
@@ -164,13 +164,17 @@ acpi_ev_address_space_dispatch(union acpi_operand_object *region_obj,
}
if (region_obj->region.space_id == ACPI_ADR_SPACE_PLATFORM_COMM) {
- struct acpi_pcc_info *ctx =
- handler_desc->address_space.context;
-
- ctx->internal_buffer =
- field_obj->field.internal_pcc_buffer;
- ctx->length = (u16)region_obj->region.length;
- ctx->subspace_id = (u8)region_obj->region.address;
+ if (field_obj != NULL) {
+ struct acpi_pcc_info *ctx =
+ handler_desc->address_space.context;
+
+ ctx->internal_buffer =
+ field_obj->field.internal_pcc_buffer;
+ ctx->length = (u16)region_obj->region.length;
+ ctx->subspace_id = (u8)region_obj->region.address;
+ } else {
+ return_ACPI_STATUS(AE_ERROR);
+ }
}
if (region_obj->region.space_id ==
--
2.34.1
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 6575b268157f37929948a8d1f3bafb3d7c055bc1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110551-fragrance-deodorant-e7fa@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6575b268157f37929948a8d1f3bafb3d7c055bc1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Fri, 25 Oct 2024 12:32:55 -0700
Subject: [PATCH] cxl/port: Fix CXL port initialization order when the
subsystem is built-in
When the CXL subsystem is built-in the module init order is determined
by Makefile order. That order violates expectations. The expectation is
that cxl_acpi and cxl_mem can race to attach. If cxl_acpi wins the race,
cxl_mem will find the enabled CXL root ports it needs. If cxl_acpi loses
the race it will retrigger cxl_mem to attach via cxl_bus_rescan(). That
flow only works if cxl_acpi can assume ports are enabled immediately
upon cxl_acpi_probe() return. That in turn can only happen in the
CONFIG_CXL_ACPI=y case if the cxl_port driver is registered before
cxl_acpi_probe() runs.
Fix up the order to prevent initialization failures. Ensure that
cxl_port is built-in when cxl_acpi is also built-in, arrange for
Makefile order to resolve the subsys_initcall() order of cxl_port and
cxl_acpi, and arrange for Makefile order to resolve the
device_initcall() (module_init()) order of the remaining objects.
As for what contributed to this not being found earlier, the CXL
regression environment, cxl_test, builds all CXL functionality as a
module to allow to symbol mocking and other dynamic reload tests. As a
result there is no regression coverage for the built-in case.
Reported-by: Gregory Price <gourry(a)gourry.net>
Closes: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net
Tested-by: Gregory Price <gourry(a)gourry.net>
Fixes: 8dd2bc0f8e02 ("cxl/mem: Add the cxl_mem driver")
Cc: stable(a)vger.kernel.org
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron(a)huawei.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Alison Schofield <alison.schofield(a)intel.com>
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Reviewed-by: Ira Weiny <ira.weiny(a)intel.com>
Tested-by: Alejandro Lucero <alucerop(a)amd.com>
Reviewed-by: Alejandro Lucero <alucerop(a)amd.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Link: https://patch.msgid.link/172988474904.476062.7961350937442459266.stgit@dwil…
Signed-off-by: Ira Weiny <ira.weiny(a)intel.com>
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 29c192f20082..876469e23f7a 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -60,6 +60,7 @@ config CXL_ACPI
default CXL_BUS
select ACPI_TABLE_LIB
select ACPI_HMAT
+ select CXL_PORT
help
Enable support for host managed device memory (HDM) resources
published by a platform's ACPI CXL memory layout description. See
diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile
index db321f48ba52..2caa90fa4bf2 100644
--- a/drivers/cxl/Makefile
+++ b/drivers/cxl/Makefile
@@ -1,13 +1,21 @@
# SPDX-License-Identifier: GPL-2.0
+
+# Order is important here for the built-in case:
+# - 'core' first for fundamental init
+# - 'port' before platform root drivers like 'acpi' so that CXL-root ports
+# are immediately enabled
+# - 'mem' and 'pmem' before endpoint drivers so that memdevs are
+# immediately enabled
+# - 'pci' last, also mirrors the hardware enumeration hierarchy
obj-y += core/
-obj-$(CONFIG_CXL_PCI) += cxl_pci.o
-obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_PORT) += cxl_port.o
obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o
obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o
-obj-$(CONFIG_CXL_PORT) += cxl_port.o
+obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_PCI) += cxl_pci.o
-cxl_mem-y := mem.o
-cxl_pci-y := pci.o
+cxl_port-y := port.o
cxl_acpi-y := acpi.o
cxl_pmem-y := pmem.o security.o
-cxl_port-y := port.o
+cxl_mem-y := mem.o
+cxl_pci-y := pci.o
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 861dde65768f..9dc394295e1f 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -208,7 +208,22 @@ static struct cxl_driver cxl_port_driver = {
},
};
-module_cxl_driver(cxl_port_driver);
+static int __init cxl_port_init(void)
+{
+ return cxl_driver_register(&cxl_port_driver);
+}
+/*
+ * Be ready to immediately enable ports emitted by the platform CXL root
+ * (e.g. cxl_acpi) when CONFIG_CXL_PORT=y.
+ */
+subsys_initcall(cxl_port_init);
+
+static void __exit cxl_port_exit(void)
+{
+ cxl_driver_unregister(&cxl_port_driver);
+}
+module_exit(cxl_port_exit);
+
MODULE_DESCRIPTION("CXL: Port enumeration and services");
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(CXL);
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 6575b268157f37929948a8d1f3bafb3d7c055bc1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110550-chamomile-zit-2c77@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6575b268157f37929948a8d1f3bafb3d7c055bc1 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Fri, 25 Oct 2024 12:32:55 -0700
Subject: [PATCH] cxl/port: Fix CXL port initialization order when the
subsystem is built-in
When the CXL subsystem is built-in the module init order is determined
by Makefile order. That order violates expectations. The expectation is
that cxl_acpi and cxl_mem can race to attach. If cxl_acpi wins the race,
cxl_mem will find the enabled CXL root ports it needs. If cxl_acpi loses
the race it will retrigger cxl_mem to attach via cxl_bus_rescan(). That
flow only works if cxl_acpi can assume ports are enabled immediately
upon cxl_acpi_probe() return. That in turn can only happen in the
CONFIG_CXL_ACPI=y case if the cxl_port driver is registered before
cxl_acpi_probe() runs.
Fix up the order to prevent initialization failures. Ensure that
cxl_port is built-in when cxl_acpi is also built-in, arrange for
Makefile order to resolve the subsys_initcall() order of cxl_port and
cxl_acpi, and arrange for Makefile order to resolve the
device_initcall() (module_init()) order of the remaining objects.
As for what contributed to this not being found earlier, the CXL
regression environment, cxl_test, builds all CXL functionality as a
module to allow to symbol mocking and other dynamic reload tests. As a
result there is no regression coverage for the built-in case.
Reported-by: Gregory Price <gourry(a)gourry.net>
Closes: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net
Tested-by: Gregory Price <gourry(a)gourry.net>
Fixes: 8dd2bc0f8e02 ("cxl/mem: Add the cxl_mem driver")
Cc: stable(a)vger.kernel.org
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Jonathan Cameron <jonathan.cameron(a)huawei.com>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Alison Schofield <alison.schofield(a)intel.com>
Cc: Vishal Verma <vishal.l.verma(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Reviewed-by: Ira Weiny <ira.weiny(a)intel.com>
Tested-by: Alejandro Lucero <alucerop(a)amd.com>
Reviewed-by: Alejandro Lucero <alucerop(a)amd.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Link: https://patch.msgid.link/172988474904.476062.7961350937442459266.stgit@dwil…
Signed-off-by: Ira Weiny <ira.weiny(a)intel.com>
diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 29c192f20082..876469e23f7a 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -60,6 +60,7 @@ config CXL_ACPI
default CXL_BUS
select ACPI_TABLE_LIB
select ACPI_HMAT
+ select CXL_PORT
help
Enable support for host managed device memory (HDM) resources
published by a platform's ACPI CXL memory layout description. See
diff --git a/drivers/cxl/Makefile b/drivers/cxl/Makefile
index db321f48ba52..2caa90fa4bf2 100644
--- a/drivers/cxl/Makefile
+++ b/drivers/cxl/Makefile
@@ -1,13 +1,21 @@
# SPDX-License-Identifier: GPL-2.0
+
+# Order is important here for the built-in case:
+# - 'core' first for fundamental init
+# - 'port' before platform root drivers like 'acpi' so that CXL-root ports
+# are immediately enabled
+# - 'mem' and 'pmem' before endpoint drivers so that memdevs are
+# immediately enabled
+# - 'pci' last, also mirrors the hardware enumeration hierarchy
obj-y += core/
-obj-$(CONFIG_CXL_PCI) += cxl_pci.o
-obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_PORT) += cxl_port.o
obj-$(CONFIG_CXL_ACPI) += cxl_acpi.o
obj-$(CONFIG_CXL_PMEM) += cxl_pmem.o
-obj-$(CONFIG_CXL_PORT) += cxl_port.o
+obj-$(CONFIG_CXL_MEM) += cxl_mem.o
+obj-$(CONFIG_CXL_PCI) += cxl_pci.o
-cxl_mem-y := mem.o
-cxl_pci-y := pci.o
+cxl_port-y := port.o
cxl_acpi-y := acpi.o
cxl_pmem-y := pmem.o security.o
-cxl_port-y := port.o
+cxl_mem-y := mem.o
+cxl_pci-y := pci.o
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 861dde65768f..9dc394295e1f 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -208,7 +208,22 @@ static struct cxl_driver cxl_port_driver = {
},
};
-module_cxl_driver(cxl_port_driver);
+static int __init cxl_port_init(void)
+{
+ return cxl_driver_register(&cxl_port_driver);
+}
+/*
+ * Be ready to immediately enable ports emitted by the platform CXL root
+ * (e.g. cxl_acpi) when CONFIG_CXL_PORT=y.
+ */
+subsys_initcall(cxl_port_init);
+
+static void __exit cxl_port_exit(void)
+{
+ cxl_driver_unregister(&cxl_port_driver);
+}
+module_exit(cxl_port_exit);
+
MODULE_DESCRIPTION("CXL: Port enumeration and services");
MODULE_LICENSE("GPL v2");
MODULE_IMPORT_NS(CXL);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 101c268bd2f37e965a5468353e62d154db38838e
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110539-swooned-cold-53f5@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 101c268bd2f37e965a5468353e62d154db38838e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams(a)intel.com>
Date: Tue, 22 Oct 2024 18:43:49 -0700
Subject: [PATCH] cxl/port: Fix use-after-free, permit out-of-order decoder
shutdown
In support of investigating an initialization failure report [1],
cxl_test was updated to register mock memory-devices after the mock
root-port/bus device had been registered. That led to cxl_test crashing
with a use-after-free bug with the following signature:
cxl_port_attach_region: cxl region3: cxl_host_bridge.0:port3 decoder3.0 add: mem0:decoder7.0 @ 0 next: cxl_switch_uport.0 nr_eps: 1 nr_targets: 1
cxl_port_attach_region: cxl region3: cxl_host_bridge.0:port3 decoder3.0 add: mem4:decoder14.0 @ 1 next: cxl_switch_uport.0 nr_eps: 2 nr_targets: 1
cxl_port_setup_targets: cxl region3: cxl_switch_uport.0:port6 target[0] = cxl_switch_dport.0 for mem0:decoder7.0 @ 0
1) cxl_port_setup_targets: cxl region3: cxl_switch_uport.0:port6 target[1] = cxl_switch_dport.4 for mem4:decoder14.0 @ 1
[..]
cxld_unregister: cxl decoder14.0:
cxl_region_decode_reset: cxl_region region3:
mock_decoder_reset: cxl_port port3: decoder3.0 reset
2) mock_decoder_reset: cxl_port port3: decoder3.0: out of order reset, expected decoder3.1
cxl_endpoint_decoder_release: cxl decoder14.0:
[..]
cxld_unregister: cxl decoder7.0:
3) cxl_region_decode_reset: cxl_region region3:
Oops: general protection fault, probably for non-canonical address 0x6b6b6b6b6b6b6bc3: 0000 [#1] PREEMPT SMP PTI
[..]
RIP: 0010:to_cxl_port+0x8/0x60 [cxl_core]
[..]
Call Trace:
<TASK>
cxl_region_decode_reset+0x69/0x190 [cxl_core]
cxl_region_detach+0xe8/0x210 [cxl_core]
cxl_decoder_kill_region+0x27/0x40 [cxl_core]
cxld_unregister+0x5d/0x60 [cxl_core]
At 1) a region has been established with 2 endpoint decoders (7.0 and
14.0). Those endpoints share a common switch-decoder in the topology
(3.0). At teardown, 2), decoder14.0 is the first to be removed and hits
the "out of order reset case" in the switch decoder. The effect though
is that region3 cleanup is aborted leaving it in-tact and
referencing decoder14.0. At 3) the second attempt to teardown region3
trips over the stale decoder14.0 object which has long since been
deleted.
The fix here is to recognize that the CXL specification places no
mandate on in-order shutdown of switch-decoders, the driver enforces
in-order allocation, and hardware enforces in-order commit. So, rather
than fail and leave objects dangling, always remove them.
In support of making cxl_region_decode_reset() always succeed,
cxl_region_invalidate_memregion() failures are turned into warnings.
Crashing the kernel is ok there since system integrity is at risk if
caches cannot be managed around physical address mutation events like
CXL region destruction.
A new device_for_each_child_reverse_from() is added to cleanup
port->commit_end after all dependent decoders have been disabled. In
other words if decoders are allocated 0->1->2 and disabled 1->2->0 then
port->commit_end only decrements from 2 after 2 has been disabled, and
it decrements all the way to zero since 1 was disabled previously.
Link: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net [1]
Cc: stable(a)vger.kernel.org
Fixes: 176baefb2eb5 ("cxl/hdm: Commit decoder state to hardware")
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Davidlohr Bueso <dave(a)stgolabs.net>
Cc: Dave Jiang <dave.jiang(a)intel.com>
Cc: Alison Schofield <alison.schofield(a)intel.com>
Cc: Ira Weiny <ira.weiny(a)intel.com>
Cc: Zijun Hu <quic_zijuhu(a)quicinc.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Reviewed-by: Ira Weiny <ira.weiny(a)intel.com>
Link: https://patch.msgid.link/172964782781.81806.17902885593105284330.stgit@dwil…
Signed-off-by: Ira Weiny <ira.weiny(a)intel.com>
diff --git a/drivers/base/core.c b/drivers/base/core.c
index a4c853411a6b..e42f1ad73078 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4037,6 +4037,41 @@ int device_for_each_child_reverse(struct device *parent, void *data,
}
EXPORT_SYMBOL_GPL(device_for_each_child_reverse);
+/**
+ * device_for_each_child_reverse_from - device child iterator in reversed order.
+ * @parent: parent struct device.
+ * @from: optional starting point in child list
+ * @fn: function to be called for each device.
+ * @data: data for the callback.
+ *
+ * Iterate over @parent's child devices, starting at @from, and call @fn
+ * for each, passing it @data. This helper is identical to
+ * device_for_each_child_reverse() when @from is NULL.
+ *
+ * @fn is checked each iteration. If it returns anything other than 0,
+ * iteration stop and that value is returned to the caller of
+ * device_for_each_child_reverse_from();
+ */
+int device_for_each_child_reverse_from(struct device *parent,
+ struct device *from, const void *data,
+ int (*fn)(struct device *, const void *))
+{
+ struct klist_iter i;
+ struct device *child;
+ int error = 0;
+
+ if (!parent->p)
+ return 0;
+
+ klist_iter_init_node(&parent->p->klist_children, &i,
+ (from ? &from->p->knode_parent : NULL));
+ while ((child = prev_device(&i)) && !error)
+ error = fn(child, data);
+ klist_iter_exit(&i);
+ return error;
+}
+EXPORT_SYMBOL_GPL(device_for_each_child_reverse_from);
+
/**
* device_find_child - device iterator for locating a particular device.
* @parent: parent struct device
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 3df10517a327..223c273c0cd1 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -712,7 +712,44 @@ static int cxl_decoder_commit(struct cxl_decoder *cxld)
return 0;
}
-static int cxl_decoder_reset(struct cxl_decoder *cxld)
+static int commit_reap(struct device *dev, const void *data)
+{
+ struct cxl_port *port = to_cxl_port(dev->parent);
+ struct cxl_decoder *cxld;
+
+ if (!is_switch_decoder(dev) && !is_endpoint_decoder(dev))
+ return 0;
+
+ cxld = to_cxl_decoder(dev);
+ if (port->commit_end == cxld->id &&
+ ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)) {
+ port->commit_end--;
+ dev_dbg(&port->dev, "reap: %s commit_end: %d\n",
+ dev_name(&cxld->dev), port->commit_end);
+ }
+
+ return 0;
+}
+
+void cxl_port_commit_reap(struct cxl_decoder *cxld)
+{
+ struct cxl_port *port = to_cxl_port(cxld->dev.parent);
+
+ lockdep_assert_held_write(&cxl_region_rwsem);
+
+ /*
+ * Once the highest committed decoder is disabled, free any other
+ * decoders that were pinned allocated by out-of-order release.
+ */
+ port->commit_end--;
+ dev_dbg(&port->dev, "reap: %s commit_end: %d\n", dev_name(&cxld->dev),
+ port->commit_end);
+ device_for_each_child_reverse_from(&port->dev, &cxld->dev, NULL,
+ commit_reap);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_port_commit_reap, CXL);
+
+static void cxl_decoder_reset(struct cxl_decoder *cxld)
{
struct cxl_port *port = to_cxl_port(cxld->dev.parent);
struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev);
@@ -721,14 +758,14 @@ static int cxl_decoder_reset(struct cxl_decoder *cxld)
u32 ctrl;
if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
- return 0;
+ return;
- if (port->commit_end != id) {
+ if (port->commit_end == id)
+ cxl_port_commit_reap(cxld);
+ else
dev_dbg(&port->dev,
"%s: out of order reset, expected decoder%d.%d\n",
dev_name(&cxld->dev), port->id, port->commit_end);
- return -EBUSY;
- }
down_read(&cxl_dpa_rwsem);
ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id));
@@ -741,7 +778,6 @@ static int cxl_decoder_reset(struct cxl_decoder *cxld)
writel(0, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id));
up_read(&cxl_dpa_rwsem);
- port->commit_end--;
cxld->flags &= ~CXL_DECODER_F_ENABLE;
/* Userspace is now responsible for reconfiguring this decoder */
@@ -751,8 +787,6 @@ static int cxl_decoder_reset(struct cxl_decoder *cxld)
cxled = to_cxl_endpoint_decoder(&cxld->dev);
cxled->state = CXL_DECODER_STATE_MANUAL;
}
-
- return 0;
}
static int cxl_setup_hdm_decoder_from_dvsec(
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index e701e4b04032..3478d2058303 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -232,8 +232,8 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
"Bypassing cpu_cache_invalidate_memregion() for testing!\n");
return 0;
} else {
- dev_err(&cxlr->dev,
- "Failed to synchronize CPU cache state\n");
+ dev_WARN(&cxlr->dev,
+ "Failed to synchronize CPU cache state\n");
return -ENXIO;
}
}
@@ -242,19 +242,17 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
return 0;
}
-static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
+static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
{
struct cxl_region_params *p = &cxlr->params;
- int i, rc = 0;
+ int i;
/*
- * Before region teardown attempt to flush, and if the flush
- * fails cancel the region teardown for data consistency
- * concerns
+ * Before region teardown attempt to flush, evict any data cached for
+ * this region, or scream loudly about missing arch / platform support
+ * for CXL teardown.
*/
- rc = cxl_region_invalidate_memregion(cxlr);
- if (rc)
- return rc;
+ cxl_region_invalidate_memregion(cxlr);
for (i = count - 1; i >= 0; i--) {
struct cxl_endpoint_decoder *cxled = p->targets[i];
@@ -277,23 +275,17 @@ static int cxl_region_decode_reset(struct cxl_region *cxlr, int count)
cxl_rr = cxl_rr_load(iter, cxlr);
cxld = cxl_rr->decoder;
if (cxld->reset)
- rc = cxld->reset(cxld);
- if (rc)
- return rc;
+ cxld->reset(cxld);
set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
}
endpoint_reset:
- rc = cxled->cxld.reset(&cxled->cxld);
- if (rc)
- return rc;
+ cxled->cxld.reset(&cxled->cxld);
set_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
}
/* all decoders associated with this region have been torn down */
clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags);
-
- return 0;
}
static int commit_decoder(struct cxl_decoder *cxld)
@@ -409,16 +401,8 @@ static ssize_t commit_store(struct device *dev, struct device_attribute *attr,
* still pending.
*/
if (p->state == CXL_CONFIG_RESET_PENDING) {
- rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
- /*
- * Revert to committed since there may still be active
- * decoders associated with this region, or move forward
- * to active to mark the reset successful
- */
- if (rc)
- p->state = CXL_CONFIG_COMMIT;
- else
- p->state = CXL_CONFIG_ACTIVE;
+ cxl_region_decode_reset(cxlr, p->interleave_ways);
+ p->state = CXL_CONFIG_ACTIVE;
}
}
@@ -2054,13 +2038,7 @@ static int cxl_region_detach(struct cxl_endpoint_decoder *cxled)
get_device(&cxlr->dev);
if (p->state > CXL_CONFIG_ACTIVE) {
- /*
- * TODO: tear down all impacted regions if a device is
- * removed out of order
- */
- rc = cxl_region_decode_reset(cxlr, p->interleave_ways);
- if (rc)
- goto out;
+ cxl_region_decode_reset(cxlr, p->interleave_ways);
p->state = CXL_CONFIG_ACTIVE;
}
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 0d8b810a51f0..5406e3ab3d4a 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -359,7 +359,7 @@ struct cxl_decoder {
struct cxl_region *region;
unsigned long flags;
int (*commit)(struct cxl_decoder *cxld);
- int (*reset)(struct cxl_decoder *cxld);
+ void (*reset)(struct cxl_decoder *cxld);
};
/*
@@ -730,6 +730,7 @@ static inline bool is_cxl_root(struct cxl_port *port)
int cxl_num_decoders_committed(struct cxl_port *port);
bool is_cxl_port(const struct device *dev);
struct cxl_port *to_cxl_port(const struct device *dev);
+void cxl_port_commit_reap(struct cxl_decoder *cxld);
struct pci_bus;
int devm_cxl_register_pci_bus(struct device *host, struct device *uport_dev,
struct pci_bus *bus);
diff --git a/include/linux/device.h b/include/linux/device.h
index b4bde8d22697..667cb6db9019 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1078,6 +1078,9 @@ int device_for_each_child(struct device *dev, void *data,
int (*fn)(struct device *dev, void *data));
int device_for_each_child_reverse(struct device *dev, void *data,
int (*fn)(struct device *dev, void *data));
+int device_for_each_child_reverse_from(struct device *parent,
+ struct device *from, const void *data,
+ int (*fn)(struct device *, const void *));
struct device *device_find_child(struct device *dev, void *data,
int (*match)(struct device *dev, void *data));
struct device *device_find_child_by_name(struct device *parent,
diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c
index 90d5afd52dd0..c5bbd89b3192 100644
--- a/tools/testing/cxl/test/cxl.c
+++ b/tools/testing/cxl/test/cxl.c
@@ -693,26 +693,22 @@ static int mock_decoder_commit(struct cxl_decoder *cxld)
return 0;
}
-static int mock_decoder_reset(struct cxl_decoder *cxld)
+static void mock_decoder_reset(struct cxl_decoder *cxld)
{
struct cxl_port *port = to_cxl_port(cxld->dev.parent);
int id = cxld->id;
if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0)
- return 0;
+ return;
dev_dbg(&port->dev, "%s reset\n", dev_name(&cxld->dev));
- if (port->commit_end != id) {
+ if (port->commit_end == id)
+ cxl_port_commit_reap(cxld);
+ else
dev_dbg(&port->dev,
"%s: out of order reset, expected decoder%d.%d\n",
dev_name(&cxld->dev), port->id, port->commit_end);
- return -EBUSY;
- }
-
- port->commit_end--;
cxld->flags &= ~CXL_DECODER_F_ENABLE;
-
- return 0;
}
static void default_mock_decoder(struct cxl_decoder *cxld)
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x f8c879192465d9f328cb0df07208ef077c560bb1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110503-registry-reheat-cd11@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f8c879192465d9f328cb0df07208ef077c560bb1 Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson(a)oss.qualcomm.com>
Date: Wed, 23 Oct 2024 17:24:33 +0000
Subject: [PATCH] soc: qcom: pmic_glink: Handle GLINK intent allocation
rejections
Some versions of the pmic_glink firmware does not allow dynamic GLINK
intent allocations, attempting to send a message before the firmware has
allocated its receive buffers and announced these intent allocations
will fail. When this happens something like this showns up in the log:
pmic_glink_altmode.pmic_glink_altmode pmic_glink.altmode.0: failed to send altmode request: 0x10 (-125)
pmic_glink_altmode.pmic_glink_altmode pmic_glink.altmode.0: failed to request altmode notifications: -125
ucsi_glink.pmic_glink_ucsi pmic_glink.ucsi.0: failed to send UCSI read request: -125
qcom_battmgr.pmic_glink_power_supply pmic_glink.power-supply.0: failed to request power notifications
GLINK has been updated to distinguish between the cases where the remote
is going down (-ECANCELED) and the intent allocation being rejected
(-EAGAIN).
Retry the send until intent buffers becomes available, or an actual
error occur.
To avoid infinitely waiting for the firmware in the event that this
misbehaves and no intents arrive, an arbitrary 5 second timeout is
used.
This patch was developed with input from Chris Lew.
Reported-by: Johan Hovold <johan(a)kernel.org>
Closes: https://lore.kernel.org/all/Zqet8iInnDhnxkT9@hovoldconsulting.com/#t
Cc: stable(a)vger.kernel.org # rpmsg: glink: Handle rejected intent request better
Fixes: 58ef4ece1e41 ("soc: qcom: pmic_glink: Introduce base PMIC GLINK driver")
Tested-by: Johan Hovold <johan+linaro(a)kernel.org>
Reviewed-by: Johan Hovold <johan+linaro(a)kernel.org>
Signed-off-by: Bjorn Andersson <bjorn.andersson(a)oss.qualcomm.com>
Reviewed-by: Chris Lew <quic_clew(a)quicinc.com>
Link: https://lore.kernel.org/r/20241023-pmic-glink-ecancelled-v2-2-ebc268129407@…
Signed-off-by: Bjorn Andersson <andersson(a)kernel.org>
diff --git a/drivers/soc/qcom/pmic_glink.c b/drivers/soc/qcom/pmic_glink.c
index 9606222993fd..baa4ac6704a9 100644
--- a/drivers/soc/qcom/pmic_glink.c
+++ b/drivers/soc/qcom/pmic_glink.c
@@ -4,6 +4,7 @@
* Copyright (c) 2022, Linaro Ltd
*/
#include <linux/auxiliary_bus.h>
+#include <linux/delay.h>
#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
@@ -13,6 +14,8 @@
#include <linux/soc/qcom/pmic_glink.h>
#include <linux/spinlock.h>
+#define PMIC_GLINK_SEND_TIMEOUT (5 * HZ)
+
enum {
PMIC_GLINK_CLIENT_BATT = 0,
PMIC_GLINK_CLIENT_ALTMODE,
@@ -112,13 +115,29 @@ EXPORT_SYMBOL_GPL(pmic_glink_client_register);
int pmic_glink_send(struct pmic_glink_client *client, void *data, size_t len)
{
struct pmic_glink *pg = client->pg;
+ bool timeout_reached = false;
+ unsigned long start;
int ret;
mutex_lock(&pg->state_lock);
- if (!pg->ept)
+ if (!pg->ept) {
ret = -ECONNRESET;
- else
- ret = rpmsg_send(pg->ept, data, len);
+ } else {
+ start = jiffies;
+ for (;;) {
+ ret = rpmsg_send(pg->ept, data, len);
+ if (ret != -EAGAIN)
+ break;
+
+ if (timeout_reached) {
+ ret = -ETIMEDOUT;
+ break;
+ }
+
+ usleep_range(1000, 5000);
+ timeout_reached = time_after(jiffies, start + PMIC_GLINK_SEND_TIMEOUT);
+ }
+ }
mutex_unlock(&pg->state_lock);
return ret;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x d67907154808745b0fae5874edc7b0f78d33991c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110527-maternal-devious-a112@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d67907154808745b0fae5874edc7b0f78d33991c Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro(a)kernel.org>
Date: Wed, 2 Oct 2024 12:01:21 +0200
Subject: [PATCH] firmware: qcom: scm: suppress download mode error
Stop spamming the logs with errors about missing mechanism for setting
the so called download (or dump) mode for users that have not requested
that feature to be enabled in the first place.
This avoids the follow error being logged on boot as well as on
shutdown when the feature it not available and download mode has not
been enabled on the kernel command line:
qcom_scm firmware:scm: No available mechanism for setting download mode
Fixes: 79cb2cb8d89b ("firmware: qcom: scm: Disable SDI and write no dump to dump mode")
Fixes: 781d32d1c970 ("firmware: qcom_scm: Clear download bit during reboot")
Cc: Mukesh Ojha <quic_mojha(a)quicinc.com>
Cc: stable(a)vger.kernel.org # 6.4
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
Reviewed-by: Mukesh Ojha <quic_mojha(a)quicinc.com>
Link: https://lore.kernel.org/r/20241002100122.18809-2-johan+linaro@kernel.org
Signed-off-by: Bjorn Andersson <andersson(a)kernel.org>
diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c
index 10986cb11ec0..e2ac595902ed 100644
--- a/drivers/firmware/qcom/qcom_scm.c
+++ b/drivers/firmware/qcom/qcom_scm.c
@@ -545,7 +545,7 @@ static void qcom_scm_set_download_mode(u32 dload_mode)
} else if (__qcom_scm_is_call_available(__scm->dev, QCOM_SCM_SVC_BOOT,
QCOM_SCM_BOOT_SET_DLOAD_MODE)) {
ret = __qcom_scm_set_dload_mode(__scm->dev, !!dload_mode);
- } else {
+ } else if (dload_mode) {
dev_err(__scm->dev,
"No available mechanism for setting download mode\n");
}
The patch below does not apply to the 6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.11.y
git checkout FETCH_HEAD
git cherry-pick -x d67907154808745b0fae5874edc7b0f78d33991c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110526-driven-small-2a5c@gregkh' --subject-prefix 'PATCH 6.11.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From d67907154808745b0fae5874edc7b0f78d33991c Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro(a)kernel.org>
Date: Wed, 2 Oct 2024 12:01:21 +0200
Subject: [PATCH] firmware: qcom: scm: suppress download mode error
Stop spamming the logs with errors about missing mechanism for setting
the so called download (or dump) mode for users that have not requested
that feature to be enabled in the first place.
This avoids the follow error being logged on boot as well as on
shutdown when the feature it not available and download mode has not
been enabled on the kernel command line:
qcom_scm firmware:scm: No available mechanism for setting download mode
Fixes: 79cb2cb8d89b ("firmware: qcom: scm: Disable SDI and write no dump to dump mode")
Fixes: 781d32d1c970 ("firmware: qcom_scm: Clear download bit during reboot")
Cc: Mukesh Ojha <quic_mojha(a)quicinc.com>
Cc: stable(a)vger.kernel.org # 6.4
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
Reviewed-by: Mukesh Ojha <quic_mojha(a)quicinc.com>
Link: https://lore.kernel.org/r/20241002100122.18809-2-johan+linaro@kernel.org
Signed-off-by: Bjorn Andersson <andersson(a)kernel.org>
diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c
index 10986cb11ec0..e2ac595902ed 100644
--- a/drivers/firmware/qcom/qcom_scm.c
+++ b/drivers/firmware/qcom/qcom_scm.c
@@ -545,7 +545,7 @@ static void qcom_scm_set_download_mode(u32 dload_mode)
} else if (__qcom_scm_is_call_available(__scm->dev, QCOM_SCM_SVC_BOOT,
QCOM_SCM_BOOT_SET_DLOAD_MODE)) {
ret = __qcom_scm_set_dload_mode(__scm->dev, !!dload_mode);
- } else {
+ } else if (dload_mode) {
dev_err(__scm->dev,
"No available mechanism for setting download mode\n");
}
An atomicity violation occurs when the validity of the variables
da7219->clk_src and da7219->mclk_rate is being assessed. Since the entire
assessment is not protected by a lock, the da7219 variable might still be
in flux during the assessment, rendering this check invalid.
To fix this issue, we recommend adding a lock before the block
if ((da7219->clk_src == clk_id) && (da7219->mclk_rate == freq)) so that
the legitimacy check for da7219->clk_src and da7219->mclk_rate is
protected by the lock, ensuring the validity of the check.
This possible bug is found by an experimental static analysis tool
developed by our team. This tool analyzes the locking APIs
to extract function pairs that can be concurrently executed, and then
analyzes the instructions in the paired functions to identify possible
concurrency bugs including data races and atomicity violations.
Fixes: 6d817c0e9fd7 ("ASoC: codecs: Add da7219 codec driver")
Cc: stable(a)vger.kernel.org
Signed-off-by: Qiu-ji Chen <chenqiuji666(a)gmail.com>
---
sound/soc/codecs/da7219.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/sound/soc/codecs/da7219.c b/sound/soc/codecs/da7219.c
index 311ea7918b31..e2da3e317b5a 100644
--- a/sound/soc/codecs/da7219.c
+++ b/sound/soc/codecs/da7219.c
@@ -1167,17 +1167,20 @@ static int da7219_set_dai_sysclk(struct snd_soc_dai *codec_dai,
struct da7219_priv *da7219 = snd_soc_component_get_drvdata(component);
int ret = 0;
- if ((da7219->clk_src == clk_id) && (da7219->mclk_rate == freq))
+ mutex_lock(&da7219->pll_lock);
+
+ if ((da7219->clk_src == clk_id) && (da7219->mclk_rate == freq)) {
+ mutex_unlock(&da7219->pll_lock);
return 0;
+ }
if ((freq < 2000000) || (freq > 54000000)) {
+ mutex_unlock(&da7219->pll_lock);
dev_err(codec_dai->dev, "Unsupported MCLK value %d\n",
freq);
return -EINVAL;
}
- mutex_lock(&da7219->pll_lock);
-
switch (clk_id) {
case DA7219_CLKSRC_MCLK_SQR:
snd_soc_component_update_bits(component, DA7219_PLL_CTRL,
--
2.34.1
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 9d08ec41a0645283d79a2e642205d488feaceacf
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110524-patience-rice-79e2@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9d08ec41a0645283d79a2e642205d488feaceacf Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao(a)google.com>
Date: Sat, 19 Oct 2024 22:22:12 -0600
Subject: [PATCH] mm: allow set/clear page_type again
Some page flags (page->flags) were converted to page types
(page->page_types). A recent example is PG_hugetlb.
From the exclusive writer's perspective, e.g., a thread doing
__folio_set_hugetlb(), there is a difference between the page flag and
type APIs: the former allows the same non-atomic operation to be repeated
whereas the latter does not. For example, calling __folio_set_hugetlb()
twice triggers VM_BUG_ON_FOLIO(), since the second call expects the type
(PG_hugetlb) not to be set previously.
Using add_hugetlb_folio() as an example, it calls __folio_set_hugetlb() in
the following error-handling path. And when that happens, it triggers the
aforementioned VM_BUG_ON_FOLIO().
if (folio_test_hugetlb(folio)) {
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
add_hugetlb_folio(h, folio, false);
...
It is possible to make hugeTLB comply with the new requirements from the
page type API. However, a straightforward fix would be to just allow the
same page type to be set or cleared again inside the API, to avoid any
changes to its callers.
Link: https://lkml.kernel.org/r/20241020042212.296781-1-yuzhao@google.com
Fixes: d99e3140a4d3 ("mm: turn folio_test_hugetlb into a PageType")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1b3a76710487..cc839e4365c1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -975,12 +975,16 @@ static __always_inline bool folio_test_##fname(const struct folio *folio) \
} \
static __always_inline void __folio_set_##fname(struct folio *folio) \
{ \
+ if (folio_test_##fname(folio)) \
+ return; \
VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \
folio); \
folio->page.page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __folio_clear_##fname(struct folio *folio) \
{ \
+ if (folio->page.page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
folio->page.page_type = UINT_MAX; \
}
@@ -993,11 +997,15 @@ static __always_inline int Page##uname(const struct page *page) \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
+ if (Page##uname(page)) \
+ return; \
VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \
page->page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
+ if (page->page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type = UINT_MAX; \
}
The patch below does not apply to the 6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.11.y
git checkout FETCH_HEAD
git cherry-pick -x 9d08ec41a0645283d79a2e642205d488feaceacf
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110524-runner-gravity-4288@gregkh' --subject-prefix 'PATCH 6.11.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9d08ec41a0645283d79a2e642205d488feaceacf Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao(a)google.com>
Date: Sat, 19 Oct 2024 22:22:12 -0600
Subject: [PATCH] mm: allow set/clear page_type again
Some page flags (page->flags) were converted to page types
(page->page_types). A recent example is PG_hugetlb.
From the exclusive writer's perspective, e.g., a thread doing
__folio_set_hugetlb(), there is a difference between the page flag and
type APIs: the former allows the same non-atomic operation to be repeated
whereas the latter does not. For example, calling __folio_set_hugetlb()
twice triggers VM_BUG_ON_FOLIO(), since the second call expects the type
(PG_hugetlb) not to be set previously.
Using add_hugetlb_folio() as an example, it calls __folio_set_hugetlb() in
the following error-handling path. And when that happens, it triggers the
aforementioned VM_BUG_ON_FOLIO().
if (folio_test_hugetlb(folio)) {
rc = hugetlb_vmemmap_restore_folio(h, folio);
if (rc) {
spin_lock_irq(&hugetlb_lock);
add_hugetlb_folio(h, folio, false);
...
It is possible to make hugeTLB comply with the new requirements from the
page type API. However, a straightforward fix would be to just allow the
same page type to be set or cleared again inside the API, to avoid any
changes to its callers.
Link: https://lkml.kernel.org/r/20241020042212.296781-1-yuzhao@google.com
Fixes: d99e3140a4d3 ("mm: turn folio_test_hugetlb into a PageType")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Muchun Song <muchun.song(a)linux.dev>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1b3a76710487..cc839e4365c1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -975,12 +975,16 @@ static __always_inline bool folio_test_##fname(const struct folio *folio) \
} \
static __always_inline void __folio_set_##fname(struct folio *folio) \
{ \
+ if (folio_test_##fname(folio)) \
+ return; \
VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \
folio); \
folio->page.page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __folio_clear_##fname(struct folio *folio) \
{ \
+ if (folio->page.page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \
folio->page.page_type = UINT_MAX; \
}
@@ -993,11 +997,15 @@ static __always_inline int Page##uname(const struct page *page) \
} \
static __always_inline void __SetPage##uname(struct page *page) \
{ \
+ if (Page##uname(page)) \
+ return; \
VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \
page->page_type = (unsigned int)PGTY_##lname << 24; \
} \
static __always_inline void __ClearPage##uname(struct page *page) \
{ \
+ if (page->page_type == UINT_MAX) \
+ return; \
VM_BUG_ON_PAGE(!Page##uname(page), page); \
page->page_type = UINT_MAX; \
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x ddd6d8e975b171ea3f63a011a75820883ff0d479
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110504-flagship-precook-4a47@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ddd6d8e975b171ea3f63a011a75820883ff0d479 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao(a)google.com>
Date: Sat, 19 Oct 2024 01:29:38 +0000
Subject: [PATCH] mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL
stats
Patch series "mm: multi-gen LRU: Have secondary MMUs participate in
MM_WALK".
Today, the MM_WALK capability causes MGLRU to clear the young bit from
PMDs and PTEs during the page table walk before eviction, but MGLRU does
not call the clear_young() MMU notifier in this case. By not calling this
notifier, the MM walk takes less time/CPU, but it causes pages that are
accessed mostly through KVM / secondary MMUs to appear younger than they
should be.
We do call the clear_young() notifier today, but only when attempting to
evict the page, so we end up clearing young/accessed information less
frequently for secondary MMUs than for mm PTEs, and therefore they appear
younger and are less likely to be evicted. Therefore, memory that is
*not* being accessed mostly by KVM will be evicted *more* frequently,
worsening performance.
ChromeOS observed a tab-open latency regression when enabling MGLRU with a
setup that involved running a VM:
Tab-open latency histogram (ms)
Version p50 mean p95 p99 max
base 1315 1198 2347 3454 10319
mglru 2559 1311 7399 12060 43758
fix 1119 926 2470 4211 6947
This series replaces the final non-selftest patchs from this series[1],
which introduced a similar change (and a new MMU notifier) with KVM
optimizations. I'll send a separate series (to Sean and Paolo) for the
KVM optimizations.
This series also makes proactive reclaim with MGLRU possible for KVM
memory. I have verified that this functions correctly with the selftest
from [1], but given that that test is a KVM selftest, I'll send it with
the rest of the KVM optimizations later. Andrew, let me know if you'd
like to take the test now anyway.
[1]: https://lore.kernel.org/linux-mm/20240926013506.860253-18-jthoughton@google…
This patch (of 2):
The removed stats, MM_LEAF_OLD and MM_NONLEAF_TOTAL, are not very helpful
and become more complicated to properly compute when adding
test/clear_young() notifiers in MGLRU's mm walk.
Link: https://lkml.kernel.org/r/20241019012940.3656292-1-jthoughton@google.com
Link: https://lkml.kernel.org/r/20241019012940.3656292-2-jthoughton@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: James Houghton <jthoughton(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: David Matlack <dmatlack(a)google.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: David Stevens <stevensd(a)google.com>
Cc: Oliver Upton <oliver.upton(a)linux.dev>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Sean Christopherson <seanjc(a)google.com>
Cc: Wei Xu <weixugc(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 17506e4a2835..9342e5692dab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,9 +458,7 @@ struct lru_gen_folio {
enum {
MM_LEAF_TOTAL, /* total leaf entries */
- MM_LEAF_OLD, /* old leaf entries */
MM_LEAF_YOUNG, /* young leaf entries */
- MM_NONLEAF_TOTAL, /* total non-leaf entries */
MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
NR_MM_STATS
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb4e8440c507..4f1d33e4b360 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3399,7 +3399,6 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
continue;
if (!pte_young(ptent)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3552,7 +3551,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
walk->mm_stats[MM_LEAF_TOTAL]++;
if (!pmd_young(val)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3564,8 +3562,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
continue;
}
- walk->mm_stats[MM_NONLEAF_TOTAL]++;
-
if (!walk->force_scan && should_clear_pmd_young()) {
if (!pmd_young(val))
continue;
@@ -5254,11 +5250,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
for (type = 0; type < ANON_AND_FILE; type++) {
- const char *s = " ";
+ const char *s = "xxx";
unsigned long n[3] = {};
if (seq == max_seq) {
- s = "RT ";
+ s = "RTx";
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
@@ -5280,14 +5276,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
- const char *s = " ";
+ const char *s = "xxxx";
unsigned long n = 0;
if (seq == max_seq && NR_HIST_GENS == 1) {
- s = "LOYNFA";
+ s = "TYFA";
n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
- s = "loynfa";
+ s = "tyfa";
n = READ_ONCE(mm_state->stats[hist][i]);
}
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x ddd6d8e975b171ea3f63a011a75820883ff0d479
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110502-removal-babied-f697@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ddd6d8e975b171ea3f63a011a75820883ff0d479 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao(a)google.com>
Date: Sat, 19 Oct 2024 01:29:38 +0000
Subject: [PATCH] mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL
stats
Patch series "mm: multi-gen LRU: Have secondary MMUs participate in
MM_WALK".
Today, the MM_WALK capability causes MGLRU to clear the young bit from
PMDs and PTEs during the page table walk before eviction, but MGLRU does
not call the clear_young() MMU notifier in this case. By not calling this
notifier, the MM walk takes less time/CPU, but it causes pages that are
accessed mostly through KVM / secondary MMUs to appear younger than they
should be.
We do call the clear_young() notifier today, but only when attempting to
evict the page, so we end up clearing young/accessed information less
frequently for secondary MMUs than for mm PTEs, and therefore they appear
younger and are less likely to be evicted. Therefore, memory that is
*not* being accessed mostly by KVM will be evicted *more* frequently,
worsening performance.
ChromeOS observed a tab-open latency regression when enabling MGLRU with a
setup that involved running a VM:
Tab-open latency histogram (ms)
Version p50 mean p95 p99 max
base 1315 1198 2347 3454 10319
mglru 2559 1311 7399 12060 43758
fix 1119 926 2470 4211 6947
This series replaces the final non-selftest patchs from this series[1],
which introduced a similar change (and a new MMU notifier) with KVM
optimizations. I'll send a separate series (to Sean and Paolo) for the
KVM optimizations.
This series also makes proactive reclaim with MGLRU possible for KVM
memory. I have verified that this functions correctly with the selftest
from [1], but given that that test is a KVM selftest, I'll send it with
the rest of the KVM optimizations later. Andrew, let me know if you'd
like to take the test now anyway.
[1]: https://lore.kernel.org/linux-mm/20240926013506.860253-18-jthoughton@google…
This patch (of 2):
The removed stats, MM_LEAF_OLD and MM_NONLEAF_TOTAL, are not very helpful
and become more complicated to properly compute when adding
test/clear_young() notifiers in MGLRU's mm walk.
Link: https://lkml.kernel.org/r/20241019012940.3656292-1-jthoughton@google.com
Link: https://lkml.kernel.org/r/20241019012940.3656292-2-jthoughton@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: James Houghton <jthoughton(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: David Matlack <dmatlack(a)google.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: David Stevens <stevensd(a)google.com>
Cc: Oliver Upton <oliver.upton(a)linux.dev>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Sean Christopherson <seanjc(a)google.com>
Cc: Wei Xu <weixugc(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 17506e4a2835..9342e5692dab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,9 +458,7 @@ struct lru_gen_folio {
enum {
MM_LEAF_TOTAL, /* total leaf entries */
- MM_LEAF_OLD, /* old leaf entries */
MM_LEAF_YOUNG, /* young leaf entries */
- MM_NONLEAF_TOTAL, /* total non-leaf entries */
MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
NR_MM_STATS
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb4e8440c507..4f1d33e4b360 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3399,7 +3399,6 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
continue;
if (!pte_young(ptent)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3552,7 +3551,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
walk->mm_stats[MM_LEAF_TOTAL]++;
if (!pmd_young(val)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3564,8 +3562,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
continue;
}
- walk->mm_stats[MM_NONLEAF_TOTAL]++;
-
if (!walk->force_scan && should_clear_pmd_young()) {
if (!pmd_young(val))
continue;
@@ -5254,11 +5250,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
for (type = 0; type < ANON_AND_FILE; type++) {
- const char *s = " ";
+ const char *s = "xxx";
unsigned long n[3] = {};
if (seq == max_seq) {
- s = "RT ";
+ s = "RTx";
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
@@ -5280,14 +5276,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
- const char *s = " ";
+ const char *s = "xxxx";
unsigned long n = 0;
if (seq == max_seq && NR_HIST_GENS == 1) {
- s = "LOYNFA";
+ s = "TYFA";
n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
- s = "loynfa";
+ s = "tyfa";
n = READ_ONCE(mm_state->stats[hist][i]);
}
The patch below does not apply to the 6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.11.y
git checkout FETCH_HEAD
git cherry-pick -x ddd6d8e975b171ea3f63a011a75820883ff0d479
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110501-scrubber-eating-d64d@gregkh' --subject-prefix 'PATCH 6.11.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ddd6d8e975b171ea3f63a011a75820883ff0d479 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao(a)google.com>
Date: Sat, 19 Oct 2024 01:29:38 +0000
Subject: [PATCH] mm: multi-gen LRU: remove MM_LEAF_OLD and MM_NONLEAF_TOTAL
stats
Patch series "mm: multi-gen LRU: Have secondary MMUs participate in
MM_WALK".
Today, the MM_WALK capability causes MGLRU to clear the young bit from
PMDs and PTEs during the page table walk before eviction, but MGLRU does
not call the clear_young() MMU notifier in this case. By not calling this
notifier, the MM walk takes less time/CPU, but it causes pages that are
accessed mostly through KVM / secondary MMUs to appear younger than they
should be.
We do call the clear_young() notifier today, but only when attempting to
evict the page, so we end up clearing young/accessed information less
frequently for secondary MMUs than for mm PTEs, and therefore they appear
younger and are less likely to be evicted. Therefore, memory that is
*not* being accessed mostly by KVM will be evicted *more* frequently,
worsening performance.
ChromeOS observed a tab-open latency regression when enabling MGLRU with a
setup that involved running a VM:
Tab-open latency histogram (ms)
Version p50 mean p95 p99 max
base 1315 1198 2347 3454 10319
mglru 2559 1311 7399 12060 43758
fix 1119 926 2470 4211 6947
This series replaces the final non-selftest patchs from this series[1],
which introduced a similar change (and a new MMU notifier) with KVM
optimizations. I'll send a separate series (to Sean and Paolo) for the
KVM optimizations.
This series also makes proactive reclaim with MGLRU possible for KVM
memory. I have verified that this functions correctly with the selftest
from [1], but given that that test is a KVM selftest, I'll send it with
the rest of the KVM optimizations later. Andrew, let me know if you'd
like to take the test now anyway.
[1]: https://lore.kernel.org/linux-mm/20240926013506.860253-18-jthoughton@google…
This patch (of 2):
The removed stats, MM_LEAF_OLD and MM_NONLEAF_TOTAL, are not very helpful
and become more complicated to properly compute when adding
test/clear_young() notifiers in MGLRU's mm walk.
Link: https://lkml.kernel.org/r/20241019012940.3656292-1-jthoughton@google.com
Link: https://lkml.kernel.org/r/20241019012940.3656292-2-jthoughton@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Signed-off-by: James Houghton <jthoughton(a)google.com>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: David Matlack <dmatlack(a)google.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: David Stevens <stevensd(a)google.com>
Cc: Oliver Upton <oliver.upton(a)linux.dev>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Sean Christopherson <seanjc(a)google.com>
Cc: Wei Xu <weixugc(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 17506e4a2835..9342e5692dab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -458,9 +458,7 @@ struct lru_gen_folio {
enum {
MM_LEAF_TOTAL, /* total leaf entries */
- MM_LEAF_OLD, /* old leaf entries */
MM_LEAF_YOUNG, /* young leaf entries */
- MM_NONLEAF_TOTAL, /* total non-leaf entries */
MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
NR_MM_STATS
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb4e8440c507..4f1d33e4b360 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3399,7 +3399,6 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
continue;
if (!pte_young(ptent)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3552,7 +3551,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
walk->mm_stats[MM_LEAF_TOTAL]++;
if (!pmd_young(val)) {
- walk->mm_stats[MM_LEAF_OLD]++;
continue;
}
@@ -3564,8 +3562,6 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
continue;
}
- walk->mm_stats[MM_NONLEAF_TOTAL]++;
-
if (!walk->force_scan && should_clear_pmd_young()) {
if (!pmd_young(val))
continue;
@@ -5254,11 +5250,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
for (type = 0; type < ANON_AND_FILE; type++) {
- const char *s = " ";
+ const char *s = "xxx";
unsigned long n[3] = {};
if (seq == max_seq) {
- s = "RT ";
+ s = "RTx";
n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
@@ -5280,14 +5276,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
seq_puts(m, " ");
for (i = 0; i < NR_MM_STATS; i++) {
- const char *s = " ";
+ const char *s = "xxxx";
unsigned long n = 0;
if (seq == max_seq && NR_HIST_GENS == 1) {
- s = "LOYNFA";
+ s = "TYFA";
n = READ_ONCE(mm_state->stats[hist][i]);
} else if (seq != max_seq && NR_HIST_GENS > 1) {
- s = "loynfa";
+ s = "tyfa";
n = READ_ONCE(mm_state->stats[hist][i]);
}
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 01626a18230246efdcea322aa8f067e60ffe5ccd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110506-octane-phosphate-f084@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 01626a18230246efdcea322aa8f067e60ffe5ccd Mon Sep 17 00:00:00 2001
From: Barry Song <baohua(a)kernel.org>
Date: Fri, 27 Sep 2024 09:19:36 +1200
Subject: [PATCH] mm: avoid unconditional one-tick sleep when swapcache_prepare
fails
Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
introduced an unconditional one-tick sleep when `swapcache_prepare()`
fails, which has led to reports of UI stuttering on latency-sensitive
Android devices. To address this, we can use a waitqueue to wake up tasks
that fail `swapcache_prepare()` sooner, instead of always sleeping for a
full tick. While tasks may occasionally be woken by an unrelated
`do_swap_page()`, this method is preferable to two scenarios: rapid
re-entry into page faults, which can cause livelocks, and multiple
millisecond sleeps, which visibly degrade user experience.
Oven's testing shows that a single waitqueue resolves the UI stuttering
issue. If a 'thundering herd' problem becomes apparent later, a waitqueue
hash similar to `folio_wait_table[PAGE_WAIT_TABLE_SIZE]` for page bit
locks can be introduced.
[v-songbaohua(a)oppo.com: wake_up only when swapcache_wq waitqueue is active]
Link: https://lkml.kernel.org/r/20241008130807.40833-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240926211936.75373-1-21cnbao@gmail.com
Fixes: 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Reported-by: Oven Liyang <liyangouwen1(a)oppo.com>
Tested-by: Oven Liyang <liyangouwen1(a)oppo.com>
Cc: Kairui Song <kasong(a)tencent.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: Kalesh Singh <kaleshsingh(a)google.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/memory.c b/mm/memory.c
index 3ccee51adfbb..bdf77a3ec47b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL;
+ DECLARE_WAITQUEUE(wait, current);
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
@@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid
* repeated page faults.
*/
+ add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1);
+ remove_wait_queue(&swapcache_wq, &wait);
goto out_page;
}
need_clear_cache = true;
@@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
@@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
The patch below does not apply to the 6.6-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.6.y
git checkout FETCH_HEAD
git cherry-pick -x 01626a18230246efdcea322aa8f067e60ffe5ccd
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024110505-january-napped-4f1b@gregkh' --subject-prefix 'PATCH 6.6.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 01626a18230246efdcea322aa8f067e60ffe5ccd Mon Sep 17 00:00:00 2001
From: Barry Song <baohua(a)kernel.org>
Date: Fri, 27 Sep 2024 09:19:36 +1200
Subject: [PATCH] mm: avoid unconditional one-tick sleep when swapcache_prepare
fails
Commit 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
introduced an unconditional one-tick sleep when `swapcache_prepare()`
fails, which has led to reports of UI stuttering on latency-sensitive
Android devices. To address this, we can use a waitqueue to wake up tasks
that fail `swapcache_prepare()` sooner, instead of always sleeping for a
full tick. While tasks may occasionally be woken by an unrelated
`do_swap_page()`, this method is preferable to two scenarios: rapid
re-entry into page faults, which can cause livelocks, and multiple
millisecond sleeps, which visibly degrade user experience.
Oven's testing shows that a single waitqueue resolves the UI stuttering
issue. If a 'thundering herd' problem becomes apparent later, a waitqueue
hash similar to `folio_wait_table[PAGE_WAIT_TABLE_SIZE]` for page bit
locks can be introduced.
[v-songbaohua(a)oppo.com: wake_up only when swapcache_wq waitqueue is active]
Link: https://lkml.kernel.org/r/20241008130807.40833-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240926211936.75373-1-21cnbao@gmail.com
Fixes: 13ddaf26be32 ("mm/swap: fix race when skipping swapcache")
Signed-off-by: Barry Song <v-songbaohua(a)oppo.com>
Reported-by: Oven Liyang <liyangouwen1(a)oppo.com>
Tested-by: Oven Liyang <liyangouwen1(a)oppo.com>
Cc: Kairui Song <kasong(a)tencent.com>
Cc: "Huang, Ying" <ying.huang(a)intel.com>
Cc: Yu Zhao <yuzhao(a)google.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Cc: Yosry Ahmed <yosryahmed(a)google.com>
Cc: SeongJae Park <sj(a)kernel.org>
Cc: Kalesh Singh <kaleshsingh(a)google.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/memory.c b/mm/memory.c
index 3ccee51adfbb..bdf77a3ec47b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4187,6 +4187,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4199,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL;
+ DECLARE_WAITQUEUE(wait, current);
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
@@ -4297,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid
* repeated page faults.
*/
+ add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1);
+ remove_wait_queue(&swapcache_wq, &wait);
goto out_page;
}
need_clear_cache = true;
@@ -4604,8 +4609,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
@@ -4620,8 +4628,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;