The mmap read lock is used during the shrinker's callback, which means
that using alloc->vma pointer isn't safe as it can race with munmap().
As of commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in
munmap") the mmap lock is downgraded after the vma has been isolated.
I was able to reproduce this issue by manually adding some delays and
triggering page reclaiming through the shrinker's debug sysfs. The
following KASAN report confirms the UAF:
==================================================================
BUG: KASAN: slab-use-after-free in zap_page_range_single+0x470/0x4b8
Read of size 8 at addr ffff356ed50e50f0 by task bash/478
CPU: 1 PID: 478 Comm: bash Not tainted 6.6.0-rc5-00055-g1c8b86a3799f-dirty #70
Hardware name: linux,dummy-virt (DT)
Call trace:
zap_page_range_single+0x470/0x4b8
binder_alloc_free_page+0x608/0xadc
__list_lru_walk_one+0x130/0x3b0
list_lru_walk_node+0xc4/0x22c
binder_shrink_scan+0x108/0x1dc
shrinker_debugfs_scan_write+0x2b4/0x500
full_proxy_write+0xd4/0x140
vfs_write+0x1ac/0x758
ksys_write+0xf0/0x1dc
__arm64_sys_write+0x6c/0x9c
Allocated by task 492:
kmem_cache_alloc+0x130/0x368
vm_area_alloc+0x2c/0x190
mmap_region+0x258/0x18bc
do_mmap+0x694/0xa60
vm_mmap_pgoff+0x170/0x29c
ksys_mmap_pgoff+0x290/0x3a0
__arm64_sys_mmap+0xcc/0x144
Freed by task 491:
kmem_cache_free+0x17c/0x3c8
vm_area_free_rcu_cb+0x74/0x98
rcu_core+0xa38/0x26d4
rcu_core_si+0x10/0x1c
__do_softirq+0x2fc/0xd24
Last potentially related work creation:
__call_rcu_common.constprop.0+0x6c/0xba0
call_rcu+0x10/0x1c
vm_area_free+0x18/0x24
remove_vma+0xe4/0x118
do_vmi_align_munmap.isra.0+0x718/0xb5c
do_vmi_munmap+0xdc/0x1fc
__vm_munmap+0x10c/0x278
__arm64_sys_munmap+0x58/0x7c
Fix this issue by performing instead a vma_lookup() which will fail to
find the vma that was isolated before the mmap lock downgrade. Note that
this option has better performance than upgrading to a mmap write lock
which would increase contention. Plus, mmap_write_trylock() has been
recently removed anyway.
Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap")
Cc: stable(a)vger.kernel.org
Cc: Liam Howlett <liam.howlett(a)oracle.com>
Cc: Minchan Kim <minchan(a)kernel.org>
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
---
drivers/android/binder_alloc.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index e3db8297095a..c4d60d81221b 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1005,7 +1005,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
goto err_mmget;
if (!mmap_read_trylock(mm))
goto err_mmap_read_lock_failed;
- vma = binder_alloc_get_vma(alloc);
+ vma = vma_lookup(mm, page_addr);
+ if (vma && vma != binder_alloc_get_vma(alloc))
+ goto err_invalid_vma;
list_lru_isolate(lru, item);
spin_unlock(lock);
@@ -1031,6 +1033,8 @@ enum lru_status binder_alloc_free_page(struct list_head *item,
mutex_unlock(&alloc->mutex);
return LRU_REMOVED_RETRY;
+err_invalid_vma:
+ mmap_read_unlock(mm);
err_mmap_read_lock_failed:
mmput_async(mm);
err_mmget:
--
2.42.0.869.gea05f2083d-goog
Commits 7b8ef22ea547 ("usb: xhci: plat: Add USB phy support") and
9134c1fd0503 ("usb: xhci: plat: Add USB 3.0 phy support") added support
for looking up legacy PHYs from the sysdev devicetree node and
initialising them.
This broke drivers such as dwc3 which manages PHYs themself as the PHYs
would now be initialised twice, something which specifically can lead to
resources being left enabled during suspend (e.g. with the
usb_phy_generic PHY driver).
As the dwc3 driver uses driver-name matching for the xhci platform
device, fix this by only looking up and initialising PHYs for devices
that have been matched using OF.
Note that checking that the platform device has a devicetree node would
currently be sufficient, but that could lead to subtle breakages in case
anyone ever tries to reuse an ancestor's node.
Fixes: 7b8ef22ea547 ("usb: xhci: plat: Add USB phy support")
Fixes: 9134c1fd0503 ("usb: xhci: plat: Add USB 3.0 phy support")
Cc: stable(a)vger.kernel.org # 4.1
Cc: Maxime Ripard <mripard(a)kernel.org>
Cc: Stanley Chang <stanley_chang(a)realtek.com>
Signed-off-by: Johan Hovold <johan+linaro(a)kernel.org>
---
drivers/usb/host/xhci-plat.c | 50 +++++++++++++++++++++---------------
1 file changed, 30 insertions(+), 20 deletions(-)
diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c
index 28218c8f1837..01d19d17153b 100644
--- a/drivers/usb/host/xhci-plat.c
+++ b/drivers/usb/host/xhci-plat.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/of.h>
+#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/usb/phy.h>
#include <linux/slab.h>
@@ -148,7 +149,7 @@ int xhci_plat_probe(struct platform_device *pdev, struct device *sysdev, const s
int ret;
int irq;
struct xhci_plat_priv *priv = NULL;
-
+ bool of_match;
if (usb_disabled())
return -ENODEV;
@@ -253,16 +254,23 @@ int xhci_plat_probe(struct platform_device *pdev, struct device *sysdev, const s
&xhci->imod_interval);
}
- hcd->usb_phy = devm_usb_get_phy_by_phandle(sysdev, "usb-phy", 0);
- if (IS_ERR(hcd->usb_phy)) {
- ret = PTR_ERR(hcd->usb_phy);
- if (ret == -EPROBE_DEFER)
- goto disable_clk;
- hcd->usb_phy = NULL;
- } else {
- ret = usb_phy_init(hcd->usb_phy);
- if (ret)
- goto disable_clk;
+ /*
+ * Drivers such as dwc3 manages PHYs themself (and rely on driver name
+ * matching for the xhci platform device).
+ */
+ of_match = of_match_device(pdev->dev.driver->of_match_table, &pdev->dev);
+ if (of_match) {
+ hcd->usb_phy = devm_usb_get_phy_by_phandle(sysdev, "usb-phy", 0);
+ if (IS_ERR(hcd->usb_phy)) {
+ ret = PTR_ERR(hcd->usb_phy);
+ if (ret == -EPROBE_DEFER)
+ goto disable_clk;
+ hcd->usb_phy = NULL;
+ } else {
+ ret = usb_phy_init(hcd->usb_phy);
+ if (ret)
+ goto disable_clk;
+ }
}
hcd->tpl_support = of_usb_host_tpl_support(sysdev->of_node);
@@ -285,15 +293,17 @@ int xhci_plat_probe(struct platform_device *pdev, struct device *sysdev, const s
goto dealloc_usb2_hcd;
}
- xhci->shared_hcd->usb_phy = devm_usb_get_phy_by_phandle(sysdev,
- "usb-phy", 1);
- if (IS_ERR(xhci->shared_hcd->usb_phy)) {
- xhci->shared_hcd->usb_phy = NULL;
- } else {
- ret = usb_phy_init(xhci->shared_hcd->usb_phy);
- if (ret)
- dev_err(sysdev, "%s init usb3phy fail (ret=%d)\n",
- __func__, ret);
+ if (of_match) {
+ xhci->shared_hcd->usb_phy = devm_usb_get_phy_by_phandle(sysdev,
+ "usb-phy", 1);
+ if (IS_ERR(xhci->shared_hcd->usb_phy)) {
+ xhci->shared_hcd->usb_phy = NULL;
+ } else {
+ ret = usb_phy_init(xhci->shared_hcd->usb_phy);
+ if (ret)
+ dev_err(sysdev, "%s init usb3phy fail (ret=%d)\n",
+ __func__, ret);
+ }
}
xhci->shared_hcd->tpl_support = hcd->tpl_support;
--
2.41.0
From: Shuai Xue <xueshuai(a)linux.alibaba.com>
[ Upstream commit 54aee5f15b83437f23b2b2469bcf21bdd9823916 ]
When perf-record with a large AUX area, e.g 4GB, it fails with:
#perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
failed to mmap with 12 (Cannot allocate memory)
and it reveals a WARNING with __alloc_pages():
------------[ cut here ]------------
WARNING: CPU: 44 PID: 17573 at mm/page_alloc.c:5568 __alloc_pages+0x1ec/0x248
Call trace:
__alloc_pages+0x1ec/0x248
__kmalloc_large_node+0xc0/0x1f8
__kmalloc_node+0x134/0x1e8
rb_alloc_aux+0xe0/0x298
perf_mmap+0x440/0x660
mmap_region+0x308/0x8a8
do_mmap+0x3c0/0x528
vm_mmap_pgoff+0xf4/0x1b8
ksys_mmap_pgoff+0x18c/0x218
__arm64_sys_mmap+0x38/0x58
invoke_syscall+0x50/0x128
el0_svc_common.constprop.0+0x58/0x188
do_el0_svc+0x34/0x50
el0_svc+0x34/0x108
el0t_64_sync_handler+0xb8/0xc0
el0t_64_sync+0x1a4/0x1a8
'rb->aux_pages' allocated by kcalloc() is a pointer array which is used to
maintains AUX trace pages. The allocated page for this array is physically
contiguous (and virtually contiguous) with an order of 0..MAX_ORDER. If the
size of pointer array crosses the limitation set by MAX_ORDER, it reveals a
WARNING.
So bail out early with -ENOMEM if the request AUX area is out of bound,
e.g.:
#perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
failed to mmap with 12 (Cannot allocate memory)
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/events/ring_buffer.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fb1e180b5f0af..e8d82c2f07d0e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -700,6 +700,12 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
watermark = 0;
}
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+ return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)
--
2.42.0
DAMON sysfs interface's before_damos_apply callback
(damon_sysfs_before_damos_apply()), which creates the DAMOS tried
regions for each DAMOS action applied region, is not handling the
allocation failure for the sysfs directory data. As a result, NULL
pointer derefeence is possible. Fix it by handling the case.
Fixes: f1d13cacabe1 ("mm/damon/sysfs: implement DAMOS tried regions update command")
Cc: <stable(a)vger.kernel.org> # 6.2.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
mm/damon/sysfs-schemes.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 7413cb35c5a9..be667236b8e6 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1826,6 +1826,8 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx,
return 0;
region = damon_sysfs_scheme_region_alloc(r);
+ if (!region)
+ return 0;
list_add_tail(®ion->list, &sysfs_regions->regions_list);
sysfs_regions->nr_regions++;
if (kobject_init_and_add(®ion->kobj,
--
2.34.1
damon_sysfs_update_target() returns error code for failures, but its
caller, damon_sysfs_set_targets() is ignoring that. The update function
seems making no critical change in case of such failures, but the
behavior will look like DAMON sysfs is silently ignoring or only
partially accepting the user input. Fix it.
Fixes: 19467a950b49 ("mm/damon/sysfs: remove requested targets when online-commit inputs")
Cc: <stable(a)vger.kernel.org> # 5.19.x
Signed-off-by: SeongJae Park <sj(a)kernel.org>
---
Note that yet another fix[1] should be applied before this.
[1] https://lore.kernel.org/all/739e6aaf-a634-4e33-98a8-16546379ec9f@moroto.mou…
mm/damon/sysfs.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 1dfa96d4de99..7472404456aa 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1203,8 +1203,10 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx,
damon_for_each_target_safe(t, next, ctx) {
if (i < sysfs_targets->nr) {
- damon_sysfs_update_target(t, ctx,
+ err = damon_sysfs_update_target(t, ctx,
sysfs_targets->targets_arr[i]);
+ if (err)
+ return err;
} else {
if (damon_target_has_pid(ctx))
put_pid(t->pid);
--
2.34.1
From: Shuai Xue <xueshuai(a)linux.alibaba.com>
[ Upstream commit 54aee5f15b83437f23b2b2469bcf21bdd9823916 ]
When perf-record with a large AUX area, e.g 4GB, it fails with:
#perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
failed to mmap with 12 (Cannot allocate memory)
and it reveals a WARNING with __alloc_pages():
------------[ cut here ]------------
WARNING: CPU: 44 PID: 17573 at mm/page_alloc.c:5568 __alloc_pages+0x1ec/0x248
Call trace:
__alloc_pages+0x1ec/0x248
__kmalloc_large_node+0xc0/0x1f8
__kmalloc_node+0x134/0x1e8
rb_alloc_aux+0xe0/0x298
perf_mmap+0x440/0x660
mmap_region+0x308/0x8a8
do_mmap+0x3c0/0x528
vm_mmap_pgoff+0xf4/0x1b8
ksys_mmap_pgoff+0x18c/0x218
__arm64_sys_mmap+0x38/0x58
invoke_syscall+0x50/0x128
el0_svc_common.constprop.0+0x58/0x188
do_el0_svc+0x34/0x50
el0_svc+0x34/0x108
el0t_64_sync_handler+0xb8/0xc0
el0t_64_sync+0x1a4/0x1a8
'rb->aux_pages' allocated by kcalloc() is a pointer array which is used to
maintains AUX trace pages. The allocated page for this array is physically
contiguous (and virtually contiguous) with an order of 0..MAX_ORDER. If the
size of pointer array crosses the limitation set by MAX_ORDER, it reveals a
WARNING.
So bail out early with -ENOMEM if the request AUX area is out of bound,
e.g.:
#perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
failed to mmap with 12 (Cannot allocate memory)
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/events/ring_buffer.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 12f351b253bbb..2f6f77658eba2 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -639,6 +639,12 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
}
}
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+ return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)
--
2.42.0
From: Shuai Xue <xueshuai(a)linux.alibaba.com>
[ Upstream commit 54aee5f15b83437f23b2b2469bcf21bdd9823916 ]
When perf-record with a large AUX area, e.g 4GB, it fails with:
#perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
failed to mmap with 12 (Cannot allocate memory)
and it reveals a WARNING with __alloc_pages():
------------[ cut here ]------------
WARNING: CPU: 44 PID: 17573 at mm/page_alloc.c:5568 __alloc_pages+0x1ec/0x248
Call trace:
__alloc_pages+0x1ec/0x248
__kmalloc_large_node+0xc0/0x1f8
__kmalloc_node+0x134/0x1e8
rb_alloc_aux+0xe0/0x298
perf_mmap+0x440/0x660
mmap_region+0x308/0x8a8
do_mmap+0x3c0/0x528
vm_mmap_pgoff+0xf4/0x1b8
ksys_mmap_pgoff+0x18c/0x218
__arm64_sys_mmap+0x38/0x58
invoke_syscall+0x50/0x128
el0_svc_common.constprop.0+0x58/0x188
do_el0_svc+0x34/0x50
el0_svc+0x34/0x108
el0t_64_sync_handler+0xb8/0xc0
el0t_64_sync+0x1a4/0x1a8
'rb->aux_pages' allocated by kcalloc() is a pointer array which is used to
maintains AUX trace pages. The allocated page for this array is physically
contiguous (and virtually contiguous) with an order of 0..MAX_ORDER. If the
size of pointer array crosses the limitation set by MAX_ORDER, it reveals a
WARNING.
So bail out early with -ENOMEM if the request AUX area is out of bound,
e.g.:
#perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
failed to mmap with 12 (Cannot allocate memory)
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/events/ring_buffer.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ffb59a4ef4ff3..fb3edb2f8ac93 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -653,6 +653,12 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
max_order--;
}
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+ return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)
--
2.42.0
From: Shuai Xue <xueshuai(a)linux.alibaba.com>
[ Upstream commit 54aee5f15b83437f23b2b2469bcf21bdd9823916 ]
When perf-record with a large AUX area, e.g 4GB, it fails with:
#perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
failed to mmap with 12 (Cannot allocate memory)
and it reveals a WARNING with __alloc_pages():
------------[ cut here ]------------
WARNING: CPU: 44 PID: 17573 at mm/page_alloc.c:5568 __alloc_pages+0x1ec/0x248
Call trace:
__alloc_pages+0x1ec/0x248
__kmalloc_large_node+0xc0/0x1f8
__kmalloc_node+0x134/0x1e8
rb_alloc_aux+0xe0/0x298
perf_mmap+0x440/0x660
mmap_region+0x308/0x8a8
do_mmap+0x3c0/0x528
vm_mmap_pgoff+0xf4/0x1b8
ksys_mmap_pgoff+0x18c/0x218
__arm64_sys_mmap+0x38/0x58
invoke_syscall+0x50/0x128
el0_svc_common.constprop.0+0x58/0x188
do_el0_svc+0x34/0x50
el0_svc+0x34/0x108
el0t_64_sync_handler+0xb8/0xc0
el0t_64_sync+0x1a4/0x1a8
'rb->aux_pages' allocated by kcalloc() is a pointer array which is used to
maintains AUX trace pages. The allocated page for this array is physically
contiguous (and virtually contiguous) with an order of 0..MAX_ORDER. If the
size of pointer array crosses the limitation set by MAX_ORDER, it reveals a
WARNING.
So bail out early with -ENOMEM if the request AUX area is out of bound,
e.g.:
#perf record -C 0 -m ,4G -e arm_spe_0// -- sleep 1
failed to mmap with 12 (Cannot allocate memory)
Signed-off-by: Shuai Xue <xueshuai(a)linux.alibaba.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
kernel/events/ring_buffer.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4032cd4750001..01351e7e25435 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -691,6 +691,12 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
max_order--;
}
+ /*
+ * kcalloc_node() is unable to allocate buffer if the size is larger
+ * than: PAGE_SIZE << MAX_ORDER; directly bail out in this case.
+ */
+ if (get_order((unsigned long)nr_pages * sizeof(void *)) > MAX_ORDER)
+ return -ENOMEM;
rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
node);
if (!rb->aux_pages)
--
2.42.0