With the introduction of binder_available_for_proc_work_ilocked() in
commit 1b77e9dcc3da ("ANDROID: binder: remove proc waitqueue") a binder
thread can only "wait_for_proc_work" after its thread->looper has been
marked as BINDER_LOOPER_STATE_{ENTERED|REGISTERED}.
This means an unregistered reader risks waiting indefinitely for work
since it never gets added to the proc->waiting_threads. If there are no
further references to its waitqueue either the task will hang. The same
applies to readers using the (e)poll interface.
I couldn't find the rationale behind this restriction. So this patch
restores the previous behavior of allowing unregistered threads to
"wait_for_proc_work". Note that an error message for this scenario,
which had previously become unreachable, is now re-enabled.
Fixes: 1b77e9dcc3da ("ANDROID: binder: remove proc waitqueue")
Cc: stable(a)vger.kernel.org
Cc: Martijn Coenen <maco(a)google.com>
Cc: Arve Hjønnevåg <arve(a)google.com>
Signed-off-by: Carlos Llamas <cmllamas(a)google.com>
---
drivers/android/binder.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index b21a7b246a0d..2d0a24a56508 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -570,9 +570,7 @@ static bool binder_has_work(struct binder_thread *thread, bool do_proc_work)
static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread)
{
return !thread->transaction_stack &&
- binder_worklist_empty_ilocked(&thread->todo) &&
- (thread->looper & (BINDER_LOOPER_STATE_ENTERED |
- BINDER_LOOPER_STATE_REGISTERED));
+ binder_worklist_empty_ilocked(&thread->todo);
}
static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc,
--
2.45.2.993.g49e7a77208-goog
The patch titled
Subject: mm/huge_memory: avoid PMD-size page cache if needed
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Gavin Shan <gshan(a)redhat.com>
Subject: mm/huge_memory: avoid PMD-size page cache if needed
Date: Thu, 11 Jul 2024 20:48:40 +1000
Currently, xarray can't support arbitrary page cache size and the largest
and supported page cache size is defined as MAX_PAGECACHE_ORDER in commit
099d90642a71 ("mm/filemap: make MAX_PAGECACHE_ORDER acceptable to
xarray"). However, it's possible to have 512MB page cache in the huge
memory collapsing path on ARM64 system whose base page size is 64KB. A
warning is raised when the huge page cache is split as shown in the
following example.
[root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize
KernelPageSize: 64 kB
[root@dhcp-10-26-1-207 ~]# cat /tmp/test.c
:
int main(int argc, char **argv)
{
const char *filename = TEST_XFS_FILENAME;
int fd = 0;
void *buf = (void *)-1, *p;
int pgsize = getpagesize();
int ret = 0;
if (pgsize != 0x10000) {
fprintf(stdout, "System with 64KB base page size is required!\n");
return -EPERM;
}
system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb");
system("echo 1 > /proc/sys/vm/drop_caches");
/* Open xfs or shmem file */
fd = open(filename, O_RDONLY);
assert(fd > 0);
/* Create VMA */
buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0);
assert(buf != (void *)-1);
fprintf(stdout, "mapped buffer at 0x%p\n", buf);
/* Populate VMA */
ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE);
assert(ret == 0);
ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ);
assert(ret == 0);
/* Collapse VMA */
ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE);
assert(ret == 0);
ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE);
if (ret) {
fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno);
goto out;
}
/* Split xarray. The file needs to reopened with write permission */
munmap(buf, TEST_MEM_SIZE);
buf = (void *)-1;
close(fd);
fd = open(filename, O_RDWR);
assert(fd > 0);
fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
TEST_MEM_SIZE - pgsize, pgsize);
out:
if (buf != (void *)-1)
munmap(buf, TEST_MEM_SIZE);
if (fd > 0)
close(fd);
return ret;
}
[root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test
[root@dhcp-10-26-1-207 ~]# /tmp/test
------------[ cut here ]------------
WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib \
nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct \
nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 \
ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse \
xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net \
sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio
CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9
Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
pc : xas_split_alloc+0xf8/0x128
lr : split_huge_page_to_list_to_order+0x1c4/0x780
sp : ffff8000ac32f660
x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0
x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d
x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000
x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000
x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000
x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c
x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8
x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40
x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
Call trace:
xas_split_alloc+0xf8/0x128
split_huge_page_to_list_to_order+0x1c4/0x780
truncate_inode_partial_folio+0xdc/0x160
truncate_inode_pages_range+0x1b4/0x4a8
truncate_pagecache_range+0x84/0xa0
xfs_flush_unmap_range+0x70/0x90 [xfs]
xfs_file_fallocate+0xfc/0x4d8 [xfs]
vfs_fallocate+0x124/0x2f0
ksys_fallocate+0x4c/0xa0
__arm64_sys_fallocate+0x24/0x38
invoke_syscall.constprop.0+0x7c/0xd8
do_el0_svc+0xb4/0xd0
el0_svc+0x44/0x1d8
el0t_64_sync_handler+0x134/0x150
el0t_64_sync+0x17c/0x180
Fix it by avoiding PMD-sized page cache in the huge memory collapsing
path. After this patch is applied, the test program fails with error
-EINVAL returned from __thp_vma_allowable_orders() and the madvise()
system call to collapse the page caches.
Link: https://lkml.kernel.org/r/20240711104840.200573-1-gshan@redhat.com
Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: William Kucharski <william.kucharski(a)oracle.com>
Cc: <stable(a)vger.kernel.org> [5.17+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/huge_memory.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/mm/huge_memory.c~mm-huge_memory-avoid-pmd-size-page-cache-if-needed
+++ a/mm/huge_memory.c
@@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders
while (orders) {
addr = vma->vm_end - (PAGE_SIZE << order);
- if (thp_vma_suitable_order(vma, addr, order))
+ if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
+ thp_vma_suitable_order(vma, addr, order))
break;
order = next_order(&orders, order);
}
_
Patches currently in -mm which might be from gshan(a)redhat.com are
mm-huge_memory-avoid-pmd-size-page-cache-if-needed.patch
The patch titled
Subject: mm/mglru: fix overshooting shrinker memory
has been added to the -mm mm-unstable branch. Its filename is
mm-mglru-fix-overshooting-shrinker-memory.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yu Zhao <yuzhao(a)google.com>
Subject: mm/mglru: fix overshooting shrinker memory
Date: Thu, 11 Jul 2024 13:19:57 -0600
set_initial_priority() tries to jump-start global reclaim by estimating
the priority based on cold/hot LRU pages. The estimation does not account
for shrinker objects, and it cannot do so because their sizes can be in
different units other than page.
If shrinker objects are the majority, e.g., on TrueNAS SCALE 24.04.0 where
ZFS ARC can use almost all system memory, set_initial_priority() can
vastly underestimate how much memory ARC shrinker can evict and assign
extreme low values to scan_control->priority, resulting in overshoots of
shrinker objects.
To reproduce the problem, using TrueNAS SCALE 24.04.0 with 32GB DRAM, a
test ZFS pool and the following commands:
fio --name=mglru.file --numjobs=36 --ioengine=io_uring \
--directory=/root/test-zfs-pool/ --size=1024m --buffered=1 \
--rw=randread --random_distribution=random \
--time_based --runtime=1h &
for ((i = 0; i < 20; i++))
do
sleep 120
fio --name=mglru.anon --numjobs=16 --ioengine=mmap \
--filename=/dev/zero --size=1024m --fadvise_hint=0 \
--rw=randrw --random_distribution=random \
--time_based --runtime=1m
done
To fix the problem:
1. Cap scan_control->priority at or above DEF_PRIORITY/2, to prevent
the jump-start from being overly aggressive.
2. Account for the progress from mm_account_reclaimed_pages(), to
prevent kswapd_shrink_node() from raising the priority
unnecessarily.
Link: https://lkml.kernel.org/r/20240711191957.939105-2-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Reported-by: Alexander Motin <mav(a)ixsystems.com>
Cc: Wei Xu <weixugc(a)google.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmscan.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/mm/vmscan.c~mm-mglru-fix-overshooting-shrinker-memory
+++ a/mm/vmscan.c
@@ -4930,7 +4930,11 @@ static void set_initial_priority(struct
/* round down reclaimable and round up sc->nr_to_reclaim */
priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
- sc->priority = clamp(priority, 0, DEF_PRIORITY);
+ /*
+ * The estimation is based on LRU pages only, so cap it to prevent
+ * overshoots of shrinker objects by large margins.
+ */
+ sc->priority = clamp(priority, DEF_PRIORITY / 2, DEF_PRIORITY);
}
static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
@@ -6754,6 +6758,7 @@ static bool kswapd_shrink_node(pg_data_t
{
struct zone *zone;
int z;
+ unsigned long nr_reclaimed = sc->nr_reclaimed;
/* Reclaim a number of pages proportional to the number of zones */
sc->nr_to_reclaim = 0;
@@ -6781,7 +6786,8 @@ static bool kswapd_shrink_node(pg_data_t
if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
sc->order = 0;
- return sc->nr_scanned >= sc->nr_to_reclaim;
+ /* account for progress from mm_account_reclaimed_pages() */
+ return max(sc->nr_scanned, sc->nr_reclaimed - nr_reclaimed) >= sc->nr_to_reclaim;
}
/* Page allocator PCP high watermark is lowered if reclaim is active. */
_
Patches currently in -mm which might be from yuzhao(a)google.com are
mm-truncate-batch-clear-shadow-entries.patch
mm-truncate-batch-clear-shadow-entries-v2.patch
mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch
mm-mglru-fix-overshooting-shrinker-memory.patch
The patch titled
Subject: mm/mglru: fix div-by-zero in vmpressure_calc_level()
has been added to the -mm mm-unstable branch. Its filename is
mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Yu Zhao <yuzhao(a)google.com>
Subject: mm/mglru: fix div-by-zero in vmpressure_calc_level()
Date: Thu, 11 Jul 2024 13:19:56 -0600
evict_folios() uses a second pass to reclaim folios that have gone through
page writeback and become clean before it finishes the first pass, since
folio_rotate_reclaimable() cannot handle those folios due to the
isolation.
The second pass tries to avoid potential double counting by deducting
scan_control->nr_scanned. However, this can result in underflow of
nr_scanned, under a condition where shrink_folio_list() does not increment
nr_scanned, i.e., when folio_trylock() fails.
The underflow can cause the divisor, i.e., scale=scanned+reclaimed in
vmpressure_calc_level(), to become zero, resulting in the following crash:
[exception RIP: vmpressure_work_fn+101]
process_one_work at ffffffffa3313f2b
Since scan_control->nr_scanned has no established semantics, the potential
double counting has minimal risks. Therefore, fix the problem by not
deducting scan_control->nr_scanned in evict_folios().
Link: https://lkml.kernel.org/r/20240711191957.939105-1-yuzhao@google.com
Fixes: 359a5e1416ca ("mm: multi-gen LRU: retry folios written back while isolated")
Reported-by: Wei Xu <weixugc(a)google.com>
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
Cc: Alexander Motin <mav(a)ixsystems.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/vmscan.c | 1 -
1 file changed, 1 deletion(-)
--- a/mm/vmscan.c~mm-mglru-fix-div-by-zero-in-vmpressure_calc_level
+++ a/mm/vmscan.c
@@ -4597,7 +4597,6 @@ retry:
/* retry folios that may have missed folio_rotate_reclaimable() */
list_move(&folio->lru, &clean);
- sc->nr_scanned -= folio_nr_pages(folio);
}
spin_lock_irq(&lruvec->lru_lock);
_
Patches currently in -mm which might be from yuzhao(a)google.com are
mm-truncate-batch-clear-shadow-entries.patch
mm-truncate-batch-clear-shadow-entries-v2.patch
mm-mglru-fix-div-by-zero-in-vmpressure_calc_level.patch
mm-mglru-fix-overshooting-shrinker-memory.patch
evict_folios() uses a second pass to reclaim folios that have gone
through page writeback and become clean before it finishes the first
pass, since folio_rotate_reclaimable() cannot handle those folios due
to the isolation.
The second pass tries to avoid potential double counting by deducting
scan_control->nr_scanned. However, this can result in underflow of
nr_scanned, under a condition where shrink_folio_list() does not
increment nr_scanned, i.e., when folio_trylock() fails.
The underflow can cause the divisor, i.e., scale=scanned+reclaimed in
vmpressure_calc_level(), to become zero, resulting in the following
crash:
[exception RIP: vmpressure_work_fn+101]
process_one_work at ffffffffa3313f2b
Since scan_control->nr_scanned has no established semantics, the
potential double counting has minimal risks. Therefore, fix the
problem by not deducting scan_control->nr_scanned in evict_folios().
Reported-by: Wei Xu <weixugc(a)google.com>
Fixes: 359a5e1416ca ("mm: multi-gen LRU: retry folios written back while isolated")
Cc: stable(a)vger.kernel.org
Signed-off-by: Yu Zhao <yuzhao(a)google.com>
---
mm/vmscan.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0761f91b407f..6403038c776e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4597,7 +4597,6 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
/* retry folios that may have missed folio_rotate_reclaimable() */
list_move(&folio->lru, &clean);
- sc->nr_scanned -= folio_nr_pages(folio);
}
spin_lock_irq(&lruvec->lru_lock);
--
2.45.2.993.g49e7a77208-goog
The patch titled
Subject: crash: fix x86_32 memory reserve dead loop retry bug
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
crash-fix-x86_32-memory-reserve-dead-loop-retry-bug.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Jinjie Ruan <ruanjinjie(a)huawei.com>
Subject: crash: fix x86_32 memory reserve dead loop retry bug
Date: Thu, 11 Jul 2024 15:31:18 +0800
On x86_32 Qemu machine with 1GB memory, the cmdline "crashkernel=1G,high"
will cause system stall as below:
ACPI: Reserving FACP table memory at [mem 0x3ffe18b8-0x3ffe192b]
ACPI: Reserving DSDT table memory at [mem 0x3ffe0040-0x3ffe18b7]
ACPI: Reserving FACS table memory at [mem 0x3ffe0000-0x3ffe003f]
ACPI: Reserving APIC table memory at [mem 0x3ffe192c-0x3ffe19bb]
ACPI: Reserving HPET table memory at [mem 0x3ffe19bc-0x3ffe19f3]
ACPI: Reserving WAET table memory at [mem 0x3ffe19f4-0x3ffe1a1b]
143MB HIGHMEM available.
879MB LOWMEM available.
mapped low ram: 0 - 36ffe000
low ram: 0 - 36ffe000
(stall here)
The reason is that the CRASH_ADDR_LOW_MAX is equal to CRASH_ADDR_HIGH_MAX
on x86_32, the first high crash kernel memory reservation will fail, then
go into the "retry" loop and never came out as below.
-> reserve_crashkernel_generic() and high is true
-> alloc at [CRASH_ADDR_LOW_MAX, CRASH_ADDR_HIGH_MAX] fail
-> alloc at [0, CRASH_ADDR_LOW_MAX] fail and repeatedly
(because CRASH_ADDR_LOW_MAX = CRASH_ADDR_HIGH_MAX).
Fix it by changing the out check condition.
After this patch, it prints:
cannot allocate crashkernel (size:0x40000000)
Link: https://lkml.kernel.org/r/20240711073118.1289866-1-ruanjinjie@huawei.com
Fixes: 9c08a2a139fe ("x86: kdump: use generic interface to simplify crashkernel reservation code")
Signed-off-by: Jinjie Ruan <ruanjinjie(a)huawei.com>
Cc: Baoquan He <bhe(a)redhat.com>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Vivek Goyal <vgoyal(a)redhat.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/crash_reserve.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/crash_reserve.c~crash-fix-x86_32-memory-reserve-dead-loop-retry-bug
+++ a/kernel/crash_reserve.c
@@ -421,7 +421,7 @@ retry:
* For crashkernel=size[KMG],high, if the first attempt was
* for high memory, fall back to low memory.
*/
- if (high && search_end == CRASH_ADDR_HIGH_MAX) {
+ if (high && search_base == CRASH_ADDR_LOW_MAX) {
search_end = CRASH_ADDR_LOW_MAX;
search_base = 0;
goto retry;
_
Patches currently in -mm which might be from ruanjinjie(a)huawei.com are
crash-fix-x86_32-memory-reserve-dead-loop-retry-bug.patch
Robert Gill reported below #GP when dosemu software was executing vm86()
system call:
general protection fault: 0000 [#1] PREEMPT SMP
CPU: 4 PID: 4610 Comm: dosemu.bin Not tainted 6.6.21-gentoo-x86 #1
Hardware name: Dell Inc. PowerEdge 1950/0H723K, BIOS 2.7.0 10/30/2010
EIP: restore_all_switch_stack+0xbe/0xcf
EAX: 00000000 EBX: 00000000 ECX: 00000000 EDX: 00000000
ESI: 00000000 EDI: 00000000 EBP: 00000000 ESP: ff8affdc
DS: 0000 ES: 0000 FS: 0000 GS: 0033 SS: 0068 EFLAGS: 00010046
CR0: 80050033 CR2: 00c2101c CR3: 04b6d000 CR4: 000406d0
Call Trace:
show_regs+0x70/0x78
die_addr+0x29/0x70
exc_general_protection+0x13c/0x348
exc_bounds+0x98/0x98
handle_exception+0x14d/0x14d
exc_bounds+0x98/0x98
restore_all_switch_stack+0xbe/0xcf
exc_bounds+0x98/0x98
restore_all_switch_stack+0xbe/0xcf
This only happens when VERW based mitigations like MDS/RFDS are enabled.
This is because segment registers with an arbitrary user value can result
in #GP when executing VERW. Intel SDM vol. 2C documents the following
behavior for VERW instruction:
#GP(0) - If a memory operand effective address is outside the CS, DS, ES,
FS, or GS segment limit.
CLEAR_CPU_BUFFERS macro executes VERW instruction before returning to user
space. Replace CLEAR_CPU_BUFFERS with a safer version that uses %ss to
refer VERW operand mds_verw_sel. This ensures VERW will not #GP for an
arbitrary user %ds. Also, in NMI return path, move VERW to after
RESTORE_ALL_NMI that touches GPRs.
For clarity, below are the locations where the new CLEAR_CPU_BUFFERS_SAFE
version is being used:
* entry_INT80_32(), entry_SYSENTER_32() and interrupts (via
handle_exception_return) do:
restore_all_switch_stack:
[...]
mov %esi,%esi
verw %ss:0xc0fc92c0 <-------------
iret
* Opportunistic SYSEXIT:
[...]
verw %ss:0xc0fc92c0 <-------------
btrl $0x9,(%esp)
popf
pop %eax
sti
sysexit
* nmi_return and nmi_from_espfix:
mov %esi,%esi
verw %ss:0xc0fc92c0 <-------------
jmp .Lirq_return
Fixes: a0e2dab44d22 ("x86/entry_32: Add VERW just before userspace transition")
Cc: stable(a)vger.kernel.org # 5.10+
Reported-by: Robert Gill <rtgill82(a)gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218707
Closes: https://lore.kernel.org/all/8c77ccfd-d561-45a1-8ed5-6b75212c7a58@leemhuis.i…
Suggested-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Suggested-by: Brian Gerst <brgerst(a)gmail.com> # Use %ss
Signed-off-by: Pawan Gupta <pawan.kumar.gupta(a)linux.intel.com>
---
v4:
- Further simplify the patch by using %ss for all VERW calls in 32-bit mode (Brian).
- In NMI exit path move VERW after RESTORE_ALL_NMI that touches GPRs (Dave).
v3: https://lore.kernel.org/r/20240701-fix-dosemu-vm86-v3-1-b1969532c75a@linux.…
- Simplify CLEAR_CPU_BUFFERS_SAFE by using %ss instead of %ds (Brian).
- Do verw before popf in SYSEXIT path (Jari).
v2: https://lore.kernel.org/r/20240627-fix-dosemu-vm86-v2-1-d5579f698e77@linux.…
- Safe guard against any other system calls like vm86() that might change %ds (Dave).
v1: https://lore.kernel.org/r/20240426-fix-dosemu-vm86-v1-1-88c826a3f378@linux.…
---
---
arch/x86/entry/entry_32.S | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d3a814efbff6..d54f6002e5a0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -253,6 +253,16 @@
.Lend_\@:
.endm
+/*
+ * Safer version of CLEAR_CPU_BUFFERS that uses %ss to reference VERW operand
+ * mds_verw_sel. This ensures VERW will not #GP for an arbitrary user %ds.
+ */
+.macro CLEAR_CPU_BUFFERS_SAFE
+ ALTERNATIVE "jmp .Lskip_verw\@", "", X86_FEATURE_CLEAR_CPU_BUF
+ verw %ss:_ASM_RIP(mds_verw_sel)
+.Lskip_verw\@:
+.endm
+
.macro RESTORE_INT_REGS
popl %ebx
popl %ecx
@@ -871,6 +881,8 @@ SYM_FUNC_START(entry_SYSENTER_32)
/* Now ready to switch the cr3 */
SWITCH_TO_USER_CR3 scratch_reg=%eax
+ /* Clobbers ZF */
+ CLEAR_CPU_BUFFERS_SAFE
/*
* Restore all flags except IF. (We restore IF separately because
@@ -881,7 +893,6 @@ SYM_FUNC_START(entry_SYSENTER_32)
BUG_IF_WRONG_CR3 no_user_check=1
popfl
popl %eax
- CLEAR_CPU_BUFFERS
/*
* Return back to the vDSO, which will pop ecx and edx.
@@ -951,7 +962,7 @@ restore_all_switch_stack:
/* Restore user state */
RESTORE_REGS pop=4 # skip orig_eax/error_code
- CLEAR_CPU_BUFFERS
+ CLEAR_CPU_BUFFERS_SAFE
.Lirq_return:
/*
* ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
@@ -1144,7 +1155,6 @@ SYM_CODE_START(asm_exc_nmi)
/* Not on SYSENTER stack. */
call exc_nmi
- CLEAR_CPU_BUFFERS
jmp .Lnmi_return
.Lnmi_from_sysenter_stack:
@@ -1165,6 +1175,7 @@ SYM_CODE_START(asm_exc_nmi)
CHECK_AND_APPLY_ESPFIX
RESTORE_ALL_NMI cr3_reg=%edi pop=4
+ CLEAR_CPU_BUFFERS_SAFE
jmp .Lirq_return
#ifdef CONFIG_X86_ESPFIX32
@@ -1206,6 +1217,7 @@ SYM_CODE_START(asm_exc_nmi)
* 1 - orig_ax
*/
lss (1+5+6)*4(%esp), %esp # back to espfix stack
+ CLEAR_CPU_BUFFERS_SAFE
jmp .Lirq_return
#endif
SYM_CODE_END(asm_exc_nmi)
---
base-commit: f2661062f16b2de5d7b6a5c42a9a5c96326b8454
change-id: 20240426-fix-dosemu-vm86-dd111a01737e
From: Kan Liang <kan.liang(a)linux.intel.com>
The EAX of the CPUID Leaf 023H enumerates the mask of valid sub-leaves.
To tell the availability of the sub-leaf 1 (enumerate the counter mask),
perf should check the bit 1 (0x2) of EAS, rather than bit 0 (0x1).
The error is not user-visible on bare metal. Because the sub-leaf 0 and
the sub-leaf 1 are always available. However, it may bring issues in a
virtualization environment when a VMM only enumerates the sub-leaf 0.
Fixes: eb467aaac21e ("perf/x86/intel: Support Architectural PerfMon Extension leaf")
Signed-off-by: Kan Liang <kan.liang(a)linux.intel.com>
Cc: stable(a)vger.kernel.org
---
arch/x86/events/intel/core.c | 4 ++--
arch/x86/include/asm/perf_event.h | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index cd8f2db6cdf6..3fb81f7b618c 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4842,8 +4842,8 @@ static void update_pmu_cap(struct x86_hybrid_pmu *pmu)
if (ebx & ARCH_PERFMON_EXT_EQ)
pmu->config_mask |= ARCH_PERFMON_EVENTSEL_EQ;
- if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF_BIT) {
- cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF,
+ if (sub_bitmaps & ARCH_PERFMON_NUM_COUNTER_LEAF) {
+ cpuid_count(ARCH_PERFMON_EXT_LEAF, ARCH_PERFMON_NUM_COUNTER_LEAF_BIT,
&eax, &ebx, &ecx, &edx);
pmu->cntr_mask64 = eax;
pmu->fixed_cntr_mask64 = ebx;
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 91b73571412f..41ace8431e01 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -190,7 +190,7 @@ union cpuid10_edx {
#define ARCH_PERFMON_EXT_UMASK2 0x1
#define ARCH_PERFMON_EXT_EQ 0x2
#define ARCH_PERFMON_NUM_COUNTER_LEAF_BIT 0x1
-#define ARCH_PERFMON_NUM_COUNTER_LEAF 0x1
+#define ARCH_PERFMON_NUM_COUNTER_LEAF BIT(ARCH_PERFMON_NUM_COUNTER_LEAF_BIT)
/*
* Intel Architectural LBR CPUID detection/enumeration details:
--
2.38.1