In hrtimer_interrupt(), interrupts are disabled when acquiring a spinlock,
which subsequently triggers an oops. During the oops call chain,
blocking_notifier_call_chain() invokes _cond_resched, ultimately leading
to a hard lockup.
Call Stack:
hrtimer_interrupt//raw_spin_lock_irqsave
__hrtimer_run_queues
page_fault
do_page_fault
bad_area_nosemaphore
no_context
oops_end
bust_spinlocks
unblank_screen
do_unblank_screen
fbcon_blank
fb_notifier_call_chain
blocking_notifier_call_chain
down_read
_cond_resched
If the system is in an oops state, use down_read_trylock instead of a
blocking lock acquisition. If the trylock fails, skip executing the
notifier callbacks to avoid potential deadlocks or unsafe operations
during the oops handling process.
Cc: stable(a)vger.kernel.org # 6.6
Fixes: fe9d4f576324 ("Add kernel/notifier.c")
Signed-off-by: Yi Yang <yiyang13(a)huawei.com>
---
kernel/notifier.c | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
diff --git a/kernel/notifier.c b/kernel/notifier.c
index b3ce28f39eb6..ebff2315fac2 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -384,9 +384,18 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
* is, we re-check the list after having taken the lock anyway:
*/
if (rcu_access_pointer(nh->head)) {
- down_read(&nh->rwsem);
- ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
- up_read(&nh->rwsem);
+ if (!oops_in_progress) {
+ down_read(&nh->rwsem);
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
+ up_read(&nh->rwsem);
+ } else {
+ if (down_read_trylock(&nh->rwsem)) {
+ ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
+ up_read(&nh->rwsem);
+ } else {
+ ret = NOTIFY_BAD;
+ }
+ }
}
return ret;
}
--
2.25.1
When looking at the recent CI results on NIPA and MPTCP CIs, a few MPTCP
Join tests are marked as unstable. Here are some fixes for that.
- Patch 1: a small fix for mptcp_connect.sh, printing a note as
initially intended. For >=v5.13.
- Patch 2: avoid unexpected reset when closing subflows. For >= 5.13.
- Patches 3-4: longer transfer when not waiting for the end. For >=5.18.
- Patch 5: read all received data when expecting a reset. For >= v6.1.
- Patch 6: a fix to properly kill background tasks. For >= v6.5.
Signed-off-by: Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
---
Matthieu Baerts (NGI0) (6):
selftests: mptcp: connect: fix fallback note due to OoO
selftests: mptcp: join: rm: set backup flag
selftests: mptcp: join: endpoints: longer transfer
selftests: mptcp: join: userspace: longer transfer
selftests: mptcp: connect: trunc: read all recv data
selftests: mptcp: join: properly kill background tasks
tools/testing/selftests/net/mptcp/mptcp_connect.c | 18 +++--
tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 +-
tools/testing/selftests/net/mptcp/mptcp_join.sh | 90 +++++++++++-----------
tools/testing/selftests/net/mptcp/mptcp_lib.sh | 21 +++++
4 files changed, 80 insertions(+), 51 deletions(-)
---
base-commit: 96a9178a29a6b84bb632ebeb4e84cf61191c73d5
change-id: 20251108-net-mptcp-sft-join-unstable-5a28cdb6ea54
Best regards,
--
Matthieu Baerts (NGI0) <matttbe(a)kernel.org>
svm_update_lbrv() always updates LBR MSRs intercepts, even when they are
already set correctly. This results in force_msr_bitmap_recalc always
being set to true on every nested transition, essentially undoing the
hyperv optimization in nested_svm_merge_msrpm().
Fix it by keeping track of whether LBR MSRs are intercepted or not and
only doing the update if needed, similar to x2avic_msrs_intercepted.
Avoid using svm_test_msr_bitmap_*() to check the status of the
intercepts, as an arbitrary MSR will need to be chosen as a
representative of all LBR MSRs, and this could theoretically break if
some of the MSRs intercepts are handled differently from the rest.
Also, using svm_test_msr_bitmap_*() makes backports difficult as it was
only recently introduced with no direct alternatives in older kernels.
Fixes: fbe5e5f030c2 ("KVM: nSVM: Always recalculate LBR MSR intercepts in svm_update_lbrv()")
Cc: stable(a)vger.kernel.org
Signed-off-by: Yosry Ahmed <yosry.ahmed(a)linux.dev>
---
arch/x86/kvm/svm/svm.c | 9 ++++++++-
arch/x86/kvm/svm/svm.h | 1 +
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 10c21e4c5406f..9d29b2e7e855d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -705,7 +705,11 @@ void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask)
static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
{
- bool intercept = !(to_svm(vcpu)->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (intercept == svm->lbr_msrs_intercepted)
+ return;
svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHFROMIP, MSR_TYPE_RW, intercept);
svm_set_intercept_for_msr(vcpu, MSR_IA32_LASTBRANCHTOIP, MSR_TYPE_RW, intercept);
@@ -714,6 +718,8 @@ static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu)
if (sev_es_guest(vcpu->kvm))
svm_set_intercept_for_msr(vcpu, MSR_IA32_DEBUGCTLMSR, MSR_TYPE_RW, intercept);
+
+ svm->lbr_msrs_intercepted = intercept;
}
void svm_vcpu_free_msrpm(void *msrpm)
@@ -1221,6 +1227,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
}
svm->x2avic_msrs_intercepted = true;
+ svm->lbr_msrs_intercepted = true;
svm->vmcb01.ptr = page_address(vmcb01_page);
svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index c856d8e0f95e7..dd78e64023450 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -336,6 +336,7 @@ struct vcpu_svm {
bool guest_state_loaded;
bool x2avic_msrs_intercepted;
+ bool lbr_msrs_intercepted;
/* Guest GIF value, used when vGIF is not enabled */
bool guest_gif;
base-commit: 8a4821412cf2c1429fffa07c012dd150f2edf78c
--
2.51.2.1041.gc1ab5b90ca-goog
The patch below does not apply to the 6.12-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.12.y
git checkout FETCH_HEAD
git cherry-pick -x 16c43a56b79e2c3220b043236369a129d508c65a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025110816-catalog-residency-716f@gregkh' --subject-prefix 'PATCH 6.12.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 16c43a56b79e2c3220b043236369a129d508c65a Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda(a)kernel.org>
Date: Sun, 2 Nov 2025 22:28:52 +0100
Subject: [PATCH] rust: kbuild: treat `build_error` and `rustdoc` as kernel
objects
Even if normally `build_error` isn't a kernel object, it should still
be treated as such so that we pass the same flags. Similarly, `rustdoc`
targets are never kernel objects, but we need to treat them as such.
Otherwise, starting with Rust 1.91.0 (released 2025-10-30), `rustc`
will complain about missing sanitizer flags since `-Zsanitizer` is a
target modifier too [1]:
error: mixing `-Zsanitizer` will cause an ABI mismatch in crate `build_error`
--> rust/build_error.rs:3:1
|
3 | //! Build-time error.
| ^
|
= help: the `-Zsanitizer` flag modifies the ABI so Rust crates compiled with different values of this flag cannot be used together safely
= note: unset `-Zsanitizer` in this crate is incompatible with `-Zsanitizer=kernel-address` in dependency `core`
= help: set `-Zsanitizer=kernel-address` in this crate or unset `-Zsanitizer` in `core`
= help: if you are sure this will not cause problems, you may use `-Cunsafe-allow-abi-mismatch=sanitizer` to silence this error
Thus explicitly mark them as kernel objects.
Cc: stable(a)vger.kernel.org # Needed in 6.12.y and later (Rust is pinned in older LTSs).
Link: https://github.com/rust-lang/rust/pull/138736 [1]
Reviewed-by: Alice Ryhl <aliceryhl(a)google.com>
Tested-by: Justin M. Forbes <jforbes(a)fedoraproject.org>
Link: https://patch.msgid.link/20251102212853.1505384-1-ojeda@kernel.org
Signed-off-by: Miguel Ojeda <ojeda(a)kernel.org>
diff --git a/rust/Makefile b/rust/Makefile
index 23c7ae905bd2..a9fb9354b659 100644
--- a/rust/Makefile
+++ b/rust/Makefile
@@ -127,9 +127,14 @@ rustdoc-core: private rustc_target_flags = --edition=$(core-edition) $(core-cfgs
rustdoc-core: $(RUST_LIB_SRC)/core/src/lib.rs rustdoc-clean FORCE
+$(call if_changed,rustdoc)
+# Even if `rustdoc` targets are not kernel objects, they should still be
+# treated as such so that we pass the same flags. Otherwise, for instance,
+# `rustdoc` will complain about missing sanitizer flags causing an ABI mismatch.
+rustdoc-compiler_builtins: private is-kernel-object := y
rustdoc-compiler_builtins: $(src)/compiler_builtins.rs rustdoc-core FORCE
+$(call if_changed,rustdoc)
+rustdoc-ffi: private is-kernel-object := y
rustdoc-ffi: $(src)/ffi.rs rustdoc-core FORCE
+$(call if_changed,rustdoc)
@@ -147,6 +152,7 @@ rustdoc-pin_init: $(src)/pin-init/src/lib.rs rustdoc-pin_init_internal \
rustdoc-macros FORCE
+$(call if_changed,rustdoc)
+rustdoc-kernel: private is-kernel-object := y
rustdoc-kernel: private rustc_target_flags = --extern ffi --extern pin_init \
--extern build_error --extern macros \
--extern bindings --extern uapi
@@ -522,6 +528,10 @@ $(obj)/pin_init.o: $(src)/pin-init/src/lib.rs $(obj)/compiler_builtins.o \
$(obj)/$(libpin_init_internal_name) $(obj)/$(libmacros_name) FORCE
+$(call if_changed_rule,rustc_library)
+# Even if normally `build_error` is not a kernel object, it should still be
+# treated as such so that we pass the same flags. Otherwise, for instance,
+# `rustc` will complain about missing sanitizer flags causing an ABI mismatch.
+$(obj)/build_error.o: private is-kernel-object := y
$(obj)/build_error.o: private skip_gendwarfksyms = 1
$(obj)/build_error.o: $(src)/build_error.rs $(obj)/compiler_builtins.o FORCE
+$(call if_changed_rule,rustc_library)
The quilt patch titled
Subject: crash: let architecture decide crash memory export to iomem_resource
has been removed from the -mm tree. Its filename was
crash-let-architecture-decide-crash-memory-export-to-iomem_resource.patch
This patch was dropped because it was merged into the mm-nonmm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Sourabh Jain <sourabhjain(a)linux.ibm.com>
Subject: crash: let architecture decide crash memory export to iomem_resource
Date: Thu, 16 Oct 2025 19:58:31 +0530
With the generic crashkernel reservation, the kernel emits the following
warning on powerpc:
WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/mem.c:341 add_system_ram_resources+0xfc/0x180
Modules linked in:
CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-auto-12607-g5472d60c129f #1 VOLUNTARY
Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.01 (NH1110_069) hv:phyp pSeries
NIP: c00000000201de3c LR: c00000000201de34 CTR: 0000000000000000
REGS: c000000127cef8a0 TRAP: 0700 Not tainted (6.17.0-auto-12607-g5472d60c129f)
MSR: 8000000002029033 <SF,VEC,EE,ME,IR,DR,RI,LE> CR: 84000840 XER: 20040010
CFAR: c00000000017eed0 IRQMASK: 0
GPR00: c00000000201de34 c000000127cefb40 c0000000016a8100 0000000000000001
GPR04: c00000012005aa00 0000000020000000 c000000002b705c8 0000000000000000
GPR08: 000000007fffffff fffffffffffffff0 c000000002db8100 000000011fffffff
GPR12: c00000000201dd40 c000000002ff0000 c0000000000112bc 0000000000000000
GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR20: 0000000000000000 0000000000000000 0000000000000000 c0000000015a3808
GPR24: c00000000200468c c000000001699888 0000000000000106 c0000000020d1950
GPR28: c0000000014683f8 0000000081000200 c0000000015c1868 c000000002b9f710
NIP [c00000000201de3c] add_system_ram_resources+0xfc/0x180
LR [c00000000201de34] add_system_ram_resources+0xf4/0x180
Call Trace:
add_system_ram_resources+0xf4/0x180 (unreliable)
do_one_initcall+0x60/0x36c
do_initcalls+0x120/0x220
kernel_init_freeable+0x23c/0x390
kernel_init+0x34/0x26c
ret_from_kernel_user_thread+0x14/0x1c
This warning occurs due to a conflict between crashkernel and System RAM
iomem resources.
The generic crashkernel reservation adds the crashkernel memory range to
/proc/iomem during early initialization. Later, all memblock ranges are
added to /proc/iomem as System RAM. If the crashkernel region overlaps
with any memblock range, it causes a conflict while adding those memblock
regions as iomem resources, triggering the above warning. The conflicting
memblock regions are then omitted from /proc/iomem.
For example, if the following crashkernel region is added to /proc/iomem:
20000000-11fffffff : Crash kernel
then the following memblock regions System RAM regions fail to be inserted:
00000000-7fffffff : System RAM
80000000-257fffffff : System RAM
Fix this by not adding the crashkernel memory to /proc/iomem on powerpc.
Introduce an architecture hook to let each architecture decide whether to
export the crashkernel region to /proc/iomem.
For more info checkout commit c40dd2f766440 ("powerpc: Add System RAM
to /proc/iomem") and commit bce074bdbc36 ("powerpc: insert System RAM
resource to prevent crashkernel conflict")
Note: Before switching to the generic crashkernel reservation, powerpc
never exported the crashkernel region to /proc/iomem.
Link: https://lkml.kernel.org/r/20251016142831.144515-1-sourabhjain@linux.ibm.com
Fixes: e3185ee438c2 ("powerpc/crash: use generic crashkernel reservation").
Signed-off-by: Sourabh Jain <sourabhjain(a)linux.ibm.com>
Reported-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com>
Closes: https://lore.kernel.org/all/90937fe0-2e76-4c82-b27e-7b8a7fe3ac69@linux.ibm.…
Tested-by: Venkat Rao Bagalkote <venkat88(a)linux.ibm.com>
Cc: Baoquan he <bhe(a)redhat.com>
Cc: Hari Bathini <hbathini(a)linux.ibm.com>
Cc: Madhavan Srinivasan <maddy(a)linux.ibm.com>
Cc: Mahesh Salgaonkar <mahesh(a)linux.ibm.com>
Cc: Michael Ellerman <mpe(a)ellerman.id.au>
Cc: Ritesh Harjani (IBM) <ritesh.list(a)gmail.com>
Cc: Vivek Goyal <vgoyal(a)redhat.com>
Cc: Dave Young <dyoung(a)redhat.com>
Cc: Mike Rapoport <rppt(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
arch/powerpc/include/asm/crash_reserve.h | 8 ++++++++
include/linux/crash_reserve.h | 6 ++++++
kernel/crash_reserve.c | 3 +++
3 files changed, 17 insertions(+)
--- a/arch/powerpc/include/asm/crash_reserve.h~crash-let-architecture-decide-crash-memory-export-to-iomem_resource
+++ a/arch/powerpc/include/asm/crash_reserve.h
@@ -5,4 +5,12 @@
/* crash kernel regions are Page size agliged */
#define CRASH_ALIGN PAGE_SIZE
+#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+static inline bool arch_add_crash_res_to_iomem(void)
+{
+ return false;
+}
+#define arch_add_crash_res_to_iomem arch_add_crash_res_to_iomem
+#endif
+
#endif /* _ASM_POWERPC_CRASH_RESERVE_H */
--- a/include/linux/crash_reserve.h~crash-let-architecture-decide-crash-memory-export-to-iomem_resource
+++ a/include/linux/crash_reserve.h
@@ -32,6 +32,12 @@ int __init parse_crashkernel(char *cmdli
void __init reserve_crashkernel_cma(unsigned long long cma_size);
#ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
+#ifndef arch_add_crash_res_to_iomem
+static inline bool arch_add_crash_res_to_iomem(void)
+{
+ return true;
+}
+#endif
#ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20)
#endif
--- a/kernel/crash_reserve.c~crash-let-architecture-decide-crash-memory-export-to-iomem_resource
+++ a/kernel/crash_reserve.c
@@ -524,6 +524,9 @@ void __init reserve_crashkernel_cma(unsi
#ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY
static __init int insert_crashkernel_resources(void)
{
+ if (!arch_add_crash_res_to_iomem())
+ return 0;
+
if (crashk_res.start < crashk_res.end)
insert_resource(&iomem_resource, &crashk_res);
_
Patches currently in -mm which might be from sourabhjain(a)linux.ibm.com are
crash-fix-crashkernel-resource-shrink.patch
The quilt patch titled
Subject: scs: fix a wrong parameter in __scs_magic
has been removed from the -mm tree. Its filename was
scs-fix-a-wrong-parameter-in-__scs_magic.patch
This patch was dropped because it was merged into the mm-nonmm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: Zhichi Lin <zhichi.lin(a)vivo.com>
Subject: scs: fix a wrong parameter in __scs_magic
Date: Sat, 11 Oct 2025 16:22:22 +0800
__scs_magic() needs a 'void *' variable, but a 'struct task_struct *' is
given. 'task_scs(tsk)' is the starting address of the task's shadow call
stack, and '__scs_magic(task_scs(tsk))' is the end address of the task's
shadow call stack. Here should be '__scs_magic(task_scs(tsk))'.
The user-visible effect of this bug is that when CONFIG_DEBUG_STACK_USAGE
is enabled, the shadow call stack usage checking function
(scs_check_usage) would scan an incorrect memory range. This could lead
to:
1. **Inaccurate stack usage reporting**: The function would calculate
wrong usage statistics for the shadow call stack, potentially showing
incorrect value in kmsg.
2. **Potential kernel crash**: If the value of __scs_magic(tsk)is
greater than that of __scs_magic(task_scs(tsk)), the for loop may
access unmapped memory, potentially causing a kernel panic. However,
this scenario is unlikely because task_struct is allocated via the slab
allocator (which typically returns lower addresses), while the shadow
call stack returned by task_scs(tsk) is allocated via vmalloc(which
typically returns higher addresses).
However, since this is purely a debugging feature
(CONFIG_DEBUG_STACK_USAGE), normal production systems should be not
unaffected. The bug only impacts developers and testers who are actively
debugging stack usage with this configuration enabled.
Link: https://lkml.kernel.org/r/20251011082222.12965-1-zhichi.lin@vivo.com
Fixes: 5bbaf9d1fcb9 ("scs: Add support for stack usage debugging")
Signed-off-by: Jiyuan Xie <xiejiyuan(a)vivo.com>
Signed-off-by: Zhichi Lin <zhichi.lin(a)vivo.com>
Reviewed-by: Sami Tolvanen <samitolvanen(a)google.com>
Acked-by: Will Deacon <will(a)kernel.org>
Cc: Andrey Konovalov <andreyknvl(a)gmail.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Marco Elver <elver(a)google.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: Yee Lee <yee.lee(a)mediatek.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
kernel/scs.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/scs.c~scs-fix-a-wrong-parameter-in-__scs_magic
+++ a/kernel/scs.c
@@ -135,7 +135,7 @@ static void scs_check_usage(struct task_
if (!IS_ENABLED(CONFIG_DEBUG_STACK_USAGE))
return;
- for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) {
+ for (p = task_scs(tsk); p < __scs_magic(task_scs(tsk)); ++p) {
if (!READ_ONCE_NOCHECK(*p))
break;
used += sizeof(*p);
_
Patches currently in -mm which might be from zhichi.lin(a)vivo.com are
F2FS can mount filesystems with corrupted directory depth values that
get runtime-clamped to MAX_DIR_HASH_DEPTH. When RENAME_WHITEOUT
operations are performed on such directories, f2fs_rename performs
directory modifications (updating target entry and deleting source
entry) before attempting to add the whiteout entry via f2fs_add_link.
If f2fs_add_link fails due to the corrupted directory structure, the
function returns an error to VFS, but the partial directory
modifications have already been committed to disk. VFS assumes the
entire rename operation failed and does not update the dentry cache,
leaving stale mappings.
In the error path, VFS does not call d_move() to update the dentry
cache. This results in new_dentry still pointing to the old inode
(new_inode) which has already had its i_nlink decremented to zero.
The stale cache causes subsequent operations to incorrectly reference
the freed inode.
This causes subsequent operations to use cached dentry information that
no longer matches the on-disk state. When a second rename targets the
same entry, VFS attempts to decrement i_nlink on the stale inode, which
may already have i_nlink=0, triggering a WARNING in drop_nlink().
Example sequence:
1. First rename (RENAME_WHITEOUT): file2 → file1
- f2fs updates file1 entry on disk (points to inode 8)
- f2fs deletes file2 entry on disk
- f2fs_add_link(whiteout) fails (corrupted directory)
- Returns error to VFS
- VFS does not call d_move() due to error
- VFS cache still has: file1 → inode 7 (stale!)
- inode 7 has i_nlink=0 (already decremented)
2. Second rename: file3 → file1
- VFS uses stale cache: file1 → inode 7
- Tries to drop_nlink on inode 7 (i_nlink already 0)
- WARNING in drop_nlink()
Fix this by explicitly invalidating old_dentry and new_dentry when
f2fs_add_link fails during whiteout creation. This forces VFS to
refresh from disk on subsequent operations, ensuring cache consistency
even when the rename partially succeeds.
Reproducer:
1. Mount F2FS image with corrupted i_current_depth
2. renameat2(file2, file1, RENAME_WHITEOUT)
3. renameat2(file3, file1, 0)
4. System triggers WARNING in drop_nlink()
Fixes: 7e01e7ad746b ("f2fs: support RENAME_WHITEOUT")
Reported-by: syzbot+632cf32276a9a564188d(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=632cf32276a9a564188d
Suggested-by: Chao Yu <chao(a)kernel.org>
Link: https://lore.kernel.org/all/20251022233349.102728-1-kartikey406@gmail.com/ [v1]
Cc: stable(a)vger.kernel.org
Signed-off-by: Deepanshu Kartikey <kartikey406(a)gmail.com>
---
Changes in v2:
- Added detailed explanation about VFS not calling d_move() in error path,
resulting in new_dentry still pointing to inode with zeroed i_nlink
(suggested by Chao Yu)
- Added Fixes tag pointing to commit 7e01e7ad746b
- Added Cc: stable(a)vger.kernel.org for backporting to stable kernels
---
fs/f2fs/namei.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b882771e4699..712479b7b93d 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1053,9 +1053,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (whiteout) {
set_inode_flag(whiteout, FI_INC_LINK);
err = f2fs_add_link(old_dentry, whiteout);
- if (err)
+ if (err) {
+ d_invalidate(old_dentry);
+ d_invalidate(new_dentry);
goto put_out_dir;
-
+ }
spin_lock(&whiteout->i_lock);
whiteout->i_state &= ~I_LINKABLE;
spin_unlock(&whiteout->i_lock);
--
2.43.0
svm_update_lbrv() is called when MSR_IA32_DEBUGCTLMSR is updated, and on
nested transitions where LBRV is used. It checks whether LBRV enablement
needs to be changed in the current VMCB, and if it does, it also
recalculate intercepts to LBR MSRs.
However, there are cases where intercepts need to be updated even when
LBRV enablement doesn't. Example scenario:
- L1 has MSR_IA32_DEBUGCTLMSR cleared.
- L1 runs L2 without LBR_CTL_ENABLE (no LBRV).
- L2 sets DEBUGCTLMSR_LBR in MSR_IA32_DEBUGCTLMSR, svm_update_lbrv()
sets LBR_CTL_ENABLE in VMCB02 and disables intercepts to LBR MSRs.
- L2 exits to L1, svm_update_lbrv() is not called on this transition.
- L1 clears MSR_IA32_DEBUGCTLMSR, svm_update_lbrv() finds that
LBR_CTL_ENABLE is already cleared in VMCB01 and does nothing.
- Intercepts remain disabled, L1 reads to LBR MSRs read the host MSRs.
Fix it by always recalculating intercepts in svm_update_lbrv().
Fixes: 1d5a1b5860ed ("KVM: x86: nSVM: correctly virtualize LBR msrs when L2 is running")
Cc: stable(a)vger.kernel.org
Signed-off-by: Yosry Ahmed <yosry.ahmed(a)linux.dev>
---
arch/x86/kvm/svm/svm.c | 29 +++++++++++++++++++----------
1 file changed, 19 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index d25c56b30b4e2..26ab75ecf1c67 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -806,25 +806,29 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}
-void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- svm_recalc_lbr_msr_intercepts(vcpu);
/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
if (is_guest_mode(vcpu))
svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
}
-static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
+void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+{
+ __svm_enable_lbrv(vcpu);
+ svm_recalc_lbr_msr_intercepts(vcpu);
+}
+
+static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
- svm_recalc_lbr_msr_intercepts(vcpu);
/*
* Move the LBR msrs back to the vmcb01 to avoid copying them
@@ -853,13 +857,18 @@ void svm_update_lbrv(struct kvm_vcpu *vcpu)
(is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
- if (enable_lbrv == current_enable_lbrv)
- return;
+ if (enable_lbrv && !current_enable_lbrv)
+ __svm_enable_lbrv(vcpu);
+ else if (!enable_lbrv && current_enable_lbrv)
+ __svm_disable_lbrv(vcpu);
- if (enable_lbrv)
- svm_enable_lbrv(vcpu);
- else
- svm_disable_lbrv(vcpu);
+ /*
+ * During nested transitions, it is possible that the current VMCB has
+ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
+ * In this case, even though LBR_CTL does not need an update, intercepts
+ * do, so always recalculate the intercepts here.
+ */
+ svm_recalc_lbr_msr_intercepts(vcpu);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
--
2.51.2.1041.gc1ab5b90ca-goog