From: Baokun Li <libaokun1(a)huawei.com>
[ Upstream commit b4b4fda34e535756f9e774fb2d09c4537b7dfd1c ]
In the following concurrent scenario, the uninitialized rs->lock is accessed:
ext4_fill_super
ext4_register_sysfs
// sysfs registered msg_ratelimit_interval_ms
// Other processes modify rs->interval to
// non-zero via msg_ratelimit_interval_ms
ext4_orphan_cleanup
ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
__ext4_msg
___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state)
if (!rs->interval) // do nothing if interval is 0
return 1;
raw_spin_trylock_irqsave(&rs->lock, flags)
raw_spin_trylock(lock)
_raw_spin_trylock
__raw_spin_trylock
spin_acquire(&lock->dep_map, 0, 1, _RET_IP_)
lock_acquire
__lock_acquire
register_lock_class
assign_lock_key
dump_stack();
ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
raw_spin_lock_init(&rs->lock);
// init rs->lock here
and get the following dump_stack:
=========================================================
INFO: trying to register non-static key.
The code is fine but needs lockdep annotation, or maybe
you didn't initialize this object before use?
turning off the locking correctness validator.
CPU: 12 PID: 753 Comm: mount Tainted: G E 6.7.0-rc6-next-20231222 #504
[...]
Call Trace:
dump_stack_lvl+0xc5/0x170
dump_stack+0x18/0x30
register_lock_class+0x740/0x7c0
__lock_acquire+0x69/0x13a0
lock_acquire+0x120/0x450
_raw_spin_trylock+0x98/0xd0
___ratelimit+0xf6/0x220
__ext4_msg+0x7f/0x160 [ext4]
ext4_orphan_cleanup+0x665/0x740 [ext4]
__ext4_fill_super+0x21ea/0x2b10 [ext4]
ext4_fill_super+0x14d/0x360 [ext4]
[...]
=========================================================
Normally interval is 0 until s_msg_ratelimit_state is initialized, so
___ratelimit() does nothing. But registering sysfs precedes initializing
rs->lock, so it is possible to change rs->interval to a non-zero value
via the msg_ratelimit_interval_ms interface of sysfs while rs->lock is
uninitialized, and then a call to ext4_msg triggers the problem by
accessing an uninitialized rs->lock. Therefore register sysfs after all
initializations are complete to avoid such problems.
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20240102133730.1098120-1-libaokun1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
[Minor context change fixed.]
Signed-off-by: Bin Lan <bin.lan.cn(a)windriver.com>
Signed-off-by: He Zhe <zhe.he(a)windriver.com>
---
Build test passed.
---
fs/ext4/super.c | 22 ++++++++++------------
1 file changed, 10 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7f0231b34905..8528f61854ab 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5496,19 +5496,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (err)
goto failed_mount6;
- err = ext4_register_sysfs(sb);
- if (err)
- goto failed_mount7;
-
err = ext4_init_orphan_info(sb);
if (err)
- goto failed_mount8;
+ goto failed_mount7;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount. */
if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
err = ext4_enable_quotas(sb);
if (err)
- goto failed_mount9;
+ goto failed_mount8;
}
#endif /* CONFIG_QUOTA */
@@ -5534,7 +5530,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_msg(sb, KERN_INFO, "recovery complete");
err = ext4_mark_recovery_complete(sb, es);
if (err)
- goto failed_mount10;
+ goto failed_mount9;
}
if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
@@ -5551,15 +5547,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
atomic_set(&sbi->s_warning_count, 0);
atomic_set(&sbi->s_msg_count, 0);
+ /* Register sysfs after all initializations are complete. */
+ err = ext4_register_sysfs(sb);
+ if (err)
+ goto failed_mount9;
+
return 0;
-failed_mount10:
+failed_mount9:
ext4_quota_off_umount(sb);
-failed_mount9: __maybe_unused
+failed_mount8: __maybe_unused
ext4_release_orphan_info(sb);
-failed_mount8:
- ext4_unregister_sysfs(sb);
- kobject_put(&sbi->s_kobj);
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
--
2.34.1
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 94cff94634e506a4a44684bee1875d2dbf782722
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025051258-washbowl-alongside-de3d@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 94cff94634e506a4a44684bee1875d2dbf782722 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Date: Fri, 4 Apr 2025 15:31:16 +0200
Subject: [PATCH] clocksource/i8253: Use raw_spinlock_irqsave() in
clockevent_i8253_disable()
On x86 during boot, clockevent_i8253_disable() can be invoked via
x86_late_time_init -> hpet_time_init() -> pit_timer_init() which happens
with enabled interrupts.
If some of the old i8253 hardware is actually used then lockdep will notice
that i8253_lock is used in hard interrupt context. This causes lockdep to
complain because it observed the lock being acquired with interrupts
enabled and in hard interrupt context.
Make clockevent_i8253_disable() acquire the lock with
raw_spinlock_irqsave() to cure this.
[ tglx: Massage change log and use guard() ]
Fixes: c8c4076723dac ("x86/timer: Skip PIT initialization on modern chipsets")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy(a)linutronix.de>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/all/20250404133116.p-XRWJXf@linutronix.de
diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c
index 39f7c2d736d1..b603c25f3dfa 100644
--- a/drivers/clocksource/i8253.c
+++ b/drivers/clocksource/i8253.c
@@ -103,7 +103,7 @@ int __init clocksource_i8253_init(void)
#ifdef CONFIG_CLKEVT_I8253
void clockevent_i8253_disable(void)
{
- raw_spin_lock(&i8253_lock);
+ guard(raw_spinlock_irqsave)(&i8253_lock);
/*
* Writing the MODE register should stop the counter, according to
@@ -132,8 +132,6 @@ void clockevent_i8253_disable(void)
outb_p(0, PIT_CH0);
outb_p(0x30, PIT_MODE);
-
- raw_spin_unlock(&i8253_lock);
}
static int pit_shutdown(struct clock_event_device *evt)
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x ab00ddd802f80e31fc9639c652d736fe3913feae
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2025051245-jailbreak-unlinked-27ec@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ab00ddd802f80e31fc9639c652d736fe3913feae Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang(a)linux.alibaba.com>
Date: Wed, 23 Apr 2025 18:36:45 +0800
Subject: [PATCH] selftests/mm: compaction_test: support platform with huge
amount of memory
When running the mm selftests to verify mm patches, the 'compaction_test'
case failed on an x86 server with 1TB of memory. The root cause is that the
server has more free memory than the test supports.
The test case tries to allocate 100000 huge pages, which is about 200 GB
on that x86 server, and when the allocation succeeds, it expects the
result to be larger than 1/3 of 80% of the free memory in the system. This
logic only works for platforms with 750 GB ( 200 / (1/3) / 80% ) or less
free memory, and may raise a false alarm on others.
Fix it by replacing the fixed page count with a self-adjusting count
derived from the actual amount of free memory.
Link: https://lkml.kernel.org/r/20250423103645.2758-1-feng.tang@linux.alibaba.com
Fixes: bd67d5c15cc1 ("Test compaction of mlocked memory")
Signed-off-by: Feng Tang <feng.tang(a)linux.alibaba.com>
Acked-by: Dev Jain <dev.jain(a)arm.com>
Reviewed-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Sri Jayaramappa <sjayaram(a)akamai.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/tools/testing/selftests/mm/compaction_test.c b/tools/testing/selftests/mm/compaction_test.c
index 2c3a0eb6b22d..9bc4591c7b16 100644
--- a/tools/testing/selftests/mm/compaction_test.c
+++ b/tools/testing/selftests/mm/compaction_test.c
@@ -90,6 +90,8 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size,
int compaction_index = 0;
char nr_hugepages[20] = {0};
char init_nr_hugepages[24] = {0};
+ char target_nr_hugepages[24] = {0};
+ int slen;
snprintf(init_nr_hugepages, sizeof(init_nr_hugepages),
"%lu", initial_nr_hugepages);
@@ -106,11 +108,18 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size,
goto out;
}
- /* Request a large number of huge pages. The Kernel will allocate
- as much as it can */
- if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
- ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n",
- strerror(errno));
+ /*
+ * Request huge pages for about half of the free memory. The Kernel
+ * will allocate as much as it can, and we expect it will get at least 1/3
+ */
+ nr_hugepages_ul = mem_free / hugepage_size / 2;
+ snprintf(target_nr_hugepages, sizeof(target_nr_hugepages),
+ "%lu", nr_hugepages_ul);
+
+ slen = strlen(target_nr_hugepages);
+ if (write(fd, target_nr_hugepages, slen) != slen) {
+ ksft_print_msg("Failed to write %lu to /proc/sys/vm/nr_hugepages: %s\n",
+ nr_hugepages_ul, strerror(errno));
goto close_fd;
}
V2: do not add an extra read-back in vcn_v4_0_5_start, as there is a
read-back already. Reword the comment for better understanding.
On VCN v4.0.5 there is a race condition where the WPTR is not
updated after starting from idle when the doorbell is used. Reading
back the regVCN_RB1_DB_CTRL register after writing it ensures that
the doorbell_index has been updated, which is required for the
doorbell to work properly.
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12528
Cc: stable(a)vger.kernel.org
Signed-off-by: David (Ming Qiang) Wu <David.Wu3(a)amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello(a)amd.com>
Tested-by: Mario Limonciello <mario.limonciello(a)amd.com>
---
drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
index ed00d35039c13..e55b76d71367d 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
@@ -1034,6 +1034,10 @@ static int vcn_v4_0_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
ring->doorbell_index << VCN_RB1_DB_CTRL__OFFSET__SHIFT |
VCN_RB1_DB_CTRL__EN_MASK);
+ /* Keeping one read-back to ensure all register writes are done, otherwise
+ * it may introduce race conditions */
+ RREG32_SOC15(VCN, inst_idx, regVCN_RB1_DB_CTRL);
+
return 0;
}
--
2.34.1
From: Ashish Kalra <ashish.kalra(a)amd.com>
When the shared pages are being made private during kdump preparation
there are additional checks to handle shared GHCB pages.
These additional checks include handling the case of GHCB page being
contained within a huge page.
The check for handling the case of GHCB contained within a huge
page incorrectly skips a page just below the GHCB page from being
transitioned back to private during kdump preparation.
This skipped page causes a 0x404 #VC exception when it is accessed
later while dumping guest memory during vmcore generation via kdump.
Correct the range to be checked for GHCB contained in a huge page.
Also ensure that the skipped huge page containing the GHCB page is
transitioned back to private by applying the correct address mask
later when changing GHCBs to private at end of kdump preparation.
Fixes: 3074152e56c9 ("x86/sev: Convert shared memory back to private on kexec")
Cc: stable(a)vger.kernel.org
Signed-off-by: Ashish Kalra <ashish.kalra(a)amd.com>
---
arch/x86/coco/sev/core.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c
index d35fec7b164a..30b74e4e4e88 100644
--- a/arch/x86/coco/sev/core.c
+++ b/arch/x86/coco/sev/core.c
@@ -1019,7 +1019,8 @@ static void unshare_all_memory(void)
data = per_cpu(runtime_data, cpu);
ghcb = (unsigned long)&data->ghcb_page;
- if (addr <= ghcb && ghcb <= addr + size) {
+ /* Handle the case of a huge page containing the GHCB page */
+ if (addr <= ghcb && ghcb < addr + size) {
skipped_addr = true;
break;
}
@@ -1131,8 +1132,8 @@ static void shutdown_all_aps(void)
void snp_kexec_finish(void)
{
struct sev_es_runtime_data *data;
+ unsigned long size, addr;
unsigned int level, cpu;
- unsigned long size;
struct ghcb *ghcb;
pte_t *pte;
@@ -1160,8 +1161,10 @@ void snp_kexec_finish(void)
ghcb = &data->ghcb_page;
pte = lookup_address((unsigned long)ghcb, &level);
size = page_level_size(level);
- set_pte_enc(pte, level, (void *)ghcb);
- snp_set_memory_private((unsigned long)ghcb, (size / PAGE_SIZE));
+ /* Handle the case of a huge page containing the GHCB page */
+ addr = (unsigned long)ghcb & page_level_mask(level);
+ set_pte_enc(pte, level, (void *)addr);
+ snp_set_memory_private(addr, (size / PAGE_SIZE));
}
}
--
2.34.1
On VCN v4.0.5 there is a race condition where the WPTR is not
updated after starting from idle when the doorbell is used. Reading
back the regVCN_RB1_DB_CTRL register after writing it ensures that
the doorbell_index has been updated, which is required for the
doorbell to work properly.
Link: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12528
Signed-off-by: David (Ming Qiang) Wu <David.Wu3(a)amd.com>
---
drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
index ed00d35039c1..d6be8b05d7a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
@@ -1033,6 +1033,8 @@ static int vcn_v4_0_5_start_dpg_mode(struct amdgpu_vcn_inst *vinst,
WREG32_SOC15(VCN, inst_idx, regVCN_RB1_DB_CTRL,
ring->doorbell_index << VCN_RB1_DB_CTRL__OFFSET__SHIFT |
VCN_RB1_DB_CTRL__EN_MASK);
+ /* Read DB_CTRL to flush the write DB_CTRL command. */
+ RREG32_SOC15(VCN, inst_idx, regVCN_RB1_DB_CTRL);
return 0;
}
@@ -1195,6 +1197,8 @@ static int vcn_v4_0_5_start(struct amdgpu_vcn_inst *vinst)
WREG32_SOC15(VCN, i, regVCN_RB1_DB_CTRL,
ring->doorbell_index << VCN_RB1_DB_CTRL__OFFSET__SHIFT |
VCN_RB1_DB_CTRL__EN_MASK);
+ /* Read DB_CTRL to flush the write DB_CTRL command. */
+ RREG32_SOC15(VCN, i, regVCN_RB1_DB_CTRL);
WREG32_SOC15(VCN, i, regUVD_RB_BASE_LO, ring->gpu_addr);
WREG32_SOC15(VCN, i, regUVD_RB_BASE_HI, upper_32_bits(ring->gpu_addr));
--
2.49.0