- Linux-stable-mirror - lists.linaro.org

[PATCH] LoongArch: Avoid using $r0/$r1 as "mask" for csrxchg

by Huacai Chen

When building kernel with LLVM there are occasionally such errors: In file included from ./include/linux/spinlock.h:59: In file included from ./include/linux/irqflags.h:17: arch/loongarch/include/asm/irqflags.h:38:3: error: must not be $r0 or $r1 38 | "csrxchg %[val], %[mask], %[reg]\n\t" | ^ <inline asm>:1:16: note: instantiated into assembly here 1 | csrxchg $a1, $ra, 0 | ^ The "mask" of the csrxchg instruction should not be $r0 or $r1, but the compiler cannot avoid generating such code currently. So force to use t0 in the inline asm, in order to avoid using $r0/$r1. Cc: stable(a)vger.kernel.org Suggested-by: WANG Rui <wangrui(a)loongson.cn> Signed-off-by: Huacai Chen <chenhuacai(a)loongson.cn> --- arch/loongarch/include/asm/irqflags.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/irqflags.h b/arch/loongarch/include/asm/irqflags.h index 319a8c616f1f..003172b8406b 100644 --- a/arch/loongarch/include/asm/irqflags.h +++ b/arch/loongarch/include/asm/irqflags.h @@ -14,40 +14,48 @@ static inline void arch_local_irq_enable(void) { u32 flags = CSR_CRMD_IE; + register u32 mask asm("t0") = CSR_CRMD_IE; + __asm__ __volatile__( "csrxchg %[val], %[mask], %[reg]\n\t" : [val] "+r" (flags) - : [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) : "memory"); } static inline void arch_local_irq_disable(void) { u32 flags = 0; + register u32 mask asm("t0") = CSR_CRMD_IE; + __asm__ __volatile__( "csrxchg %[val], %[mask], %[reg]\n\t" : [val] "+r" (flags) - : [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) : "memory"); } static inline unsigned long arch_local_irq_save(void) { u32 flags = 0; + register u32 mask asm("t0") = CSR_CRMD_IE; + __asm__ __volatile__( "csrxchg %[val], %[mask], %[reg]\n\t" : [val] "+r" (flags) - : [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) : "memory"); return flags; } static inline void arch_local_irq_restore(unsigned long flags) { + register u32 mask asm("t0") = CSR_CRMD_IE; + __asm__ __volatile__( "csrxchg %[val], %[mask], %[reg]\n\t" : [val] "+r" (flags) - : [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) : "memory"); } -- 2.47.1

1 month, 3 weeks

4
3
0 0

[PATCH v1 1/1] x86/fred/signal: Prevent single-step upon ERETU completion

by Xin Li (Intel)

From: Xin Li <xin(a)zytor.com> Clear the software event flag in the augmented SS to prevent infinite SIGTRAP handler loop if TF is used without an external debugger. Following is a typical single-stepping flow for a user process: 1) The user process is prepared for single-stepping by setting RFLAGS.TF = 1. 2) When any instruction in user space completes, a #DB is triggered. 3) The kernel handles the #DB and returns to user space, invoking the SIGTRAP handler with RFLAGS.TF = 0. 4) After the SIGTRAP handler finishes, the user process performs a sigreturn syscall, restoring the original state, including RFLAGS.TF = 1. 5) Goto step 2. According to the FRED specification: A) Bit 17 in the augmented SS is designated as the software event flag, which is set to 1 for FRED event delivery of SYSCALL, SYSENTER, or INT n. B) If bit 17 of the augmented SS is 1 and ERETU would result in RFLAGS.TF = 1, a single-step trap will be pending upon completion of ERETU. In step 4) above, the software event flag is set upon the sigreturn syscall, and its corresponding ERETU would restore RFLAGS.TF = 1. This combination causes a pending single-step trap upon completion of ERETU. Therefore, another #DB is triggered before any user space instruction is executed, which leads to an infinite loop in which the SIGTRAP handler keeps being invoked on the same user space IP. Suggested-by: H. Peter Anvin (Intel) <hpa(a)zytor.com> Signed-off-by: Xin Li (Intel) <xin(a)zytor.com> Cc: stable(a)vger.kernel.org --- arch/x86/include/asm/sighandling.h | 20 ++++++++++++++++++++ arch/x86/kernel/signal_32.c | 4 ++++ arch/x86/kernel/signal_64.c | 4 ++++ 3 files changed, 28 insertions(+) diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index e770c4fc47f4..ecb0411fe88c 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -24,4 +24,24 @@ int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +/* + * To prevent infinite SIGTRAP handler loop if TF is used without an external + * debugger, clear the software event flag in the augmented SS, ensuring no + * single-step trap is pending upon ERETU completion. + * + * Note, this function should be called in sigreturn() before the original state + * is restored to make sure the TF is read from the entry frame. + */ +static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs) +{ + /* + * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction + * is being single-stepped, do not clear the software event flag in the + * augmented SS, thus a debugger won't skip over the following instruction. + */ + if (IS_ENABLED(CONFIG_X86_FRED) && cpu_feature_enabled(X86_FEATURE_FRED) && + !(regs->flags & X86_EFLAGS_TF)) + regs->fred_ss.swevent = 0; +} + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 98123ff10506..42bbc42bd350 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -152,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn) struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; + prevent_single_step_upon_eretu(regs); + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) @@ -175,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn) struct rt_sigframe_ia32 __user *frame; sigset_t set; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); if (!access_ok(frame, sizeof(*frame))) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ee9453891901..d483b585c6c6 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -366,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); if (!access_ok(frame, sizeof(*frame))) base-commit: 6a7c3c2606105a41dde81002c0037420bc1ddf00 -- 2.49.0

1 month, 3 weeks

6
8
0 0

[PATCH v3 1/1] x86/fred/signal: Prevent single-step upon ERETU completion

by Xin Li (Intel)

From: Xin Li <xin(a)zytor.com> Clear the software event flag in the augmented SS to prevent infinite SIGTRAP handler loop if TF is used without an external debugger. Following is a typical single-stepping flow for a user process: 1) The user process is prepared for single-stepping by setting RFLAGS.TF = 1. 2) When any instruction in user space completes, a #DB is triggered. 3) The kernel handles the #DB and returns to user space, invoking the SIGTRAP handler with RFLAGS.TF = 0. 4) After the SIGTRAP handler finishes, the user process performs a sigreturn syscall, restoring the original state, including RFLAGS.TF = 1. 5) Goto step 2. According to the FRED specification: A) Bit 17 in the augmented SS is designated as the software event flag, which is set to 1 for FRED event delivery of SYSCALL, SYSENTER, or INT n. B) If bit 17 of the augmented SS is 1 and ERETU would result in RFLAGS.TF = 1, a single-step trap will be pending upon completion of ERETU. In step 4) above, the software event flag is set upon the sigreturn syscall, and its corresponding ERETU would restore RFLAGS.TF = 1. This combination causes a pending single-step trap upon completion of ERETU. Therefore, another #DB is triggered before any user space instruction is executed, which leads to an infinite loop in which the SIGTRAP handler keeps being invoked on the same user space IP. Suggested-by: H. Peter Anvin (Intel) <hpa(a)zytor.com> Signed-off-by: Xin Li (Intel) <xin(a)zytor.com> Cc: stable(a)vger.kernel.org --- Change in v3: *) Use "#ifdef CONFIG_X86_FRED" instead of IS_ENABLED(CONFIG_X86_FRED) (Intel LKP). Change in v2: *) Remove the check cpu_feature_enabled(X86_FEATURE_FRED), because regs->fred_ss.swevent will always be 0 otherwise (H. Peter Anvin). --- arch/x86/include/asm/sighandling.h | 21 +++++++++++++++++++++ arch/x86/kernel/signal_32.c | 4 ++++ arch/x86/kernel/signal_64.c | 4 ++++ 3 files changed, 29 insertions(+) diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index e770c4fc47f4..530eecc371fc 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -24,4 +24,25 @@ int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +/* + * To prevent infinite SIGTRAP handler loop if TF is used without an external + * debugger, clear the software event flag in the augmented SS, ensuring no + * single-step trap is pending upon ERETU completion. + * + * Note, this function should be called in sigreturn() before the original state + * is restored to make sure the TF is read from the entry frame. + */ +static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs) +{ + /* + * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction + * is being single-stepped, do not clear the software event flag in the + * augmented SS, thus a debugger won't skip over the following instruction. + */ +#ifdef CONFIG_X86_FRED + if (!(regs->flags & X86_EFLAGS_TF)) + regs->fred_ss.swevent = 0; +#endif +} + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 98123ff10506..42bbc42bd350 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -152,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn) struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; + prevent_single_step_upon_eretu(regs); + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) @@ -175,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn) struct rt_sigframe_ia32 __user *frame; sigset_t set; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); if (!access_ok(frame, sizeof(*frame))) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ee9453891901..d483b585c6c6 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -366,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); if (!access_ok(frame, sizeof(*frame))) base-commit: 6a7c3c2606105a41dde81002c0037420bc1ddf00 -- 2.49.0

1 month, 3 weeks

1
0
0 0

Re: Patch "btrfs: allow buffered write to avoid full page read if it's block aligned" has been added to the 6.14-stable tree

by Qu Wenruo

在 2025/5/23 06:35, Sasha Levin 写道: > This is a note to let you know that I've just added the patch titled > > btrfs: allow buffered write to avoid full page read if it's block aligned > > to the 6.14-stable tree which can be found at: > http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… > > The filename of the patch is: > btrfs-allow-buffered-write-to-avoid-full-page-read-i.patch > and it can be found in the queue-6.14 subdirectory. > > If you, or anyone else, feels it should not be added to the stable tree, > please let <stable(a)vger.kernel.org> know about it. Please drop this patch from all stable branches. Although this patch mentions a failure in fstests, it acts more like an optimization for btrfs. Furthermore it relies quite some patches that may not be in stable kernels. Without all the dependency, this can lead to data corruption. Please drop this one from all stable kernels. Thanks, Qu > > > > commit de0860d610aaaee77a8c5c713c41fea584ac83b3 > Author: Qu Wenruo <wqu(a)suse.com> > Date: Wed Oct 30 17:04:02 2024 +1030 > > btrfs: allow buffered write to avoid full page read if it's block aligned > > [ Upstream commit 0d31ca6584f21821c708752d379871b9fce2dc48 ] > > [BUG] > Since the support of block size (sector size) < page size for btrfs, > test case generic/563 fails with 4K block size and 64K page size: > > --- tests/generic/563.out 2024-04-25 18:13:45.178550333 +0930 > +++ /home/adam/xfstests-dev/results//generic/563.out.bad 2024-09-30 09:09:16.155312379 +0930 > @@ -3,7 +3,8 @@ > read is in range > write is in range > write -> read/write > -read is in range > +read has value of 8388608 > +read is NOT in range -33792 .. 33792 > write is in range > ... > > [CAUSE] > The test case creates a 8MiB file, then does buffered write into the 8MiB > using 4K block size, to overwrite the whole file. > > On 4K page sized systems, since the write range covers the full block and > page, btrfs will not bother reading the page, just like what XFS and EXT4 > do. > > But on 64K page sized systems, although the 4K sized write is still block > aligned, it's not page aligned anymore, thus btrfs will read the full > page, which will be accounted by cgroup and fail the test. > > As the test case itself expects such 4K block aligned write should not > trigger any read. > > Such expected behavior is an optimization to reduce folio reads when > possible, and unfortunately btrfs does not implement such optimization. > > [FIX] > To skip the full page read, we need to do the following modification: > > - Do not trigger full page read as long as the buffered write is block > aligned > This is pretty simple by modifying the check inside > prepare_uptodate_page(). > > - Skip already uptodate blocks during full page read > Or we can lead to the following data corruption: > > 0 32K 64K > |///////| | > > Where the file range [0, 32K) is dirtied by buffered write, the > remaining range [32K, 64K) is not. > > When reading the full page, since [0,32K) is only dirtied but not > written back, there is no data extent map for it, but a hole covering > [0, 64k). > > If we continue reading the full page range [0, 64K), the dirtied range > will be filled with 0 (since there is only a hole covering the whole > range). > This causes the dirtied range to get lost. > > With this optimization, btrfs can pass generic/563 even if the page size > is larger than fs block size. > > Reviewed-by: Filipe Manana <fdmanana(a)suse.com> > Signed-off-by: Qu Wenruo <wqu(a)suse.com> > Signed-off-by: David Sterba <dsterba(a)suse.com> > Signed-off-by: Sasha Levin <sashal(a)kernel.org> > > diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c > index 06922529f19dc..13b5359ea1b77 100644 > --- a/fs/btrfs/extent_io.c > +++ b/fs/btrfs/extent_io.c > @@ -974,6 +974,10 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, > end_folio_read(folio, true, cur, iosize); > break; > } > + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { > + end_folio_read(folio, true, cur, blocksize); > + continue; > + } > em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); > if (IS_ERR(em)) { > end_folio_read(folio, false, cur, end + 1 - cur); > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c > index cd4e40a719186..61ad1a79e5698 100644 > --- a/fs/btrfs/file.c > +++ b/fs/btrfs/file.c > @@ -804,14 +804,15 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 > { > u64 clamp_start = max_t(u64, pos, folio_pos(folio)); > u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); > + const u32 blocksize = inode_to_fs_info(inode)->sectorsize; > int ret = 0; > > if (folio_test_uptodate(folio)) > return 0; > > if (!force_uptodate && > - IS_ALIGNED(clamp_start, PAGE_SIZE) && > - IS_ALIGNED(clamp_end, PAGE_SIZE)) > + IS_ALIGNED(clamp_start, blocksize) && > + IS_ALIGNED(clamp_end, blocksize)) > return 0; > > ret = btrfs_read_folio(NULL, folio);

1 month, 3 weeks

2
1
0 0

[PATCH v4] net/mlx5: Add error handling in mlx5_query_nic_vport_node_guid()

by Wentao Liang

The function mlx5_query_nic_vport_node_guid() calls the function mlx5_query_nic_vport_context() but does not check its return value. A proper implementation can be found in mlx5_nic_vport_query_local_lb(). Add error handling for mlx5_query_nic_vport_context(). If it fails, free the out buffer via kvfree() and return error code. Fixes: 9efa75254593 ("net/mlx5_core: Introduce access functions to query vport RoCE fields") Cc: stable(a)vger.kernel.org # v4.5 Target: net Signed-off-by: Wentao Liang <vulab(a)iscas.ac.cn> --- v4: Fix code error. v3: Explicitly mention target branch. Change improper code. v2: Remove redundant reassignment. Fix typo error. drivers/net/ethernet/mellanox/mlx5/core/vport.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 0d5f750faa45..c34cd9a1a79b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -465,19 +465,22 @@ int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) { u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + int err; out = kvzalloc(outlen, GFP_KERNEL); if (!out) return -ENOMEM; - mlx5_query_nic_vport_context(mdev, 0, out); + err = mlx5_query_nic_vport_context(mdev, 0, out); + if (err) + goto out; *node_guid = MLX5_GET64(query_nic_vport_context_out, out, nic_vport_context.node_guid); - +out: kvfree(out); - return 0; + return err; } EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid); -- 2.42.0.windows.2

1 month, 3 weeks

2
1
0 0

[PATCH 1/2] block: Make __submit_bio_noacct() preserve the bio submission order

by Bart Van Assche

submit_bio() may be called recursively. To limit the stack depth, recursive calls result in bios being added to a list (current->bio_list). __submit_bio_noacct() sets up that list and maintains two lists with requests: * bio_list_on_stack[0] is the list with bios submitted by recursive submit_bio() calls from inside the latest __submit_bio() call. * bio_list_on_stack[1] is the list with bios submitted by recursive submit_bio() calls from inside previous __submit_bio() calls. Make sure that bios are submitted to lower devices in the order these have been submitted by submit_bio() by adding new bios at the end of the list instead of at the front. This patch fixes unaligned write errors that I encountered with F2FS submitting zoned writes to a dm driver stacked on top of a zoned UFS device. Cc: Christoph Hellwig <hch(a)lst.de> Cc: Damien Le Moal <dlemoal(a)kernel.org> Cc: Yu Kuai <yukuai1(a)huaweicloud.com> Cc: Ming Lei <ming.lei(a)redhat.com> Cc: stable(a)vger.kernel.org Signed-off-by: Bart Van Assche <bvanassche(a)acm.org> --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index b862c66018f2..4b728fa1c138 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -704,9 +704,9 @@ static void __submit_bio_noacct(struct bio *bio) /* * Now assemble so we handle the lowest level first. */ + bio_list_on_stack[0] = bio_list_on_stack[1]; bio_list_merge(&bio_list_on_stack[0], &lower); bio_list_merge(&bio_list_on_stack[0], &same); - bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL;

1 month, 3 weeks

3
9
0 0

Re: Patch "f2fs: defer readonly check vs norecovery" has been added to the 6.14-stable tree

by Eric Sandeen

On 5/22/25 4:10 PM, Sasha Levin wrote: > This is a note to let you know that I've just added the patch titled > > f2fs: defer readonly check vs norecovery > > to the 6.14-stable tree which can be found at: > http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… > > The filename of the patch is: > f2fs-defer-readonly-check-vs-norecovery.patch > and it can be found in the queue-6.14 subdirectory. > > If you, or anyone else, feels it should not be added to the stable tree, > please let <stable(a)vger.kernel.org> know about it. I already replied to the AUTOSEL email on 5/5 saying that this is not a bug fix and should not be in the stable tree, but here we are. > commit 442e4090bb78d5dce4506a591214ce2447d6ea50 > Author: Eric Sandeen <sandeen(a)redhat.com> > Date: Mon Mar 3 11:12:17 2025 -0600 > > f2fs: defer readonly check vs norecovery > > [ Upstream commit 9cca49875997a1a7e92800a828a62bacb0f577b9 ] > > Defer the readonly-vs-norecovery check until after option parsing is done > so that option parsing does not require an active superblock for the test. > Add a helpful message, while we're at it. > > (I think could be moved back into parsing after we switch to the new mount > API if desired, as the fs context will have RO state available.) > > Signed-off-by: Eric Sandeen <sandeen(a)redhat.com> > Reviewed-by: Chao Yu <chao(a)kernel.org> > Signed-off-by: Jaegeuk Kim <jaegeuk(a)kernel.org> > Signed-off-by: Sasha Levin <sashal(a)kernel.org> > > diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c > index b8a0e925a4011..d3b04a589b525 100644 > --- a/fs/f2fs/super.c > +++ b/fs/f2fs/super.c > @@ -728,10 +728,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) > set_opt(sbi, DISABLE_ROLL_FORWARD); > break; > case Opt_norecovery: > - /* this option mounts f2fs with ro */ > + /* requires ro mount, checked in f2fs_default_check */ > set_opt(sbi, NORECOVERY); > - if (!f2fs_readonly(sb)) > - return -EINVAL; > break; > case Opt_discard: > if (!f2fs_hw_support_discard(sbi)) { > @@ -1418,6 +1416,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) > f2fs_err(sbi, "Allow to mount readonly mode only"); > return -EROFS; > } > + > + if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) { > + f2fs_err(sbi, "norecovery requires readonly mount"); > + return -EINVAL; > + } > + > return 0; > } > >

1 month, 3 weeks

1
0
0 0

Re: [PATCH 6.6 005/568] md: fix deadlock between mddev_suspend and flush bio

by Andrew Kanner

> [...] > > Additionally, the only difference between fixing the issue and before is > that there is no return error handling of make_request(). But after > previous patch cleaned md_write_start(), make_requst() only return error > in raid5_make_request() by dm-raid, see commit 41425f96d7aa ("dm-raid456, > md/raid456: fix a deadlock for dm-raid456 while io concurrent with > reshape)". Since dm always splits data and flush operation into two > separate io, io size of flush submitted by dm always is 0, make_request() > will not be called in md_submit_flush_data(). To prevent future > modifications from introducing issues, add WARN_ON to ensure > make_request() no error is returned in this context. > > [...] > @@ -560,8 +552,20 @@ static void md_submit_flush_data(struct work_struct *ws) > bio_endio(bio); > } else { > bio->bi_opf &= ~REQ_PREFLUSH; > - md_handle_request(mddev, bio); > + > + /* > + * make_requst() will never return error here, it only > + * returns error in raid5_make_request() by dm-raid. > + * Since dm always splits data and flush operation into > + * two separate io, io size of flush submitted by dm > + * always is 0, make_request() will not be called here. > + */ > + if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) > + bio_io_error(bio);; > } Hello, It looks we can hit this WARN_ON_ONCE() after which rootfs is switching to read-only: May 20 15:13:35 hostname kernel: WARNING: CPU: 35 PID: 1517323 at drivers/md/md.c:621 md_submit_flush_data+0x9b/0xe0 ... May 20 15:13:35 hostname kernel: XFS (md125): log I/O error -5 May 20 15:13:35 hostname kernel: XFS (md125): Filesystem has been shut down due to log error (0x2). May 20 15:13:35 hostname kernel: XFS (md125): Please unmount the filesystem and rectify the problem(s). Can you double check if the following regression is actual? Since both stable/linux-6.1.y and stable/linux-6.6.y branches don't have b75197e86e6d ("md: Remove flush handling") there is a minor issue with this backport. Statement "previous patch cleaned md_write_start(), make_requst() only return error in raid5_make_request() by dm-raid" will not work for both branches since 03e792eaf18e ("md: change the return value type of md_write_start to void") was not backported. So we should either backport it, or do error handling, not the WARN_ON_ONCE(). -- Andrew Kanner

1 month, 3 weeks

1
0
0 0

Re: Patch "dm vdo vio-pool: allow variable-sized metadata vios" has been added to the 6.12-stable tree

by Matthew Sakai

On 5/22/25 6:31 PM, Sasha Levin wrote: > This is a note to let you know that I've just added the patch titled > > dm vdo vio-pool: allow variable-sized metadata vios > > to the 6.12-stable tree which can be found at: > http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… > > The filename of the patch is: > dm-vdo-vio-pool-allow-variable-sized-metadata-vios.patch > and it can be found in the queue-6.12 subdirectory. > > If you, or anyone else, feels it should not be added to the stable tree, > please let <stable(a)vger.kernel.org> know about it. There is no reason to pull this patch into 6.12 since there are no features in 6.12 that would use it. Matt > > commit ac663217ac4eeab508db348a67511d45ccd9846f > Author: Ken Raeburn <raeburn(a)redhat.com> > Date: Fri Jan 31 21:18:05 2025 -0500 > > dm vdo vio-pool: allow variable-sized metadata vios > > [ Upstream commit f979da512553a41a657f2c1198277e84d66f8ce3 ] > > With larger-sized metadata vio pools, vdo will sometimes need to > issue I/O with a smaller size than the allocated size. Since > vio_reset_bio is where the bvec array and I/O size are initialized, > this reset interface must now specify what I/O size to use. > > Signed-off-by: Ken Raeburn <raeburn(a)redhat.com> > Signed-off-by: Matthew Sakai <msakai(a)redhat.com> > Signed-off-by: Mikulas Patocka <mpatocka(a)redhat.com> > Signed-off-by: Sasha Levin <sashal(a)kernel.org> > > diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c > index ab62abe18827b..a664be89c15d7 100644 > --- a/drivers/md/dm-vdo/io-submitter.c > +++ b/drivers/md/dm-vdo/io-submitter.c > @@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) > * @error_handler: the handler for submission or I/O errors (may be NULL) > * @operation: the type of I/O to perform > * @data: the buffer to read or write (may be NULL) > + * @size: the I/O amount in bytes > * > * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block > * other vdo threads. > @@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) > */ > void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, > bio_end_io_t callback, vdo_action_fn error_handler, > - blk_opf_t operation, char *data) > + blk_opf_t operation, char *data, int size) > { > int result; > struct vdo_completion *completion = &vio->completion; > @@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, > > vdo_reset_completion(completion); > completion->error_handler = error_handler; > - result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical); > + result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META, > + physical); > if (result != VDO_SUCCESS) { > continue_vio(vio, result); > return; > diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h > index 80748699496f2..3088f11055fdd 100644 > --- a/drivers/md/dm-vdo/io-submitter.h > +++ b/drivers/md/dm-vdo/io-submitter.h > @@ -8,6 +8,7 @@ > > #include <linux/bio.h> > > +#include "constants.h" > #include "types.h" > > struct io_submitter; > @@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio); > > void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, > bio_end_io_t callback, vdo_action_fn error_handler, > - blk_opf_t operation, char *data); > + blk_opf_t operation, char *data, int size); > > static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical, > bio_end_io_t callback, vdo_action_fn error_handler, > blk_opf_t operation) > { > __submit_metadata_vio(vio, physical, callback, error_handler, > - operation, vio->data); > + operation, vio->data, vio->block_count * VDO_BLOCK_SIZE); > +} > + > +static inline void vdo_submit_metadata_vio_with_size(struct vio *vio, > + physical_block_number_t physical, > + bio_end_io_t callback, > + vdo_action_fn error_handler, > + blk_opf_t operation, > + int size) > +{ > + __submit_metadata_vio(vio, physical, callback, error_handler, > + operation, vio->data, size); > } > > static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, > @@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, > { > /* FIXME: Can we just use REQ_OP_FLUSH? */ > __submit_metadata_vio(vio, 0, callback, error_handler, > - REQ_OP_WRITE | REQ_PREFLUSH, NULL); > + REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); > } > > #endif /* VDO_IO_SUBMITTER_H */ > diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h > index dbe892b10f265..cdf36e7d77021 100644 > --- a/drivers/md/dm-vdo/types.h > +++ b/drivers/md/dm-vdo/types.h > @@ -376,6 +376,9 @@ struct vio { > /* The size of this vio in blocks */ > unsigned int block_count; > > + /* The amount of data to be read or written, in bytes */ > + unsigned int io_size; > + > /* The data being read or written. */ > char *data; > > diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c > index b291578f726f5..7c417c1af4516 100644 > --- a/drivers/md/dm-vdo/vio.c > +++ b/drivers/md/dm-vdo/vio.c > @@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb > > /* > * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated > - * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a > - * vio associated with the bio. > + * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not > + * have to be a vio associated with the bio. > */ > int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, > blk_opf_t bi_opf, physical_block_number_t pbn) > { > - int bvec_count, offset, len, i; > + return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE, > + callback, bi_opf, pbn); > +} > + > +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, > + blk_opf_t bi_opf, physical_block_number_t pbn) > +{ > + int bvec_count, offset, i; > struct bio *bio = vio->bio; > + int vio_size = vio->block_count * VDO_BLOCK_SIZE; > + int remaining; > > bio_reset(bio, bio->bi_bdev, bi_opf); > vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); > @@ -204,22 +213,21 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, > > bio->bi_io_vec = bio->bi_inline_vecs; > bio->bi_max_vecs = vio->block_count + 1; > - len = VDO_BLOCK_SIZE * vio->block_count; > + if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", > + size, vio_size) != VDO_SUCCESS) > + size = vio_size; > + vio->io_size = size; > offset = offset_in_page(data); > - bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE); > + bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE); > + remaining = size; > > - /* > - * If we knew that data was always on one page, or contiguous pages, we wouldn't need the > - * loop. But if we're using vmalloc, it's not impossible that the data is in different > - * pages that can't be merged in bio_add_page... > - */ > - for (i = 0; (i < bvec_count) && (len > 0); i++) { > + for (i = 0; (i < bvec_count) && (remaining > 0); i++) { > struct page *page; > int bytes_added; > int bytes = PAGE_SIZE - offset; > > - if (bytes > len) > - bytes = len; > + if (bytes > remaining) > + bytes = remaining; > > page = is_vmalloc_addr(data) ? vmalloc_to_page(data) : virt_to_page(data); > bytes_added = bio_add_page(bio, page, bytes, offset); > @@ -231,7 +239,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, > } > > data += bytes; > - len -= bytes; > + remaining -= bytes; > offset = 0; > } > > diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h > index 3490e9f59b04a..74e8fd7c8c029 100644 > --- a/drivers/md/dm-vdo/vio.h > +++ b/drivers/md/dm-vdo/vio.h > @@ -123,6 +123,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb > > int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, > blk_opf_t bi_opf, physical_block_number_t pbn); > +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, > + blk_opf_t bi_opf, physical_block_number_t pbn); > > void update_vio_error_stats(struct vio *vio, const char *format, ...) > __printf(2, 3); >

1 month, 3 weeks

1
0
0 0

Re: Patch "btrfs: prevent inline data extents read from touching blocks beyond its range" has been added to the 6.12-stable tree

by Qu Wenruo

在 2025/5/23 07:31, Sasha Levin 写道: > This is a note to let you know that I've just added the patch titled > > btrfs: prevent inline data extents read from touching blocks beyond its range > > to the 6.12-stable tree which can be found at: > http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum… > > The filename of the patch is: > btrfs-prevent-inline-data-extents-read-from-touching.patch > and it can be found in the queue-6.12 subdirectory. > > If you, or anyone else, feels it should not be added to the stable tree, > please let <stable(a)vger.kernel.org> know about it. Please drop this one from all stable trees. Although the patch won't cause any behavior change, the main reason for this patch is to prepare for the subpage optimization (and future large folios support). Thanks, Qu > > > > commit 98504dd74a2688ff63dba6bf1d9f8abc7f0b322e > Author: Qu Wenruo <wqu(a)suse.com> > Date: Fri Nov 15 19:15:34 2024 +1030 > > btrfs: prevent inline data extents read from touching blocks beyond its range > > [ Upstream commit 1a5b5668d711d3d1ef447446beab920826decec3 ] > > Currently reading an inline data extent will zero out the remaining > range in the page. > > This is not yet causing problems even for block size < page size > (subpage) cases because: > > 1) An inline data extent always starts at file offset 0 > Meaning at page read, we always read the inline extent first, before > any other blocks in the page. Then later blocks are properly read out > and re-fill the zeroed out ranges. > > 2) Currently btrfs will read out the whole page if a buffered write is > not page aligned > So a page is either fully uptodate at buffered write time (covers the > whole page), or we will read out the whole page first. > Meaning there is nothing to lose for such an inline extent read. > > But it's still not ideal: > > - We're zeroing out the page twice > Once done by read_inline_extent()/uncompress_inline(), once done by > btrfs_do_readpage() for ranges beyond i_size. > > - We're touching blocks that don't belong to the inline extent > In the incoming patches, we can have a partial uptodate folio, of > which some dirty blocks can exist while the page is not fully uptodate: > > The page size is 16K and block size is 4K: > > 0 4K 8K 12K 16K > | | |/////////| | > > And range [8K, 12K) is dirtied by a buffered write, the remaining > blocks are not uptodate. > > If range [0, 4K) contains an inline data extent, and we try to read > the whole page, the current behavior will overwrite range [8K, 12K) > with zero and cause data loss. > > So to make the behavior more consistent and in preparation for future > changes, limit the inline data extents read to only zero out the range > inside the first block, not the whole page. > > Reviewed-by: Filipe Manana <fdmanana(a)suse.com> > Signed-off-by: Qu Wenruo <wqu(a)suse.com> > Signed-off-by: David Sterba <dsterba(a)suse.com> > Signed-off-by: Sasha Levin <sashal(a)kernel.org> > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 0da2611fb9c85..ee8c18d298758 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -6825,6 +6825,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, > { > int ret; > struct extent_buffer *leaf = path->nodes[0]; > + const u32 blocksize = leaf->fs_info->sectorsize; > char *tmp; > size_t max_size; > unsigned long inline_size; > @@ -6841,7 +6842,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, > > read_extent_buffer(leaf, tmp, ptr, inline_size); > > - max_size = min_t(unsigned long, PAGE_SIZE, max_size); > + max_size = min_t(unsigned long, blocksize, max_size); > ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, > max_size); > > @@ -6853,8 +6854,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, > * cover that region here. > */ > > - if (max_size < PAGE_SIZE) > - folio_zero_range(folio, max_size, PAGE_SIZE - max_size); > + if (max_size < blocksize) > + folio_zero_range(folio, max_size, blocksize - max_size); > kfree(tmp); > return ret; > } > @@ -6862,6 +6863,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, > static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, > struct folio *folio) > { > + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; > struct btrfs_file_extent_item *fi; > void *kaddr; > size_t copy_size; > @@ -6876,14 +6878,14 @@ static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path > if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) > return uncompress_inline(path, folio, fi); > > - copy_size = min_t(u64, PAGE_SIZE, > + copy_size = min_t(u64, blocksize, > btrfs_file_extent_ram_bytes(path->nodes[0], fi)); > kaddr = kmap_local_folio(folio, 0); > read_extent_buffer(path->nodes[0], kaddr, > btrfs_file_extent_inline_start(fi), copy_size); > kunmap_local(kaddr); > - if (copy_size < PAGE_SIZE) > - folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); > + if (copy_size < blocksize) > + folio_zero_range(folio, copy_size, blocksize - copy_size); > return 0; > } >

1 month, 3 weeks

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror