Gao Xiang has reported that on ext4, O_SYNC direct IO does not properly
sync the file size update and thus, if we crash at an unfortunate moment,
the file can have a smaller size although the O_SYNC IO has reported
successful completion. The problem happens because the update of the
on-disk inode size is handled in ext4_dio_write_iter() *after*
iomap_dio_rw() (and thus iomap_dio_complete() in particular) has returned
and generic_write_sync() has already been called by iomap_dio_complete().
Fix the problem by handling the on-disk inode size update directly in our
->end_io completion handler.
References: https://lore.kernel.org/all/02d18236-26ef-09b0-90ad-030c4fe3ee20@linux.alib…
Reported-by: Gao Xiang <hsiangkao(a)linux.alibaba.com>
CC: stable(a)vger.kernel.org
Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure")
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/ext4/file.c | 153 +++++++++++++++++++++----------------------------
1 file changed, 65 insertions(+), 88 deletions(-)
Changes since v2:
* Added more comments explaining the code flow
* Added WARN_ON_ONCE to verify extending IO is handled synchronously
Changes since v1:
* Rebased on top of Linus' tree (instead of a tree with iomap cleanup)
* Made ext4_dio_write_end_io() always return number of written bytes on
success for consistency
* Added Fixes tag
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6830ea3a6c59..19d9db4799c4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -306,80 +306,38 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
- ssize_t written, size_t count)
+ ssize_t count)
{
handle_t *handle;
- bool truncate = false;
- u8 blkbits = inode->i_blkbits;
- ext4_lblk_t written_blk, end_blk;
- int ret;
-
- /*
- * Note that EXT4_I(inode)->i_disksize can get extended up to
- * inode->i_size while the I/O was running due to writeback of delalloc
- * blocks. But, the code in ext4_iomap_alloc() is careful to use
- * zeroed/unwritten extents if this is possible; thus we won't leave
- * uninitialized blocks in a file even if we didn't succeed in writing
- * as much as we intended.
- */
- WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
- if (offset + count <= EXT4_I(inode)->i_disksize) {
- /*
- * We need to ensure that the inode is removed from the orphan
- * list if it has been added prematurely, due to writeback of
- * delalloc blocks.
- */
- if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
- return PTR_ERR(handle);
- }
-
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- }
-
- return written;
- }
-
- if (written < 0)
- goto truncate;
+ lockdep_assert_held_write(&inode->i_rwsem);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- written = PTR_ERR(handle);
- goto truncate;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- if (ext4_update_inode_size(inode, offset + written)) {
- ret = ext4_mark_inode_dirty(handle, inode);
+ if (ext4_update_inode_size(inode, offset + count)) {
+ int ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) {
- written = ret;
ext4_journal_stop(handle);
- goto truncate;
+ return ret;
}
}
- /*
- * We may need to truncate allocated but not written blocks beyond EOF.
- */
- written_blk = ALIGN(offset + written, 1 << blkbits);
- end_blk = ALIGN(offset + count, 1 << blkbits);
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
-
- /*
- * Remove the inode from the orphan list if it has been extended and
- * everything went OK.
- */
- if (!truncate && inode->i_nlink)
+ if (inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
- if (truncate) {
-truncate:
+ return count;
+}
+
+/*
+ * Clean up the inode after DIO or DAX extending write has completed and the
+ * inode size has been updated using ext4_handle_inode_extension().
+ */
+static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
+{
+ lockdep_assert_held_write(&inode->i_rwsem);
+ if (count < 0) {
ext4_truncate_failed_write(inode);
/*
* If the truncate operation failed early, then the inode may
@@ -388,9 +346,28 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
*/
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
+ return;
}
+ /*
+ * If i_disksize got extended due to writeback of delalloc blocks while
+ * the DIO was running we could fail to cleanup the orphan list in
+ * ext4_handle_inode_extension(). Do it now.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- return written;
+ if (IS_ERR(handle)) {
+ /*
+ * The write has successfully completed. Not much to
+ * do with the error here so just cleanup the orphan
+ * list and hope for the best.
+ */
+ ext4_orphan_del(NULL, inode);
+ return;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
}
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
@@ -399,31 +376,22 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
+ if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
-
- if (size && flags & IOMAP_DIO_UNWRITTEN) {
- error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
- if (error < 0)
- return error;
- }
/*
- * If we are extending the file, we have to update i_size here before
- * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
- * buffered reads could zero out too much from page cache pages. Update
- * of on-disk size will happen later in ext4_dio_write_iter() where
- * we have enough information to also perform orphan list handling etc.
- * Note that we perform all extending writes synchronously under
- * i_rwsem held exclusively so i_size update is safe here in that case.
- * If the write was not extending, we cannot see pos > i_size here
- * because operations reducing i_size like truncate wait for all
- * outstanding DIO before updating i_size.
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended.
*/
- pos += size;
- if (pos > i_size_read(inode))
- i_size_write(inode, pos);
-
- return 0;
+ WARN_ON_ONCE(i_size_read(inode) < READ_ONCE(EXT4_I(inode)->i_disksize));
+ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize))
+ return size;
+ return ext4_handle_inode_extension(inode, pos, size);
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -606,9 +574,16 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
dio_flags, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
-
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ if (extend) {
+ /*
+ * We always perform extending DIO write synchronously so by
+ * now the IO is completed and ext4_handle_inode_extension()
+ * was called. Cleanup the inode in case of error or race with
+ * writeback of delalloc blocks.
+ */
+ WARN_ON_ONCE(ret == -EIOCBQUEUED);
+ ext4_inode_extension_cleanup(inode, ret);
+ }
out:
if (ilock_shared)
@@ -689,8 +664,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ if (extend) {
+ ret = ext4_handle_inode_extension(inode, offset, ret);
+ ext4_inode_extension_cleanup(inode, ret);
+ }
out:
inode_unlock(inode);
if (ret > 0)
--
2.35.3
stable-rc/linux-5.4.y build: 17 builds: 2 failed, 15 passed, 8 errors, 30 warnings (v5.4.258-119-g9842aef4b12b)
Full Build Summary: https://kernelci.org/build/stable-rc/branch/linux-5.4.y/kernel/v5.4.258-119…
Tree: stable-rc
Branch: linux-5.4.y
Git Describe: v5.4.258-119-g9842aef4b12b
Git Commit: 9842aef4b12b300a40f0bc2d408313e89a790d20
Git URL: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Built: 7 unique architectures
Build Failures Detected:
arm:
imx_v6_v7_defconfig: (gcc-10) FAIL
multi_v7_defconfig: (gcc-10) FAIL
Errors and Warnings Detected:
arc:
arm64:
defconfig (gcc-10): 2 warnings
defconfig+arm64-chromebook (gcc-10): 2 warnings
arm:
imx_v6_v7_defconfig (gcc-10): 4 errors, 2 warnings
multi_v7_defconfig (gcc-10): 4 errors, 2 warnings
i386:
allnoconfig (gcc-10): 2 warnings
i386_defconfig (gcc-10): 2 warnings
tinyconfig (gcc-10): 2 warnings
mips:
riscv:
x86_64:
allnoconfig (gcc-10): 4 warnings
tinyconfig (gcc-10): 4 warnings
x86_64_defconfig (gcc-10): 4 warnings
x86_64_defconfig+x86-chromebook (gcc-10): 4 warnings
Errors summary:
2 drivers/gpio/gpio-vf610.c:340:2: error: implicit declaration of function ‘gpio_irq_chip_set_chip’ [-Werror=implicit-function-declaration]
2 drivers/gpio/gpio-vf610.c:251:2: error: ‘GPIOCHIP_IRQ_RESOURCE_HELPERS’ undeclared here (not in a function)
2 drivers/gpio/gpio-vf610.c:250:6: error: ‘IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND’ undeclared here (not in a function); did you mean ‘IRQCHIP_MASK_ON_SUSPEND’?
2 drivers/gpio/gpio-vf610.c:249:11: error: ‘IRQCHIP_IMMUTABLE’ undeclared here (not in a function); did you mean ‘IS_IMMUTABLE’?
Warnings summary:
7 ld: warning: creating DT_TEXTREL in a PIE
4 ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
4 arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
3 ld: arch/x86/boot/compressed/head_32.o: warning: relocation in read-only section `.head.text'
2 drivers/gpio/gpio-vf610.c:251:2: warning: excess elements in struct initializer
2 cc1: some warnings being treated as errors
2 arch/x86/entry/entry_64.o: warning: objtool: If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.
2 arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x1c1: unsupported intra-function call
2 arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x151: unsupported intra-function call
2 arch/x86/entry/entry_64.S:1756: Warning: no instruction mnemonic suffix given and no register operands; using default for `sysret'
Section mismatches summary:
1 WARNING: vmlinux.o(___ksymtab_gpl+vic_init_cascaded+0x0): Section mismatch in reference from the variable __ksymtab_vic_init_cascaded to the function .init.text:vic_init_cascaded()
================================================================================
Detailed per-defconfig build reports:
--------------------------------------------------------------------------------
32r2el_defconfig (mips, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
allnoconfig (i386, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
ld: arch/x86/boot/compressed/head_32.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
allnoconfig (x86_64, gcc-10) — PASS, 0 errors, 4 warnings, 0 section mismatches
Warnings:
arch/x86/entry/entry_64.S:1756: Warning: no instruction mnemonic suffix given and no register operands; using default for `sysret'
arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x151: unsupported intra-function call
ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
defconfig (riscv, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
defconfig (arm64, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
--------------------------------------------------------------------------------
defconfig+arm64-chromebook (arm64, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
arch/arm64/include/asm/memory.h:238:15: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
--------------------------------------------------------------------------------
haps_hs_smp_defconfig (arc, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
i386_defconfig (i386, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
ld: arch/x86/boot/compressed/head_32.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
imx_v6_v7_defconfig (arm, gcc-10) — FAIL, 4 errors, 2 warnings, 0 section mismatches
Errors:
drivers/gpio/gpio-vf610.c:249:11: error: ‘IRQCHIP_IMMUTABLE’ undeclared here (not in a function); did you mean ‘IS_IMMUTABLE’?
drivers/gpio/gpio-vf610.c:250:6: error: ‘IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND’ undeclared here (not in a function); did you mean ‘IRQCHIP_MASK_ON_SUSPEND’?
drivers/gpio/gpio-vf610.c:251:2: error: ‘GPIOCHIP_IRQ_RESOURCE_HELPERS’ undeclared here (not in a function)
drivers/gpio/gpio-vf610.c:340:2: error: implicit declaration of function ‘gpio_irq_chip_set_chip’ [-Werror=implicit-function-declaration]
Warnings:
drivers/gpio/gpio-vf610.c:251:2: warning: excess elements in struct initializer
cc1: some warnings being treated as errors
--------------------------------------------------------------------------------
multi_v5_defconfig (arm, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
Section mismatches:
WARNING: vmlinux.o(___ksymtab_gpl+vic_init_cascaded+0x0): Section mismatch in reference from the variable __ksymtab_vic_init_cascaded to the function .init.text:vic_init_cascaded()
--------------------------------------------------------------------------------
multi_v7_defconfig (arm, gcc-10) — FAIL, 4 errors, 2 warnings, 0 section mismatches
Errors:
drivers/gpio/gpio-vf610.c:249:11: error: ‘IRQCHIP_IMMUTABLE’ undeclared here (not in a function); did you mean ‘IS_IMMUTABLE’?
drivers/gpio/gpio-vf610.c:250:6: error: ‘IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND’ undeclared here (not in a function); did you mean ‘IRQCHIP_MASK_ON_SUSPEND’?
drivers/gpio/gpio-vf610.c:251:2: error: ‘GPIOCHIP_IRQ_RESOURCE_HELPERS’ undeclared here (not in a function)
drivers/gpio/gpio-vf610.c:340:2: error: implicit declaration of function ‘gpio_irq_chip_set_chip’ [-Werror=implicit-function-declaration]
Warnings:
drivers/gpio/gpio-vf610.c:251:2: warning: excess elements in struct initializer
cc1: some warnings being treated as errors
--------------------------------------------------------------------------------
omap2plus_defconfig (arm, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
tinyconfig (i386, gcc-10) — PASS, 0 errors, 2 warnings, 0 section mismatches
Warnings:
ld: arch/x86/boot/compressed/head_32.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
tinyconfig (x86_64, gcc-10) — PASS, 0 errors, 4 warnings, 0 section mismatches
Warnings:
arch/x86/entry/entry_64.S:1756: Warning: no instruction mnemonic suffix given and no register operands; using default for `sysret'
arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x151: unsupported intra-function call
ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
vexpress_defconfig (arm, gcc-10) — PASS, 0 errors, 0 warnings, 0 section mismatches
--------------------------------------------------------------------------------
x86_64_defconfig (x86_64, gcc-10) — PASS, 0 errors, 4 warnings, 0 section mismatches
Warnings:
arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x1c1: unsupported intra-function call
arch/x86/entry/entry_64.o: warning: objtool: If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.
ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
--------------------------------------------------------------------------------
x86_64_defconfig+x86-chromebook (x86_64, gcc-10) — PASS, 0 errors, 4 warnings, 0 section mismatches
Warnings:
arch/x86/entry/entry_64.o: warning: objtool: .entry.text+0x1c1: unsupported intra-function call
arch/x86/entry/entry_64.o: warning: objtool: If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.
ld: arch/x86/boot/compressed/head_64.o: warning: relocation in read-only section `.head.text'
ld: warning: creating DT_TEXTREL in a PIE
---
For more info write to <info(a)kernelci.org>
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x b134a5805455d1886662a6516c965cdb9df9fbcc
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023102035-citation-buddhist-8a5d@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b134a5805455d1886662a6516c965cdb9df9fbcc Mon Sep 17 00:00:00 2001
From: Matthieu Baerts <matttbe(a)kernel.org>
Date: Wed, 18 Oct 2023 11:23:52 -0700
Subject: [PATCH] selftests: mptcp: join: correctly check for no RST
The commit mentioned below was more tolerant of the number of RSTs seen
during a test because, in some uncontrollable situations, multiple RSTs
can be generated.
But it was not taking into account the case where no RSTs are expected:
this validation was then no longer reporting issues for the 0 RST case
because it is not possible to have fewer than 0 RSTs in the counter. This
patch fixes the issue by adding a specific condition.
Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases")
Cc: stable(a)vger.kernel.org
Reviewed-by: Mat Martineau <martineau(a)kernel.org>
Signed-off-by: Matthieu Baerts <matttbe(a)kernel.org>
Signed-off-by: Mat Martineau <martineau(a)kernel.org>
Link: https://lore.kernel.org/r/20231018-send-net-20231018-v1-1-17ecb002e41d@kern…
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index ee1f89a872b3..27953670206e 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -1432,7 +1432,9 @@ chk_rst_nr()
count=$(get_counter ${ns_tx} "MPTcpExtMPRstTx")
if [ -z "$count" ]; then
print_skip
- elif [ $count -lt $rst_tx ]; then
+ # accept more rst than expected except if we don't expect any
+ elif { [ $rst_tx -ne 0 ] && [ $count -lt $rst_tx ]; } ||
+ { [ $rst_tx -eq 0 ] && [ $count -ne 0 ]; }; then
fail_test "got $count MP_RST[s] TX expected $rst_tx"
else
print_ok
@@ -1442,7 +1444,9 @@ chk_rst_nr()
count=$(get_counter ${ns_rx} "MPTcpExtMPRstRx")
if [ -z "$count" ]; then
print_skip
- elif [ "$count" -lt "$rst_rx" ]; then
+ # accept more rst than expected except if we don't expect any
+ elif { [ $rst_rx -ne 0 ] && [ $count -lt $rst_rx ]; } ||
+ { [ $rst_rx -eq 0 ] && [ $count -ne 0 ]; }; then
fail_test "got $count MP_RST[s] RX expected $rst_rx"
else
print_ok