Gao Xiang has reported that on ext4 O_SYNC direct IO does not properly
sync file size update and thus if we crash at unfortunate moment, the
file can have smaller size although O_SYNC IO has reported successful
completion. The problem happens because update of on-disk inode size is
handled in ext4_dio_write_iter() *after* iomap_dio_rw() (and thus
dio_complete() in particular) has returned and generic_file_sync() gets
called by dio_complete(). Fix the problem by handling on-disk inode size
update directly in our ->end_io completion handler.
References: https://lore.kernel.org/all/02d18236-26ef-09b0-90ad-030c4fe3ee20@linux.alib…
Reported-by: Gao Xiang <hsiangkao(a)linux.alibaba.com>
CC: stable(a)vger.kernel.org
Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure")
Signed-off-by: Jan Kara <jack(a)suse.cz>
---
fs/ext4/file.c | 139 ++++++++++++++++++-------------------------------
1 file changed, 52 insertions(+), 87 deletions(-)
Changes since v1:
* Rebased on top of Linus' tree (instead of a tree with iomap cleanup)
* Made ext4_dio_write_end_io() always return number of written bytes on
success for consistency
* Added Fixes tag
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6830ea3a6c59..40066a5db8e8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -306,80 +306,34 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
}
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
- ssize_t written, size_t count)
+ ssize_t count)
{
handle_t *handle;
- bool truncate = false;
- u8 blkbits = inode->i_blkbits;
- ext4_lblk_t written_blk, end_blk;
- int ret;
-
- /*
- * Note that EXT4_I(inode)->i_disksize can get extended up to
- * inode->i_size while the I/O was running due to writeback of delalloc
- * blocks. But, the code in ext4_iomap_alloc() is careful to use
- * zeroed/unwritten extents if this is possible; thus we won't leave
- * uninitialized blocks in a file even if we didn't succeed in writing
- * as much as we intended.
- */
- WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
- if (offset + count <= EXT4_I(inode)->i_disksize) {
- /*
- * We need to ensure that the inode is removed from the orphan
- * list if it has been added prematurely, due to writeback of
- * delalloc blocks.
- */
- if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-
- if (IS_ERR(handle)) {
- ext4_orphan_del(NULL, inode);
- return PTR_ERR(handle);
- }
-
- ext4_orphan_del(handle, inode);
- ext4_journal_stop(handle);
- }
-
- return written;
- }
-
- if (written < 0)
- goto truncate;
+ lockdep_assert_held_write(&inode->i_rwsem);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle)) {
- written = PTR_ERR(handle);
- goto truncate;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- if (ext4_update_inode_size(inode, offset + written)) {
- ret = ext4_mark_inode_dirty(handle, inode);
+ if (ext4_update_inode_size(inode, offset + count)) {
+ int ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret)) {
- written = ret;
ext4_journal_stop(handle);
- goto truncate;
+ return ret;
}
}
- /*
- * We may need to truncate allocated but not written blocks beyond EOF.
- */
- written_blk = ALIGN(offset + written, 1 << blkbits);
- end_blk = ALIGN(offset + count, 1 << blkbits);
- if (written_blk < end_blk && ext4_can_truncate(inode))
- truncate = true;
-
- /*
- * Remove the inode from the orphan list if it has been extended and
- * everything went OK.
- */
- if (!truncate && inode->i_nlink)
+ if (inode->i_nlink)
ext4_orphan_del(handle, inode);
ext4_journal_stop(handle);
- if (truncate) {
-truncate:
+ return count;
+}
+
+static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count)
+{
+ lockdep_assert_held_write(&inode->i_rwsem);
+ if (count < 0) {
ext4_truncate_failed_write(inode);
/*
* If the truncate operation failed early, then the inode may
@@ -388,9 +342,28 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
*/
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
+ return;
}
+ /*
+ * If i_disksize got extended due to writeback of delalloc blocks while
+ * the DIO was running we could fail to cleanup the orphan list in
+ * ext4_handle_inode_extension(). Do it now.
+ */
+ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
+ handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- return written;
+ if (IS_ERR(handle)) {
+ /*
+ * The write has successfully completed. Not much to
+ * do with the error here so just cleanup the orphan
+ * list and hope for the best.
+ */
+ ext4_orphan_del(NULL, inode);
+ return;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ }
}
static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
@@ -399,31 +372,22 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
+ if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+ error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
-
- if (size && flags & IOMAP_DIO_UNWRITTEN) {
- error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
- if (error < 0)
- return error;
- }
/*
- * If we are extending the file, we have to update i_size here before
- * page cache gets invalidated in iomap_dio_rw(). Otherwise racing
- * buffered reads could zero out too much from page cache pages. Update
- * of on-disk size will happen later in ext4_dio_write_iter() where
- * we have enough information to also perform orphan list handling etc.
- * Note that we perform all extending writes synchronously under
- * i_rwsem held exclusively so i_size update is safe here in that case.
- * If the write was not extending, we cannot see pos > i_size here
- * because operations reducing i_size like truncate wait for all
- * outstanding DIO before updating i_size.
+ * Note that EXT4_I(inode)->i_disksize can get extended up to
+ * inode->i_size while the I/O was running due to writeback of delalloc
+ * blocks. But the code in ext4_iomap_alloc() is careful to use
+ * zeroed/unwritten extents if this is possible; thus we won't leave
+ * uninitialized blocks in a file even if we didn't succeed in writing
+ * as much as we intended.
*/
- pos += size;
- if (pos > i_size_read(inode))
- i_size_write(inode, pos);
-
- return 0;
+ WARN_ON_ONCE(i_size_read(inode) < READ_ONCE(EXT4_I(inode)->i_disksize));
+ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize))
+ return size;
+ return ext4_handle_inode_extension(inode, pos, size);
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -606,9 +570,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
dio_flags, NULL, 0);
if (ret == -ENOTBLK)
ret = 0;
-
if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ ext4_inode_extension_cleanup(inode, ret);
out:
if (ilock_shared)
@@ -689,8 +652,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
- if (extend)
- ret = ext4_handle_inode_extension(inode, offset, ret, count);
+ if (extend) {
+ ret = ext4_handle_inode_extension(inode, offset, ret);
+ ext4_inode_extension_cleanup(inode, ret);
+ }
out:
inode_unlock(inode);
if (ret > 0)
--
2.35.3
Fei has reported that KASAN triggers during apply_alternatives() on
5-level paging machine:
BUG: KASAN: out-of-bounds in rcu_is_watching
Read of size 4 at addr ff110003ee6419a0 by task swapper/0/0
...
__asan_load4
rcu_is_watching
trace_hardirqs_on
text_poke_early
apply_alternatives
...
On machines with 5-level paging, cpu_feature_enabled(X86_FEATURE_LA57)
got patched. It includes KASAN code, where KASAN_SHADOW_START depends on
__VIRTUAL_MASK_SHIFT, which is defined with the cpu_feature_enabled().
KASAN gets confused when apply_alternatives() patches the
KASAN_SHADOW_START users. A test patch that makes KASAN_SHADOW_START
static, by replacing __VIRTUAL_MASK_SHIFT with 56, fixes the issue.
Disable KASAN while kernel patches alternatives.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Reported-by: Fei Yang <fei.yang(a)intel.com>
Fixes: 6657fca06e3f ("x86/mm: Allow to boot without LA57 if CONFIG_X86_5LEVEL=y")
Cc: stable(a)vger.kernel.org
---
v3:
- Summarize KASAN splat;
- Update comment in apply_alternatives();
v2:
- Move kasan_disable/_enable_current() to cover whole loop, not only
text_poke_early();
- Adjust commit message.
---
arch/x86/kernel/alternative.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 517ee01503be..73be3931e4f0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -403,6 +403,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
u8 insn_buff[MAX_PATCH_LEN];
DPRINTK(ALT, "alt table %px, -> %px", start, end);
+
+ /*
+ * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using
+ * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here.
+ * During the process, KASAN becomes confused seeing partial LA57
+ * conversion and triggers a false-positive out-of-bound report.
+ *
+ * Disable KASAN until the patching is complete.
+ */
+ kasan_disable_current();
+
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite previously scanned alternative code.
@@ -452,6 +463,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
text_poke_early(instr, insn_buff, insn_buff_sz);
}
+
+ kasan_enable_current();
}
static inline bool is_jcc32(struct insn *insn)
--
2.41.0
Please backport to Linux 5.10.y :
20401d1058f3f841f35a594ac2fc1293710e55b9
ipc: replace costly bailout check in sysvipc_find_ipc()
This should address CVE-2021-3669
https://nvd.nist.gov/vuln/detail/CVE-2021-3669
Thank you