From: "Darrick J. Wong" djwong@kernel.org
[ Upstream commit 2723234923b3294dbcf6019c288c87465e927ed4 ]
Move the code that allocates and frees the buffer cancellation tables used by log recovery into the file that actually uses the tables. This is a precursor to some cleanups and a memory leak fix.
( backport: dependency of 8db074bd84df5ccc88bff3f8f900f66f4b8349fa )
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Dave Chinner david@fromorbit.com Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/libxfs/xfs_log_recover.h | 14 +++++----- fs/xfs/xfs_buf_item_recover.c | 47 +++++++++++++++++++++++++++++++++ fs/xfs/xfs_log_priv.h | 3 --- fs/xfs/xfs_log_recover.c | 32 +++++++--------------- 4 files changed, 64 insertions(+), 32 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index ff69a0000817..b8b65a6e9b1e 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -108,12 +108,6 @@ struct xlog_recover {
#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
-/* - * This is the number of entries in the l_buf_cancel_table used during - * recovery. - */ -#define XLOG_BC_TABLE_SIZE 64 - #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 @@ -126,5 +120,13 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); +void xlog_alloc_buf_cancel_table(struct xlog *log); +void xlog_free_buf_cancel_table(struct xlog *log); + +#ifdef DEBUG +void xlog_check_buf_cancel_table(struct xlog *log); +#else +#define xlog_check_buf_cancel_table(log) do { } while (0) +#endif
#endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index e04e44ef14c6..dc099b2f4984 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -23,6 +23,15 @@ #include "xfs_dir2.h" #include "xfs_quota.h"
+/* + * This is the number of entries in the l_buf_cancel_table used during + * recovery. + */ +#define XLOG_BC_TABLE_SIZE 64 + +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) + /* * This structure is used during recovery to record the buf log items which * have been canceled and should not be replayed. @@ -1003,3 +1012,41 @@ const struct xlog_recover_item_ops xlog_buf_item_ops = { .commit_pass1 = xlog_recover_buf_commit_pass1, .commit_pass2 = xlog_recover_buf_commit_pass2, }; + +#ifdef DEBUG +void +xlog_check_buf_cancel_table( + struct xlog *log) +{ + int i; + + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + ASSERT(list_empty(&log->l_buf_cancel_table[i])); +} +#endif + +void +xlog_alloc_buf_cancel_table( + struct xlog *log) +{ + int i; + + ASSERT(log->l_buf_cancel_table == NULL); + + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), + 0); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); +} + +void +xlog_free_buf_cancel_table( + struct xlog *log) +{ + if (!log->l_buf_cancel_table) + return; + + kmem_free(log->l_buf_cancel_table); + log->l_buf_cancel_table = NULL; +} diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f3d68ca39f45..03393595676f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -454,9 +454,6 @@ struct xlog { struct rw_semaphore l_incompat_users; };
-#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ - ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) - /* * Bits for operational state */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 581aeb288b32..18d8eebc2d44 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3248,7 +3248,7 @@ xlog_do_log_recovery( xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error, i; + int error;
ASSERT(head_blk != tail_blk);
@@ -3256,37 +3256,23 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + xlog_alloc_buf_cancel_table(log);
error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL); - if (error != 0) { - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - return error; - } + if (error != 0) + goto out_cancel; + /* * Then do a second pass to actually recover the items in the log. * When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS2, NULL); -#ifdef DEBUG - if (!error) { - int i; - - for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(list_empty(&log->l_buf_cancel_table[i])); - } -#endif /* DEBUG */ - - kmem_free(log->l_buf_cancel_table); - log->l_buf_cancel_table = NULL; - + if (!error) + xlog_check_buf_cancel_table(log); +out_cancel: + xlog_free_buf_cancel_table(log); return error; }
From: "Darrick J. Wong" djwong@kernel.org
[ Upstream commit 8db074bd84df5ccc88bff3f8f900f66f4b8349fa ]
If log recovery fails, we free the memory used by the buffer cancellation buckets, but we don't actually traverse each bucket list to free the individual xfs_buf_cancel objects. This leads to a memory leak, as reported by kmemleak in xfs/051:
unreferenced object 0xffff888103629560 (size 32): comm "mount", pid 687045, jiffies 4296935916 (age 10.752s) hex dump (first 32 bytes): 08 d3 0a 01 00 00 00 00 08 00 00 00 01 00 00 00 ................ d0 f5 0b 92 81 88 ff ff 80 64 64 25 81 88 ff ff .........dd%.... backtrace: [<ffffffffa0317c83>] kmem_alloc+0x73/0x140 [xfs] [<ffffffffa03234a9>] xlog_recover_buf_commit_pass1+0x139/0x200 [xfs] [<ffffffffa032dc27>] xlog_recover_commit_trans+0x307/0x350 [xfs] [<ffffffffa032df15>] xlog_recovery_process_trans+0xa5/0xe0 [xfs] [<ffffffffa032e12d>] xlog_recover_process_data+0x8d/0x140 [xfs] [<ffffffffa032e49d>] xlog_do_recovery_pass+0x19d/0x740 [xfs] [<ffffffffa032f22d>] xlog_do_log_recovery+0x6d/0x150 [xfs] [<ffffffffa032f343>] xlog_do_recover+0x33/0x1d0 [xfs] [<ffffffffa032faba>] xlog_recover+0xda/0x190 [xfs] [<ffffffffa03194bc>] xfs_log_mount+0x14c/0x360 [xfs] [<ffffffffa030bfed>] xfs_mountfs+0x50d/0xa60 [xfs] [<ffffffffa03124b5>] xfs_fs_fill_super+0x6a5/0x950 [xfs] [<ffffffff812b92a5>] get_tree_bdev+0x175/0x280 [<ffffffff812b7c3a>] vfs_get_tree+0x1a/0x80 [<ffffffff812e366f>] path_mount+0x6ff/0xaa0 [<ffffffff812e3b13>] __x64_sys_mount+0x103/0x140
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Dave Chinner david@fromorbit.com Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_buf_item_recover.c | 13 +++++++++++++ 1 file changed, 13 insertions(+)
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index dc099b2f4984..635f7f8ed9c2 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1044,9 +1044,22 @@ void xlog_free_buf_cancel_table( struct xlog *log) { + int i; + if (!log->l_buf_cancel_table) return;
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) { + struct xfs_buf_cancel *bc; + + while ((bc = list_first_entry_or_null( + &log->l_buf_cancel_table[i], + struct xfs_buf_cancel, bc_list))) { + list_del(&bc->bc_list); + kmem_free(bc); + } + } + kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; }
From: "Darrick J. Wong" djwong@kernel.org
[ Upstream commit 910bbdf2f4d7df46781bc9b723048f5ebed3d0d7 ]
While we're messing around with how recovery allocates and frees the buffer cancellation table, convert the allocation to use kmalloc_array instead of the old kmem_alloc APIs, and make it handle a null return, even though that's not likely.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Dave Chinner david@fromorbit.com Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/libxfs/xfs_log_recover.h | 2 +- fs/xfs/xfs_buf_item_recover.c | 14 ++++++++++---- fs/xfs/xfs_log_recover.c | 4 +++- 3 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index b8b65a6e9b1e..81a065b0b571 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -120,7 +120,7 @@ int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); -void xlog_alloc_buf_cancel_table(struct xlog *log); +int xlog_alloc_buf_cancel_table(struct xlog *log); void xlog_free_buf_cancel_table(struct xlog *log);
#ifdef DEBUG diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 635f7f8ed9c2..31cbe7deebfa 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1025,19 +1025,25 @@ xlog_check_buf_cancel_table( } #endif
-void +int xlog_alloc_buf_cancel_table( struct xlog *log) { + void *p; int i;
ASSERT(log->l_buf_cancel_table == NULL);
- log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(struct list_head), - 0); + p = kmalloc_array(XLOG_BC_TABLE_SIZE, sizeof(struct list_head), + GFP_KERNEL); + if (!p) + return -ENOMEM; + + log->l_buf_cancel_table = p; for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + + return 0; }
void diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 18d8eebc2d44..aeb01d4c0423 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3256,7 +3256,9 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - xlog_alloc_buf_cancel_table(log); + error = xlog_alloc_buf_cancel_table(log); + if (error) + return error;
error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1, NULL);
From: Kaixu Xia kaixuxia@tencent.com
[ Upstream commit 82af88063961da9425924d9aec3fb67a4ebade3e ]
We should use invalidate_lock and XFS_MMAPLOCK_SHARED to check the state of mmap_lock rw_semaphore in xfs_isilocked(), rather than i_rwsem and XFS_IOLOCK_SHARED.
Fixes: 2433480a7e1d ("xfs: Convert to use invalidate_lock") Signed-off-by: Kaixu Xia kaixuxia@tencent.com Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b2ea85318214..df64b902842d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -378,8 +378,8 @@ xfs_isilocked( }
if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, - (lock_flags & XFS_IOLOCK_SHARED)); + return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock, + (lock_flags & XFS_MMAPLOCK_SHARED)); }
if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
From: "Darrick J. Wong" djwong@kernel.org
[ Upstream commit 7561cea5dbb97fecb952548a0fb74fb105bf4664 ]
KASAN reported the following use after free bug when running generic/475:
XFS (dm-0): Mounting V5 Filesystem XFS (dm-0): Starting recovery (logdev: internal) XFS (dm-0): Ending recovery (logdev: internal) Buffer I/O error on dev dm-0, logical block 20639616, async page read Buffer I/O error on dev dm-0, logical block 20639617, async page read XFS (dm-0): log I/O error -5 XFS (dm-0): Filesystem has been shut down due to log error (0x2). XFS (dm-0): Unmounting Filesystem XFS (dm-0): Please unmount the filesystem and rectify the problem(s). ================================================================== BUG: KASAN: use-after-free in do_raw_spin_lock+0x246/0x270 Read of size 4 at addr ffff888109dd84c4 by task 3:1H/136
CPU: 3 PID: 136 Comm: 3:1H Not tainted 5.19.0-rc4-xfsx #rc4 8e53ab5ad0fddeb31cee5e7063ff9c361915a9c4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 Workqueue: xfs-log/dm-0 xlog_ioend_work [xfs] Call Trace: <TASK> dump_stack_lvl+0x34/0x44 print_report.cold+0x2b8/0x661 ? do_raw_spin_lock+0x246/0x270 kasan_report+0xab/0x120 ? do_raw_spin_lock+0x246/0x270 do_raw_spin_lock+0x246/0x270 ? rwlock_bug.part.0+0x90/0x90 xlog_force_shutdown+0xf6/0x370 [xfs 4ad76ae0d6add7e8183a553e624c31e9ed567318] xlog_ioend_work+0x100/0x190 [xfs 4ad76ae0d6add7e8183a553e624c31e9ed567318] process_one_work+0x672/0x1040 worker_thread+0x59b/0xec0 ? __kthread_parkme+0xc6/0x1f0 ? process_one_work+0x1040/0x1040 ? process_one_work+0x1040/0x1040 kthread+0x29e/0x340 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x1f/0x30 </TASK>
Allocated by task 154099: kasan_save_stack+0x1e/0x40 __kasan_kmalloc+0x81/0xa0 kmem_alloc+0x8d/0x2e0 [xfs] xlog_cil_init+0x1f/0x540 [xfs] xlog_alloc_log+0xd1e/0x1260 [xfs] xfs_log_mount+0xba/0x640 [xfs] xfs_mountfs+0xf2b/0x1d00 [xfs] xfs_fs_fill_super+0x10af/0x1910 [xfs] get_tree_bdev+0x383/0x670 vfs_get_tree+0x7d/0x240 path_mount+0xdb7/0x1890 __x64_sys_mount+0x1fa/0x270 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0
Freed by task 154151: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 ____kasan_slab_free+0x110/0x190 slab_free_freelist_hook+0xab/0x180 kfree+0xbc/0x310 xlog_dealloc_log+0x1b/0x2b0 [xfs] xfs_unmountfs+0x119/0x200 [xfs] xfs_fs_put_super+0x6e/0x2e0 [xfs] generic_shutdown_super+0x12b/0x3a0 kill_block_super+0x95/0xd0 deactivate_locked_super+0x80/0x130 cleanup_mnt+0x329/0x4d0 task_work_run+0xc5/0x160 exit_to_user_mode_prepare+0xd4/0xe0 syscall_exit_to_user_mode+0x1d/0x40 entry_SYSCALL_64_after_hwframe+0x46/0xb0
This appears to be a race between the unmount process, which frees the CIL and waits for in-flight iclog IO; and the iclog IO completion. When generic/475 runs, it starts fsstress in the background, waits a few seconds, and substitutes a dm-error device to simulate a disk falling out of a machine. If the fsstress encounters EIO on a pure data write, it will exit but the filesystem will still be online.
The next thing the test does is unmount the filesystem, which tries to clean the log, free the CIL, and wait for iclog IO completion. If an iclog was being written when the dm-error switch occurred, it can race with log unmounting as follows:
Thread 1 Thread 2
xfs_log_unmount xfs_log_clean xfs_log_quiesce xlog_ioend_work <observe error> xlog_force_shutdown test_and_set_bit(XLOG_IOERROR) xfs_log_force <log is shut down, nop> xfs_log_umount_write <log is shut down, nop> xlog_dealloc_log xlog_cil_destroy <wait for iclogs> spin_lock(&log->l_cilp->xc_push_lock) <KABOOM>
Therefore, free the CIL after waiting for the iclogs to complete. I /think/ this race has existed for quite a few years now, though I don't remember the ~2014 era logging code well enough to know if it was a real threat then or if the actual race was exposed only more recently.
Fixes: ac983517ec59 ("xfs: don't sleep in xlog_cil_force_lsn on shutdown") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_log.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0fb7d05ca308..eba295f666ac 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2061,8 +2061,6 @@ xlog_dealloc_log( xlog_in_core_t *iclog, *next_iclog; int i;
- xlog_cil_destroy(log); - /* * Cycle all the iclogbuf locks to make sure all log IO completion * is done before we tear down these buffers. @@ -2074,6 +2072,13 @@ xlog_dealloc_log( iclog = iclog->ic_next; }
+ /* + * Destroy the CIL after waiting for iclog IO completion because an + * iclog EIO error will try to shut down the log, which accesses the + * CIL to wake up the waiters. + */ + xlog_cil_destroy(log); + iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next;
From: Zhang Yi yi.zhang@huawei.com
[ Upstream commit 04a98a036cf8b810dda172a9dcfcbd783bf63655 ]
In the procedure of recover AGI unlinked lists, if something bad happenes on one of the unlinked inode in the bucket list, we would call xlog_recover_clear_agi_bucket() to clear the whole unlinked bucket list, not the unlinked inodes after the bad one. If we have already added some inodes to the gc workqueue before the bad inode in the list, we could get below error when freeing those inodes, and finaly fail to complete the log recover procedure.
XFS (ram0): Internal error xfs_iunlink_remove at line 2456 of file fs/xfs/xfs_inode.c. Caller xfs_ifree+0xb0/0x360 [xfs]
The problem is xlog_recover_clear_agi_bucket() clear the bucket list, so the gc worker fail to check the agino in xfs_verify_agino(). Fix this by flush workqueue before clearing the bucket.
Fixes: ab23a7768739 ("xfs: per-cpu deferred inode inactivation queues") Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Dave Chinner david@fromorbit.com Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_log_recover.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index aeb01d4c0423..04961ebf16ea 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2739,6 +2739,7 @@ xlog_recover_process_one_iunlink( * Call xlog_recover_clear_agi_bucket() to perform a transaction to * clear the inode pointer in the bucket. */ + xfs_inodegc_flush(mp); xlog_recover_clear_agi_bucket(mp, agno, bucket); return NULLAGINO; }
From: "Darrick J. Wong" djwong@kernel.org
[ Upstream commit 95ff0363f3f6ae70c21a0f2b0603e54438e5988b ]
The kernel build robot reported a UAF error while running xfs/433 (edited somewhat for brevity):
BUG: KASAN: use-after-free in xfs_attr3_node_inactive (fs/xfs/xfs_attr_inactive.c:214) xfs Read of size 4 at addr ffff88820ac2bd44 by task kworker/0:2/139
CPU: 0 PID: 139 Comm: kworker/0:2 Tainted: G S 5.19.0-rc2-00004-g7cf2b0f9611b #1 Hardware name: Hewlett-Packard p6-1451cx/2ADA, BIOS 8.15 02/05/2013 Workqueue: xfs-inodegc/sdb4 xfs_inodegc_worker [xfs] Call Trace: <TASK> dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1)) print_address_description+0x1f/0x200 print_report.cold (mm/kasan/report.c:430) kasan_report (mm/kasan/report.c:162 mm/kasan/report.c:493) xfs_attr3_node_inactive (fs/xfs/xfs_attr_inactive.c:214) xfs xfs_attr3_root_inactive (fs/xfs/xfs_attr_inactive.c:296) xfs xfs_attr_inactive (fs/xfs/xfs_attr_inactive.c:371) xfs xfs_inactive (fs/xfs/xfs_inode.c:1781) xfs xfs_inodegc_worker (fs/xfs/xfs_icache.c:1837 fs/xfs/xfs_icache.c:1860) xfs process_one_work worker_thread kthread ret_from_fork </TASK>
Allocated by task 139: kasan_save_stack (mm/kasan/common.c:39) __kasan_slab_alloc (mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) kmem_cache_alloc (mm/slab.h:750 mm/slub.c:3214 mm/slub.c:3222 mm/slub.c:3229 mm/slub.c:3239) _xfs_buf_alloc (include/linux/instrumented.h:86 include/linux/atomic/atomic-instrumented.h:41 fs/xfs/xfs_buf.c:232) xfs xfs_buf_get_map (fs/xfs/xfs_buf.c:660) xfs xfs_buf_read_map (fs/xfs/xfs_buf.c:777) xfs xfs_trans_read_buf_map (fs/xfs/xfs_trans_buf.c:289) xfs xfs_da_read_buf (fs/xfs/libxfs/xfs_da_btree.c:2652) xfs xfs_da3_node_read (fs/xfs/libxfs/xfs_da_btree.c:392) xfs xfs_attr3_root_inactive (fs/xfs/xfs_attr_inactive.c:272) xfs xfs_attr_inactive (fs/xfs/xfs_attr_inactive.c:371) xfs xfs_inactive (fs/xfs/xfs_inode.c:1781) xfs xfs_inodegc_worker (fs/xfs/xfs_icache.c:1837 fs/xfs/xfs_icache.c:1860) xfs process_one_work worker_thread kthread ret_from_fork
Freed by task 139: kasan_save_stack (mm/kasan/common.c:39) kasan_set_track (mm/kasan/common.c:45) kasan_set_free_info (mm/kasan/generic.c:372) __kasan_slab_free (mm/kasan/common.c:368 mm/kasan/common.c:328 mm/kasan/common.c:374) kmem_cache_free (mm/slub.c:1753 mm/slub.c:3507 mm/slub.c:3524) xfs_buf_rele (fs/xfs/xfs_buf.c:1040) xfs xfs_attr3_node_inactive (fs/xfs/xfs_attr_inactive.c:210) xfs xfs_attr3_root_inactive (fs/xfs/xfs_attr_inactive.c:296) xfs xfs_attr_inactive (fs/xfs/xfs_attr_inactive.c:371) xfs xfs_inactive (fs/xfs/xfs_inode.c:1781) xfs xfs_inodegc_worker (fs/xfs/xfs_icache.c:1837 fs/xfs/xfs_icache.c:1860) xfs process_one_work worker_thread kthread ret_from_fork
I reproduced this for my own satisfaction, and got the same report, along with an extra morsel:
The buggy address belongs to the object at ffff88802103a800 which belongs to the cache xfs_buf of size 432 The buggy address is located 396 bytes inside of 432-byte region [ffff88802103a800, ffff88802103a9b0)
I tracked this code down to:
error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, child_blkno, XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &child_bp); if (error) return error; error = bp->b_error;
That doesn't look right -- I think this should be dereferencing child_bp, not bp. Looking through the codebase history, I think this was added by commit 2911edb653b9 ("xfs: remove the mappedbno argument to xfs_da_get_buf"), which replaced a call to xfs_da_get_buf with the current call to xfs_trans_get_buf. Not sure why we trans_brelse'd @bp earlier in the function, but I'm guessing it's to avoid pinning too many buffers in memory while we inactivate the bottom of the attr tree. Hence we now have to get the buffer back.
I /think/ this was supposed to check child_bp->b_error and fail the rest of the invalidation if child_bp had experienced any kind of IO or corruption error. I bet the xfs_da3_node_read earlier in the loop will catch most cases of incoming on-disk corruption which makes this check mostly moot unless someone corrupts the buffer and the AIL pushes it out to disk while the buffer's unlocked.
In the first case we'll never get to the bad check, and in the second case the AIL will shut down the log, at which point there's no reason to check b_error. Remove the check, and null out @bp to avoid this problem in the future.
Cc: hch@lst.de Reported-by: kernel test robot oliver.sang@intel.com Fixes: 2911edb653b9 ("xfs: remove the mappedbno argument to xfs_da_get_buf") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_attr_inactive.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 2b5da6218977..2afa6d9a7f8f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -158,6 +158,7 @@ xfs_attr3_node_inactive( } child_fsb = be32_to_cpu(ichdr.btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + bp = NULL;
/* * If this is the node level just above the leaves, simply loop @@ -211,12 +212,8 @@ xfs_attr3_node_inactive( &child_bp); if (error) return error; - error = bp->b_error; - if (error) { - xfs_trans_brelse(*trans, child_bp); - return error; - } xfs_trans_binval(*trans, child_bp); + child_bp = NULL;
/* * If we're not done, re-read the parent to get the next @@ -233,6 +230,7 @@ xfs_attr3_node_inactive( bp->b_addr); child_fsb = be32_to_cpu(phdr.btree[i + 1].before); xfs_trans_brelse(*trans, bp); + bp = NULL; } /* * Atomically commit the whole invalidate stuff.
From: "Darrick J. Wong" djwong@kernel.org
[ Upstream commit c78c2d0903183a41beb90c56a923e30f90fa91b9 ]
I observed the following evidence of a memory leak while running xfs/399 from the xfs fsck test suite (edited for brevity):
XFS (sde): Metadata corruption detected at xfs_attr_shortform_verify_struct.part.0+0x7b/0xb0 [xfs], inode 0x1172 attr fork XFS: Assertion failed: ip->i_af.if_u1.if_data == NULL, file: fs/xfs/libxfs/xfs_inode_fork.c, line: 315 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 91635 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs] CPU: 2 PID: 91635 Comm: xfs_scrub Tainted: G W 5.19.0-rc7-xfsx #rc7 6e6475eb29fd9dda3181f81b7ca7ff961d277a40 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:assfail+0x46/0x4a [xfs] Call Trace: <TASK> xfs_ifork_zap_attr+0x7c/0xb0 xfs_iformat_attr_fork+0x86/0x110 xfs_inode_from_disk+0x41d/0x480 xfs_iget+0x389/0xd70 xfs_bulkstat_one_int+0x5b/0x540 xfs_bulkstat_iwalk+0x1e/0x30 xfs_iwalk_ag_recs+0xd1/0x160 xfs_iwalk_run_callbacks+0xb9/0x180 xfs_iwalk_ag+0x1d8/0x2e0 xfs_iwalk+0x141/0x220 xfs_bulkstat+0x105/0x180 xfs_ioc_bulkstat.constprop.0.isra.0+0xc5/0x130 xfs_file_ioctl+0xa5f/0xef0 __x64_sys_ioctl+0x82/0xa0 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0
This newly-added assertion checks that there aren't any incore data structures hanging off the incore fork when we're trying to reset its contents. From the call trace, it is evident that iget was trying to construct an incore inode from the ondisk inode, but the attr fork verifier failed and we were trying to undo all the memory allocations that we had done earlier.
The three assertions in xfs_ifork_zap_attr check that the caller has already called xfs_idestroy_fork, which clearly has not been done here. As the zap function then zeroes the pointers, we've effectively leaked the memory.
The shortest change would have been to insert an extra call to xfs_idestroy_fork, but it makes more sense to bundle the _idestroy_fork call into _zap_attr, since all other callsites call _idestroy_fork immediately prior to calling _zap_attr. IOWs, it eliminates one way to fail.
Note: This change only applies cleanly to 2ed5b09b3e8f, since we just reworked the attr fork lifetime. However, I think this memory leak has existed since 0f45a1b20cd8, since the chain xfs_iformat_attr_fork -> xfs_iformat_local -> xfs_init_local_fork will allocate ifp->if_u1.if_data, but if xfs_ifork_verify_local_attr fails, xfs_iformat_attr_fork will free i_afp without freeing any of the stuff hanging off i_afp. The solution for older kernels I think is to add the missing call to xfs_idestroy_fork just prior to calling kmem_cache_free.
Found by fuzzing a.sfattr.hdr.totsize = lastbit in xfs/399.
[ backport note: did not include refactoring of xfs_idestroy_fork into xfs_ifork_zap_attr, simply added the missing call as suggested in the commit for backports ]
Fixes: 2ed5b09b3e8f ("xfs: make inode attribute forks a permanent part of struct xfs_inode") Probably-Fixes: 0f45a1b20cd8 ("xfs: improve local fork verification") Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/libxfs/xfs_inode_fork.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 20095233d7bc..c1f965af8432 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -330,6 +330,7 @@ xfs_iformat_attr_fork( }
if (error) { + xfs_idestroy_fork(ip->i_afp); kmem_cache_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; }
From: ChenXiaoSong chenxiaosong2@huawei.com
[ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
Reproducer: 1. fallocate -l 100M image 2. mkfs.xfs -f image 3. mount image /mnt 4. setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE) 5. char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7"; fd = open("/mnt", O_RDONLY|O_DIRECTORY); ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
NULL pointer dereference will occur when race happens between xfs_getbmap() and xfs_bmap_set_attrforkoff():
ioctl | setxattr ----------------------------|--------------------------- xfs_getbmap | xfs_ifork_ptr | xfs_inode_has_attr_fork | ip->i_forkoff == 0 | return NULL | ifp == NULL | | xfs_bmap_set_attrforkoff | ip->i_forkoff > 0 xfs_inode_has_attr_fork | ip->i_forkoff > 0 | ifp == NULL | ifp->if_format |
Fix this by locking i_lock before xfs_ifork_ptr().
Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers") Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org [djwong: added fixes tag] Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_bmap_util.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fd2ad6a3019c..bea6cc26abf9 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -439,29 +439,28 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK; - ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK: + lock = xfs_ilock_attr_map_shared(ip); if (!XFS_IFORK_Q(ip)) - goto out_unlock_iolock; + goto out_unlock_ilock;
max_len = 1LL << 32; - lock = xfs_ilock_attr_map_shared(ip); break; case XFS_COW_FORK: + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); + /* No CoW fork? Just return */ - if (!ifp) - goto out_unlock_iolock; + if (!XFS_IFORK_PTR(ip, whichfork)) + goto out_unlock_ilock;
if (xfs_get_cowextsz_hint(ip)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip); - - lock = XFS_ILOCK_SHARED; - xfs_ilock(ip, lock); break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && @@ -491,6 +490,8 @@ xfs_getbmap( break; }
+ ifp = XFS_IFORK_PTR(ip, whichfork); + switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE:
On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
From: ChenXiaoSong chenxiaosong2@huawei.com
[ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
Reproducer:
- fallocate -l 100M image
- mkfs.xfs -f image
- mount image /mnt
- setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
- char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7"; fd = open("/mnt", O_RDONLY|O_DIRECTORY); ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
NULL pointer dereference will occur when race happens between xfs_getbmap() and xfs_bmap_set_attrforkoff():
ioctl | setxattr
----------------------------|--------------------------- xfs_getbmap | xfs_ifork_ptr | xfs_inode_has_attr_fork | ip->i_forkoff == 0 | return NULL | ifp == NULL | | xfs_bmap_set_attrforkoff | ip->i_forkoff > 0 xfs_inode_has_attr_fork | ip->i_forkoff > 0 | ifp == NULL | ifp->if_format |
Fix this by locking i_lock before xfs_ifork_ptr().
Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers") Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org [djwong: added fixes tag] Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org
fs/xfs/xfs_bmap_util.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fd2ad6a3019c..bea6cc26abf9 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -439,29 +439,28 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK;
- ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK:
if (!XFS_IFORK_Q(ip))lock = xfs_ilock_attr_map_shared(ip);
goto out_unlock_iolock;
goto out_unlock_ilock;
max_len = 1LL << 32;
break; case XFS_COW_FORK:lock = xfs_ilock_attr_map_shared(ip);
lock = XFS_ILOCK_SHARED;
xfs_ilock(ip, lock);
- /* No CoW fork? Just return */
if (!ifp)
goto out_unlock_iolock;
if (!XFS_IFORK_PTR(ip, whichfork))
goto out_unlock_ilock;
if (xfs_get_cowextsz_hint(ip)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip);
lock = XFS_ILOCK_SHARED;
break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) &&xfs_ilock(ip, lock);
@@ -491,6 +490,8 @@ xfs_getbmap( break; }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE:
-- 2.43.0.rc0.421.g78406f8d94-goog
This patch breaks the build, how was it tested?
fs/xfs/xfs_bmap_util.c: In function ‘xfs_getbmap’: fs/xfs/xfs_bmap_util.c:457:21: error: the comparison will always evaluate as ‘true’ for the address of ‘i_df’ will never be NULL [-Werror=address] 457 | if (!XFS_IFORK_PTR(ip, whichfork)) | ^ In file included from fs/xfs/xfs_bmap_util.c:16: fs/xfs/xfs_inode.h:38:33: note: ‘i_df’ declared here 38 | struct xfs_ifork i_df; /* data fork */ | ^~~~ cc1: all warnings being treated as errors
On Mon, Nov 20, 2023 at 04:38:24PM +0100, Greg KH wrote:
On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
From: ChenXiaoSong chenxiaosong2@huawei.com
[ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
Reproducer:
- fallocate -l 100M image
- mkfs.xfs -f image
- mount image /mnt
- setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
- char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7"; fd = open("/mnt", O_RDONLY|O_DIRECTORY); ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
NULL pointer dereference will occur when race happens between xfs_getbmap() and xfs_bmap_set_attrforkoff():
ioctl | setxattr
----------------------------|--------------------------- xfs_getbmap | xfs_ifork_ptr | xfs_inode_has_attr_fork | ip->i_forkoff == 0 | return NULL | ifp == NULL | | xfs_bmap_set_attrforkoff | ip->i_forkoff > 0 xfs_inode_has_attr_fork | ip->i_forkoff > 0 | ifp == NULL | ifp->if_format |
Fix this by locking i_lock before xfs_ifork_ptr().
Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers") Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org [djwong: added fixes tag] Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org
fs/xfs/xfs_bmap_util.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fd2ad6a3019c..bea6cc26abf9 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -439,29 +439,28 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK;
- ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK:
if (!XFS_IFORK_Q(ip))lock = xfs_ilock_attr_map_shared(ip);
goto out_unlock_iolock;
goto out_unlock_ilock;
max_len = 1LL << 32;
break; case XFS_COW_FORK:lock = xfs_ilock_attr_map_shared(ip);
lock = XFS_ILOCK_SHARED;
xfs_ilock(ip, lock);
- /* No CoW fork? Just return */
if (!ifp)
goto out_unlock_iolock;
if (!XFS_IFORK_PTR(ip, whichfork))
Is this the line 457 that the compiler complains about below?
If so, then whichfork == XFS_COW_FORK here, because the code just switch()d on that. The XFS_IFORK_PTR macro decomposes into:
#define XFS_IFORK_PTR(ip,w) \ ((w) == XFS_DATA_FORK ? \ &(ip)->i_df : \ ((w) == XFS_ATTR_FORK ? \ (ip)->i_afp : \ (ip)->i_cowfp))
which means this test /should/ be turning into:
if (!(ip->i_cowfp)) goto out_unlock_ilock;
I'm not sure why your compiler is whining about &ip->i_df; that's not what's being selected here for testing. Unless your compiler is somehow deciding that XFS_DATA_FORK == XFS_COW_FORK? This should not ever be possible.
--D
goto out_unlock_ilock;
if (xfs_get_cowextsz_hint(ip)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip);
lock = XFS_ILOCK_SHARED;
break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) &&xfs_ilock(ip, lock);
@@ -491,6 +490,8 @@ xfs_getbmap( break; }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE:
-- 2.43.0.rc0.421.g78406f8d94-goog
This patch breaks the build, how was it tested?
fs/xfs/xfs_bmap_util.c: In function ‘xfs_getbmap’: fs/xfs/xfs_bmap_util.c:457:21: error: the comparison will always evaluate as ‘true’ for the address of ‘i_df’ will never be NULL [-Werror=address] 457 | if (!XFS_IFORK_PTR(ip, whichfork)) | ^ In file included from fs/xfs/xfs_bmap_util.c:16: fs/xfs/xfs_inode.h:38:33: note: ‘i_df’ declared here 38 | struct xfs_ifork i_df; /* data fork */ | ^~~~ cc1: all warnings being treated as errors
On Mon, Nov 20, 2023 at 11:11:30AM -0800, Darrick J. Wong wrote:
On Mon, Nov 20, 2023 at 04:38:24PM +0100, Greg KH wrote:
On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
From: ChenXiaoSong chenxiaosong2@huawei.com
[ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
Reproducer:
- fallocate -l 100M image
- mkfs.xfs -f image
- mount image /mnt
- setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
- char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7"; fd = open("/mnt", O_RDONLY|O_DIRECTORY); ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
NULL pointer dereference will occur when race happens between xfs_getbmap() and xfs_bmap_set_attrforkoff():
ioctl | setxattr
----------------------------|--------------------------- xfs_getbmap | xfs_ifork_ptr | xfs_inode_has_attr_fork | ip->i_forkoff == 0 | return NULL | ifp == NULL | | xfs_bmap_set_attrforkoff | ip->i_forkoff > 0 xfs_inode_has_attr_fork | ip->i_forkoff > 0 | ifp == NULL | ifp->if_format |
Fix this by locking i_lock before xfs_ifork_ptr().
Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers") Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org [djwong: added fixes tag] Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org
fs/xfs/xfs_bmap_util.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fd2ad6a3019c..bea6cc26abf9 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -439,29 +439,28 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK;
- ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK:
if (!XFS_IFORK_Q(ip))lock = xfs_ilock_attr_map_shared(ip);
goto out_unlock_iolock;
goto out_unlock_ilock;
max_len = 1LL << 32;
break; case XFS_COW_FORK:lock = xfs_ilock_attr_map_shared(ip);
lock = XFS_ILOCK_SHARED;
xfs_ilock(ip, lock);
- /* No CoW fork? Just return */
if (!ifp)
goto out_unlock_iolock;
if (!XFS_IFORK_PTR(ip, whichfork))
Is this the line 457 that the compiler complains about below?
If so, then whichfork == XFS_COW_FORK here, because the code just switch()d on that. The XFS_IFORK_PTR macro decomposes into:
#define XFS_IFORK_PTR(ip,w) \ ((w) == XFS_DATA_FORK ? \ &(ip)->i_df : \ ((w) == XFS_ATTR_FORK ? \ (ip)->i_afp : \ (ip)->i_cowfp))
which means this test /should/ be turning into:
if (!(ip->i_cowfp)) goto out_unlock_ilock;
I'm not sure why your compiler is whining about &ip->i_df; that's not what's being selected here for testing. Unless your compiler is somehow deciding that XFS_DATA_FORK == XFS_COW_FORK? This should not ever be possible.
This is using gcc-12, and gcc-13, no idea what happened, just that it throws up that warning which stops my builds :(
thanks,
greg k-h
On Mon, Nov 20, 2023 at 04:38:24PM +0100, Greg KH wrote:
On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
From: ChenXiaoSong chenxiaosong2@huawei.com
[ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
Reproducer:
- fallocate -l 100M image
- mkfs.xfs -f image
- mount image /mnt
- setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
- char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7"; fd = open("/mnt", O_RDONLY|O_DIRECTORY); ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
NULL pointer dereference will occur when race happens between xfs_getbmap() and xfs_bmap_set_attrforkoff():
ioctl | setxattr
----------------------------|--------------------------- xfs_getbmap | xfs_ifork_ptr | xfs_inode_has_attr_fork | ip->i_forkoff == 0 | return NULL | ifp == NULL | | xfs_bmap_set_attrforkoff | ip->i_forkoff > 0 xfs_inode_has_attr_fork | ip->i_forkoff > 0 | ifp == NULL | ifp->if_format |
Fix this by locking i_lock before xfs_ifork_ptr().
Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers") Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org [djwong: added fixes tag] Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org
fs/xfs/xfs_bmap_util.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fd2ad6a3019c..bea6cc26abf9 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -439,29 +439,28 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK;
ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK:
if (!XFS_IFORK_Q(ip))lock = xfs_ilock_attr_map_shared(ip);
goto out_unlock_iolock;
goto out_unlock_ilock;
max_len = 1LL << 32;
break; case XFS_COW_FORK:lock = xfs_ilock_attr_map_shared(ip);
lock = XFS_ILOCK_SHARED;
xfs_ilock(ip, lock);
- /* No CoW fork? Just return */
if (!ifp)
goto out_unlock_iolock;
if (!XFS_IFORK_PTR(ip, whichfork))
goto out_unlock_ilock;
if (xfs_get_cowextsz_hint(ip)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip);
lock = XFS_ILOCK_SHARED;
break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) &&xfs_ilock(ip, lock);
@@ -491,6 +490,8 @@ xfs_getbmap( break; }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE:
-- 2.43.0.rc0.421.g78406f8d94-goog
This patch breaks the build, how was it tested?
fs/xfs/xfs_bmap_util.c: In function ‘xfs_getbmap’: fs/xfs/xfs_bmap_util.c:457:21: error: the comparison will always evaluate as ‘true’ for the address of ‘i_df’ will never be NULL [-Werror=address] 457 | if (!XFS_IFORK_PTR(ip, whichfork)) | ^ In file included from fs/xfs/xfs_bmap_util.c:16: fs/xfs/xfs_inode.h:38:33: note: ‘i_df’ declared here 38 | struct xfs_ifork i_df; /* data fork */ | ^~~~ cc1: all warnings being treated as errors
That's odd. I actually ended up queueing these patches earlier, and I don't see any such warnings.
Looking at the code, this is a bit weird too - do you see these warnings with the current 5.15 queue?
On Mon, Nov 20, 2023 at 09:50:45PM -0500, Sasha Levin wrote:
On Mon, Nov 20, 2023 at 04:38:24PM +0100, Greg KH wrote:
On Wed, Nov 15, 2023 at 06:28:25PM -0800, Leah Rumancik wrote:
From: ChenXiaoSong chenxiaosong2@huawei.com
[ Upstream commit 001c179c4e26d04db8c9f5e3fef9558b58356be6 ]
Reproducer:
- fallocate -l 100M image
- mkfs.xfs -f image
- mount image /mnt
- setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE)
- char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7"; fd = open("/mnt", O_RDONLY|O_DIRECTORY); ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg);
NULL pointer dereference will occur when race happens between xfs_getbmap() and xfs_bmap_set_attrforkoff():
ioctl | setxattr
----------------------------|--------------------------- xfs_getbmap | xfs_ifork_ptr | xfs_inode_has_attr_fork | ip->i_forkoff == 0 | return NULL | ifp == NULL | | xfs_bmap_set_attrforkoff | ip->i_forkoff > 0 xfs_inode_has_attr_fork | ip->i_forkoff > 0 | ifp == NULL | ifp->if_format |
Fix this by locking i_lock before xfs_ifork_ptr().
Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers") Signed-off-by: ChenXiaoSong chenxiaosong2@huawei.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org [djwong: added fixes tag] Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org
fs/xfs/xfs_bmap_util.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fd2ad6a3019c..bea6cc26abf9 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -439,29 +439,28 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK;
ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK:
if (!XFS_IFORK_Q(ip))lock = xfs_ilock_attr_map_shared(ip);
goto out_unlock_iolock;
goto out_unlock_ilock;
max_len = 1LL << 32;
break; case XFS_COW_FORK:lock = xfs_ilock_attr_map_shared(ip);
lock = XFS_ILOCK_SHARED;
xfs_ilock(ip, lock);
- /* No CoW fork? Just return */
if (!ifp)
goto out_unlock_iolock;
if (!XFS_IFORK_PTR(ip, whichfork))
goto out_unlock_ilock;
if (xfs_get_cowextsz_hint(ip)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip);
lock = XFS_ILOCK_SHARED;
break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) &&xfs_ilock(ip, lock);
@@ -491,6 +490,8 @@ xfs_getbmap( break; }
- ifp = XFS_IFORK_PTR(ip, whichfork);
- switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE:
-- 2.43.0.rc0.421.g78406f8d94-goog
This patch breaks the build, how was it tested?
fs/xfs/xfs_bmap_util.c: In function ‘xfs_getbmap’: fs/xfs/xfs_bmap_util.c:457:21: error: the comparison will always evaluate as ‘true’ for the address of ‘i_df’ will never be NULL [-Werror=address] 457 | if (!XFS_IFORK_PTR(ip, whichfork)) | ^ In file included from fs/xfs/xfs_bmap_util.c:16: fs/xfs/xfs_inode.h:38:33: note: ‘i_df’ declared here 38 | struct xfs_ifork i_df; /* data fork */ | ^~~~ cc1: all warnings being treated as errors
That's odd. I actually ended up queueing these patches earlier, and I don't see any such warnings.
Looking at the code, this is a bit weird too - do you see these warnings with the current 5.15 queue?
I did, that's where I saw this, so I dropped this commit from there, it failed my builds using gcc-12.
thanks,
greg k-h
From: "Darrick J. Wong" djwong@kernel.org
[ Upstream commit f0c2d7d2abca24d19831c99edea458704fac8087 ]
Every now and then, I see the following hang during mount time quotacheck when running fstests. Turning on KASAN seems to make it happen somewhat more frequently. I've edited the backtrace for brevity.
XFS (sdd): Quotacheck needed: Please wait. XFS: Assertion failed: bp->b_flags & _XBF_DELWRI_Q, file: fs/xfs/xfs_buf.c, line: 2411 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1831409 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs] CPU: 0 PID: 1831409 Comm: mount Tainted: G W 5.19.0-rc6-xfsx #rc6 09911566947b9f737b036b4af85e399e4b9aef64 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:assfail+0x46/0x4a [xfs] Code: a0 8f 41 a0 e8 45 fe ff ff 8a 1d 2c 36 10 00 80 fb 01 76 0f 0f b6 f3 48 c7 c7 c0 f0 4f a0 e8 10 f0 02 e1 80 e3 01 74 02 0f 0b <0f> 0b 5b c3 48 8d 45 10 48 89 e2 4c 89 e6 48 89 1c 24 48 89 44 24 RSP: 0018:ffffc900078c7b30 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8880099ac000 RCX: 000000007fffffff RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffa0418fa0 RBP: ffff8880197bc1c0 R08: 0000000000000000 R09: 000000000000000a R10: 000000000000000a R11: f000000000000000 R12: ffffc900078c7d20 R13: 00000000fffffff5 R14: ffffc900078c7d20 R15: 0000000000000000 FS: 00007f0449903800(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005610ada631f0 CR3: 0000000014dd8002 CR4: 00000000001706f0 Call Trace: <TASK> xfs_buf_delwri_pushbuf+0x150/0x160 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368] xfs_qm_flush_one+0xd6/0x130 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368] xfs_qm_dquot_walk.isra.0+0x109/0x1e0 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368] xfs_qm_quotacheck+0x319/0x490 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368] xfs_qm_mount_quotas+0x65/0x2c0 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368] xfs_mountfs+0x6b5/0xab0 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368] xfs_fs_fill_super+0x781/0x990 [xfs 4561f5b32c9bfb874ec98d58d0719464e1f87368] get_tree_bdev+0x175/0x280 vfs_get_tree+0x1a/0x80 path_mount+0x6f5/0xaa0 __x64_sys_mount+0x103/0x140 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0
I /think/ this can happen if xfs_qm_flush_one is racing with xfs_qm_dquot_isolate (i.e. dquot reclaim) when the second function has taken the dquot flush lock but xfs_qm_dqflush hasn't yet locked the dquot buffer, let alone queued it to the delwri list. In this case, flush_one will fail to get the dquot flush lock, but it can lock the incore buffer, but xfs_buf_delwri_pushbuf will then trip over this ASSERT, which checks that the buffer isn't on a delwri list. The hang results because the _delwri_submit_buffers ignores non DELWRI_Q buffers, which means that xfs_buf_iowait waits forever for an IO that has not yet been scheduled.
AFAICT, a reasonable solution here is to detect a dquot buffer that is not on a DELWRI list, drop it, and return -EAGAIN to try the flush again. It's not /that/ big of a deal if quotacheck writes the dquot buffer repeatedly before we even set QUOTA_CHKD.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_qm.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 623244650a2f..792736e29a37 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1244,6 +1244,13 @@ xfs_qm_flush_one( error = -EINVAL; goto out_unlock; } + + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + error = -EAGAIN; + xfs_buf_relse(bp); + goto out_unlock; + } + xfs_buf_unlock(bp);
xfs_buf_delwri_pushbuf(bp, buffer_list);
From: Gao Xiang hsiangkao@linux.alibaba.com
[ Upstream commit 1a39ae415c1be1e46f5b3f97d438c7c4adc22b63 ]
COW extents are already converted into written real extents after xfs_reflink_convert_cow_locked(), therefore cmap->br_state should reflect it.
Otherwise, there is another necessary unwritten convertion triggered in xfs_dio_write_end_io() for direct I/O cases.
Signed-off-by: Gao Xiang hsiangkao@linux.alibaba.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_reflink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 36832e4bc803..628ce65d02bb 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -425,7 +425,10 @@ xfs_reflink_allocate_cow( if (!convert_now || cmap->br_state == XFS_EXT_NORM) return 0; trace_xfs_reflink_convert_cow(ip, cmap); - return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + if (!error) + cmap->br_state = XFS_EXT_NORM; + return error;
out_trans_cancel: xfs_trans_cancel(tp);
From: Chandan Babu R chandan.babu@oracle.com
[ Upstream commit d62113303d691bcd8d0675ae4ac63e7769afc56c ]
On a higly fragmented filesystem a Direct IO write can fail with -ENOSPC error even though the filesystem has sufficient number of free blocks.
This occurs if the file offset range on which the write operation is being performed has a delalloc extent in the cow fork and this delalloc extent begins much before the Direct IO range.
In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to allocate the blocks mapped by the delalloc extent. The extent thus allocated may not cover the beginning of file offset range on which the Direct IO write was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC.
The following script reliably recreates the bug described above.
#!/usr/bin/bash
device=/dev/loop0 shortdev=$(basename $device)
mntpnt=/mnt/ file1=${mntpnt}/file1 file2=${mntpnt}/file2 fragmentedfile=${mntpnt}/fragmentedfile punchprog=/root/repos/xfstests-dev/src/punch-alternating
errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent
umount $device > /dev/null 2>&1
echo "Create FS" mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1 if [[ $? != 0 ]]; then echo "mkfs failed." exit 1 fi
echo "Mount FS" mount $device $mntpnt > /dev/null 2>&1 if [[ $? != 0 ]]; then echo "mount failed." exit 1 fi
echo "Create source file" xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1
sync
echo "Create Reflinked file" xfs_io -f -c "reflink $file1" $file2 &>/dev/null
echo "Set cowextsize" xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1
echo "Fragment FS" xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1 sync $punchprog $fragmentedfile
echo "Allocate block sized extent from now onwards" echo -n 1 > $errortag
echo "Create 16MiB delalloc extent in CoW fork" xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1
sync
echo "Direct I/O write at offset 12k" xfs_io -d -c "pwrite 12k 8k" $file1
This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk blocks are allocated for atleast the starting file offset of the Direct IO write range.
Fixes: 3c68d44a2b49 ("xfs: allocate direct I/O COW blocks in iomap_begin") Reported-and-Root-caused-by: Wengang Wang wen.gang.wang@oracle.com Signed-off-by: Chandan Babu R chandan.babu@oracle.com Reviewed-by: Darrick J. Wong djwong@kernel.org [djwong: slight editing to make the locking less grody, and fix some style things] Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_reflink.c | 198 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 163 insertions(+), 35 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 628ce65d02bb..793bdf5ac2f7 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -340,9 +340,41 @@ xfs_find_trim_cow_extent( return 0; }
-/* Allocate all CoW reservations covering a range of blocks in a file. */ -int -xfs_reflink_allocate_cow( +static int +xfs_reflink_convert_unwritten( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool convert_now) +{ + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + int error; + + /* + * cmap might larger than imap due to cowextsize hint. + */ + xfs_trim_extent(cmap, offset_fsb, count_fsb); + + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!convert_now || cmap->br_state == XFS_EXT_NORM) + return 0; + + trace_xfs_reflink_convert_cow(ip, cmap); + + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); + if (!error) + cmap->br_state = XFS_EXT_NORM; + + return error; +} + +static int +xfs_reflink_fill_cow_hole( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, struct xfs_bmbt_irec *cmap, @@ -351,25 +383,12 @@ xfs_reflink_allocate_cow( bool convert_now) { struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = imap->br_startoff; - xfs_filblks_t count_fsb = imap->br_blockcount; struct xfs_trans *tp; - int nimaps, error = 0; - bool found; xfs_filblks_t resaligned; - xfs_extlen_t resblks = 0; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (!ip->i_cowfp) { - ASSERT(!xfs_is_reflink_inode(ip)); - xfs_ifork_init_cow(ip); - } - - error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); - if (error || !*shared) - return error; - if (found) - goto convert; + xfs_extlen_t resblks; + int nimaps; + int error; + bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff, imap->br_blockcount, xfs_get_cowextsz_hint(ip)); @@ -385,17 +404,17 @@ xfs_reflink_allocate_cow(
*lockmode = XFS_ILOCK_EXCL;
- /* - * Check for an overlapping extent again now that we dropped the ilock. - */ error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); if (error || !*shared) goto out_trans_cancel; + if (found) { xfs_trans_cancel(tp); goto convert; }
+ ASSERT(cmap->br_startoff > imap->br_startoff); + /* Allocate the entire reservation as unwritten blocks. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, @@ -415,26 +434,135 @@ xfs_reflink_allocate_cow( */ if (nimaps == 0) return -ENOSPC; + convert: - xfs_trim_extent(cmap, offset_fsb, count_fsb); - /* - * COW fork extents are supposed to remain unwritten until we're ready - * to initiate a disk write. For direct I/O we are going to write the - * data and need the conversion, but for buffered writes we're done. - */ - if (!convert_now || cmap->br_state == XFS_EXT_NORM) - return 0; - trace_xfs_reflink_convert_cow(ip, cmap); - error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); - if (!error) - cmap->br_state = XFS_EXT_NORM; + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); + +out_trans_cancel: + xfs_trans_cancel(tp); return error; +} + +static int +xfs_reflink_fill_delalloc( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool *shared, + uint *lockmode, + bool convert_now) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int nimaps; + int error; + bool found; + + do { + xfs_iunlock(ip, *lockmode); + *lockmode = 0; + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0, + false, &tp); + if (error) + return error; + + *lockmode = XFS_ILOCK_EXCL; + + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, + &found); + if (error || !*shared) + goto out_trans_cancel; + + if (found) { + xfs_trans_cancel(tp); + break; + } + + ASSERT(isnullstartblock(cmap->br_startblock) || + cmap->br_startblock == DELAYSTARTBLOCK); + + /* + * Replace delalloc reservation with an unwritten extent. + */ + nimaps = 1; + error = xfs_bmapi_write(tp, ip, cmap->br_startoff, + cmap->br_blockcount, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, + cmap, &nimaps); + if (error) + goto out_trans_cancel; + + xfs_inode_set_cowblocks_tag(ip); + error = xfs_trans_commit(tp); + if (error) + return error; + + /* + * Allocation succeeded but the requested range was not even + * partially satisfied? Bail out! + */ + if (nimaps == 0) + return -ENOSPC; + } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); + + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
out_trans_cancel: xfs_trans_cancel(tp); return error; }
+/* Allocate all CoW reservations covering a range of blocks in a file. */ +int +xfs_reflink_allocate_cow( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + struct xfs_bmbt_irec *cmap, + bool *shared, + uint *lockmode, + bool convert_now) +{ + int error; + bool found; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + + error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); + if (error || !*shared) + return error; + + /* CoW fork has a real extent */ + if (found) + return xfs_reflink_convert_unwritten(ip, imap, cmap, + convert_now); + + /* + * CoW fork does not have an extent and data extent is shared. + * Allocate a real extent in the CoW fork. + */ + if (cmap->br_startoff > imap->br_startoff) + return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared, + lockmode, convert_now); + + /* + * CoW fork has a delalloc reservation. Replace it with a real extent. + * There may or may not be a data fork mapping. + */ + if (isnullstartblock(cmap->br_startblock) || + cmap->br_startblock == DELAYSTARTBLOCK) + return xfs_reflink_fill_delalloc(ip, imap, cmap, shared, + lockmode, convert_now); + + /* Shouldn't get here. */ + ASSERT(0); + return -EFSCORRUPTED; +} + /* * Cancel CoW reservations for some block range of an inode. *
From: hexiaole hexiaole@kylinos.cn
[ Upstream commit 031d166f968efba6e4f091ff75d0bb5206bb3918 ]
In 'fs/xfs/libxfs/xfs_trans_resv.c', the comment for transaction of removing a directory entry writes:
/* fs/xfs/libxfs/xfs_trans_resv.c begin */ /* * For removing a directory entry we can modify: * the parent directory inode: inode size * the removed inode: inode size ... xfs_calc_remove_reservation( struct xfs_mount *mp) { return XFS_DQUOT_LOGRES(mp) + xfs_calc_iunlink_add_reservation(mp) + max((xfs_calc_inode_res(mp, 1) + ... /* fs/xfs/libxfs/xfs_trans_resv.c end */
There has 2 inode size of space to be reserverd, but the actual code for inode reservation space writes.
There only count for 1 inode size to be reserved in 'xfs_calc_inode_res(mp, 1)', rather than 2.
Signed-off-by: hexiaole hexiaole@kylinos.cn Reviewed-by: Darrick J. Wong djwong@kernel.org [djwong: remove redundant code citations] Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/libxfs/xfs_trans_resv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 5e300daa2559..2db9d9d12344 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -423,7 +423,7 @@ xfs_calc_remove_reservation( { return XFS_DQUOT_LOGRES(mp) + xfs_calc_iunlink_add_reservation(mp) + - max((xfs_calc_inode_res(mp, 1) + + max((xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1))), (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
From: "Darrick J. Wong" djwong@kernel.org
[ Upstream commit 97cf79677ecb50a38517253ae2fd705849a7e51a ]
KASAN reported a UAF bug when I was running xfs/235:
BUG: KASAN: use-after-free in xlog_recover_process_intents+0xa77/0xae0 [xfs] Read of size 8 at addr ffff88804391b360 by task mount/5680
CPU: 2 PID: 5680 Comm: mount Not tainted 6.0.0-xfsx #6.0.0 77e7b52a4943a975441e5ac90a5ad7748b7867f6 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 Call Trace: <TASK> dump_stack_lvl+0x34/0x44 print_report.cold+0x2cc/0x682 kasan_report+0xa3/0x120 xlog_recover_process_intents+0xa77/0xae0 [xfs fb841c7180aad3f8359438576e27867f5795667e] xlog_recover_finish+0x7d/0x970 [xfs fb841c7180aad3f8359438576e27867f5795667e] xfs_log_mount_finish+0x2d7/0x5d0 [xfs fb841c7180aad3f8359438576e27867f5795667e] xfs_mountfs+0x11d4/0x1d10 [xfs fb841c7180aad3f8359438576e27867f5795667e] xfs_fs_fill_super+0x13d5/0x1a80 [xfs fb841c7180aad3f8359438576e27867f5795667e] get_tree_bdev+0x3da/0x6e0 vfs_get_tree+0x7d/0x240 path_mount+0xdd3/0x17d0 __x64_sys_mount+0x1fa/0x270 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 RIP: 0033:0x7ff5bc069eae Code: 48 8b 0d 85 1f 0f 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 52 1f 0f 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe433fd448 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007ff5bc069eae RDX: 00005575d7213290 RSI: 00005575d72132d0 RDI: 00005575d72132b0 RBP: 00005575d7212fd0 R08: 00005575d7213230 R09: 00005575d7213fe0 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00005575d7213290 R14: 00005575d72132b0 R15: 00005575d7212fd0 </TASK>
Allocated by task 5680: kasan_save_stack+0x1e/0x40 __kasan_slab_alloc+0x66/0x80 kmem_cache_alloc+0x152/0x320 xfs_rui_init+0x17a/0x1b0 [xfs] xlog_recover_rui_commit_pass2+0xb9/0x2e0 [xfs] xlog_recover_items_pass2+0xe9/0x220 [xfs] xlog_recover_commit_trans+0x673/0x900 [xfs] xlog_recovery_process_trans+0xbe/0x130 [xfs] xlog_recover_process_data+0x103/0x2a0 [xfs] xlog_do_recovery_pass+0x548/0xc60 [xfs] xlog_do_log_recovery+0x62/0xc0 [xfs] xlog_do_recover+0x73/0x480 [xfs] xlog_recover+0x229/0x460 [xfs] xfs_log_mount+0x284/0x640 [xfs] xfs_mountfs+0xf8b/0x1d10 [xfs] xfs_fs_fill_super+0x13d5/0x1a80 [xfs] get_tree_bdev+0x3da/0x6e0 vfs_get_tree+0x7d/0x240 path_mount+0xdd3/0x17d0 __x64_sys_mount+0x1fa/0x270 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0
Freed by task 5680: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_set_free_info+0x20/0x30 ____kasan_slab_free+0x144/0x1b0 slab_free_freelist_hook+0xab/0x180 kmem_cache_free+0x1f1/0x410 xfs_rud_item_release+0x33/0x80 [xfs] xfs_trans_free_items+0xc3/0x220 [xfs] xfs_trans_cancel+0x1fa/0x590 [xfs] xfs_rui_item_recover+0x913/0xd60 [xfs] xlog_recover_process_intents+0x24e/0xae0 [xfs] xlog_recover_finish+0x7d/0x970 [xfs] xfs_log_mount_finish+0x2d7/0x5d0 [xfs] xfs_mountfs+0x11d4/0x1d10 [xfs] xfs_fs_fill_super+0x13d5/0x1a80 [xfs] get_tree_bdev+0x3da/0x6e0 vfs_get_tree+0x7d/0x240 path_mount+0xdd3/0x17d0 __x64_sys_mount+0x1fa/0x270 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0
The buggy address belongs to the object at ffff88804391b300 which belongs to the cache xfs_rui_item of size 688 The buggy address is located 96 bytes inside of 688-byte region [ffff88804391b300, ffff88804391b5b0)
The buggy address belongs to the physical page: page:ffffea00010e4600 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff888043919320 pfn:0x43918 head:ffffea00010e4600 order:2 compound_mapcount:0 compound_pincount:0 flags: 0x4fff80000010200(slab|head|node=1|zone=1|lastcpupid=0xfff) raw: 04fff80000010200 0000000000000000 dead000000000122 ffff88807f0eadc0 raw: ffff888043919320 0000000080140010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff88804391b200: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88804391b280: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88804391b300: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^ ffff88804391b380: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88804391b400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ==================================================================
The test fuzzes an rmap btree block and starts writer threads to induce a filesystem shutdown on the corrupt block. When the filesystem is remounted, recovery will try to replay the committed rmap intent item, but the corruption problem causes the recovery transaction to fail. Cancelling the transaction frees the RUD, which frees the RUI that we recovered.
When we return to xlog_recover_process_intents, @lip is now a dangling pointer, and we cannot use it to find the iop_recover method for the tracepoint. Hence we must store the item ops before calling ->iop_recover if we want to give it to the tracepoint so that the trace data will tell us exactly which intent item failed.
Signed-off-by: Darrick J. Wong djwong@kernel.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_log_recover.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 04961ebf16ea..3d844a250b71 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2560,6 +2560,7 @@ xlog_recover_process_intents( for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { + const struct xfs_item_ops *ops; /* * We're done when we see something other than an intent. * There should be no intents left in the AIL now. @@ -2584,13 +2585,17 @@ xlog_recover_process_intents( * deferred ops, you /must/ attach them to the capture list in * the recover routine or else those subsequent intents will be * replayed in the wrong order! + * + * The recovery function can free the log item, so we must not + * access lip after it returns. */ spin_unlock(&ailp->ail_lock); - error = lip->li_ops->iop_recover(lip, &capture_list); + ops = lip->li_ops; + error = ops->iop_recover(lip, &capture_list); spin_lock(&ailp->ail_lock); if (error) { trace_xlog_intent_recovery_failed(log->l_mp, error, - lip->li_ops->iop_recover); + ops->iop_recover); break; } }
From: Guo Xuenan guoxuenan@huawei.com
[ Upstream commit 13cf24e00665c9751951a422756d975812b71173 ]
For leaf dir, In most cases, there should be as many bestfree slots as the dir data blocks that can fit under i_size (except for [1]).
Root cause is we don't examin the number bestfree slots, when the slots number less than dir data blocks, if we need to allocate new dir data block and update the bestfree array, we will use the dir block number as index to assign bestfree array, while we did not check the leaf buf boundary which may cause UAF or other memory access problem. This issue can also triggered with test cases xfs/473 from fstests.
According to Dave Chinner & Darrick's suggestion, adding buffer verifier to detect this abnormal situation in time. Simplify the testcase for fstest xfs/554 [1]
The error log is shown as follows: ================================================================== BUG: KASAN: use-after-free in xfs_dir2_leaf_addname+0x1995/0x1ac0 Write of size 2 at addr ffff88810168b000 by task touch/1552 CPU: 5 PID: 1552 Comm: touch Not tainted 6.0.0-rc3+ #101 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: <TASK> dump_stack_lvl+0x4d/0x66 print_report.cold+0xf6/0x691 kasan_report+0xa8/0x120 xfs_dir2_leaf_addname+0x1995/0x1ac0 xfs_dir_createname+0x58c/0x7f0 xfs_create+0x7af/0x1010 xfs_generic_create+0x270/0x5e0 path_openat+0x270b/0x3450 do_filp_open+0x1cf/0x2b0 do_sys_openat2+0x46b/0x7a0 do_sys_open+0xb7/0x130 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fe4d9e9312b Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 RSP: 002b:00007ffda4c16c20 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe4d9e9312b RDX: 0000000000000941 RSI: 00007ffda4c17f33 RDI: 00000000ffffff9c RBP: 00007ffda4c17f33 R08: 0000000000000000 R09: 0000000000000000 R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000941 R13: 00007fe4d9f631a4 R14: 00007ffda4c17f33 R15: 0000000000000000 </TASK>
The buggy address belongs to the physical page: page:ffffea000405a2c0 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10168b flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff) raw: 002fffff80000000 ffffea0004057788 ffffea000402dbc8 0000000000000000 raw: 0000000000000000 0000000000170000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff88810168af00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff88810168af80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
ffff88810168b000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
^ ffff88810168b080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88810168b100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== Disabling lock debugging due to kernel taint 00000000: 58 44 44 33 5b 53 35 c2 00 00 00 00 00 00 00 78 XDD3[S5........x XFS (sdb): Internal error xfs_dir2_data_use_free at line 1200 of file fs/xfs/libxfs/xfs_dir2_data.c. Caller xfs_dir2_data_use_free+0x28a/0xeb0 CPU: 5 PID: 1552 Comm: touch Tainted: G B 6.0.0-rc3+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: <TASK> dump_stack_lvl+0x4d/0x66 xfs_corruption_error+0x132/0x150 xfs_dir2_data_use_free+0x198/0xeb0 xfs_dir2_leaf_addname+0xa59/0x1ac0 xfs_dir_createname+0x58c/0x7f0 xfs_create+0x7af/0x1010 xfs_generic_create+0x270/0x5e0 path_openat+0x270b/0x3450 do_filp_open+0x1cf/0x2b0 do_sys_openat2+0x46b/0x7a0 do_sys_open+0xb7/0x130 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fe4d9e9312b Code: 25 00 00 41 00 3d 00 00 41 00 74 4b 64 8b 04 25 18 00 00 00 85 c0 75 67 44 89 e2 48 89 ee bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 91 00 00 00 48 8b 4c 24 28 64 48 33 0c 25 RSP: 002b:00007ffda4c16c20 EFLAGS: 00000246 ORIG_RAX: 0000000000000101 RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fe4d9e9312b RDX: 0000000000000941 RSI: 00007ffda4c17f46 RDI: 00000000ffffff9c RBP: 00007ffda4c17f46 R08: 0000000000000000 R09: 0000000000000001 R10: 00000000000001b6 R11: 0000000000000246 R12: 0000000000000941 R13: 00007fe4d9f631a4 R14: 00007ffda4c17f46 R15: 0000000000000000 </TASK> XFS (sdb): Corruption detected. Unmount and run xfs_repair
[1] https://lore.kernel.org/all/20220928095355.2074025-1-guoxuenan@huawei.com/ Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Guo Xuenan guoxuenan@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/libxfs/xfs_dir2_leaf.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index d9b66306a9a7..cb9e950a911d 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -146,6 +146,8 @@ xfs_dir3_leaf_check_int( xfs_dir2_leaf_tail_t *ltp; int stale; int i; + bool isleaf1 = (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC);
ltp = xfs_dir2_leaf_tail_p(geo, leaf);
@@ -158,8 +160,7 @@ xfs_dir3_leaf_check_int( return __this_address;
/* Leaves and bests don't overlap in leaf format. */ - if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || - hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + if (isleaf1 && (char *)&hdr->ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) return __this_address;
@@ -175,6 +176,10 @@ xfs_dir3_leaf_check_int( } if (hdr->ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; + if (isleaf1 && xfs_dir2_dataptr_to_db(geo, + be32_to_cpu(hdr->ents[i].address)) >= + be32_to_cpu(ltp->bestcount)) + return __this_address; } if (hdr->stale != stale) return __this_address;
From: Zeng Heng zengheng4@huawei.com
[ Upstream commit cf4f4c12dea7a977a143c8fe5af1740b7f9876f8 ]
When `xfs_sysfs_init` returns failed, `mp->m_errortag` needs to free. Otherwise kmemleak would report memory leak after mounting xfs image:
unreferenced object 0xffff888101364900 (size 192): comm "mount", pid 13099, jiffies 4294915218 (age 335.207s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f08ad25c>] __kmalloc+0x41/0x1b0 [<00000000dca9aeb6>] kmem_alloc+0xfd/0x430 [<0000000040361882>] xfs_errortag_init+0x20/0x110 [<00000000b384a0f6>] xfs_mountfs+0x6ea/0x1a30 [<000000003774395d>] xfs_fs_fill_super+0xe10/0x1a80 [<000000009cf07b6c>] get_tree_bdev+0x3e7/0x700 [<00000000046b5426>] vfs_get_tree+0x8e/0x2e0 [<00000000952ec082>] path_mount+0xf8c/0x1990 [<00000000beb1f838>] do_mount+0xee/0x110 [<000000000e9c41bb>] __x64_sys_mount+0x14b/0x1f0 [<00000000f7bb938e>] do_syscall_64+0x3b/0x90 [<000000003fcd67a9>] entry_SYSCALL_64_after_hwframe+0x63/0xcd
Fixes: c68401011522 ("xfs: expose errortag knobs via sysfs") Signed-off-by: Zeng Heng zengheng4@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_error.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 81c445e9489b..b0ccec92e015 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -224,13 +224,18 @@ int xfs_errortag_init( struct xfs_mount *mp) { + int ret; + mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, KM_MAYFAIL); if (!mp->m_errortag) return -ENOMEM;
- return xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, - &mp->m_kobj, "errortag"); + ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, + &mp->m_kobj, "errortag"); + if (ret) + kmem_free(mp->m_errortag); + return ret; }
void
From: Li Zetao lizetao1@huawei.com
[ Upstream commit d08af40340cad0e025d643c3982781a8f99d5032 ]
kmemleak reported a sequence of memory leaks, and one of them indicated we failed to free a pointer: comm "mount", pid 19610, jiffies 4297086464 (age 60.635s) hex dump (first 8 bytes): 73 64 61 00 81 88 ff ff sda..... backtrace: [<00000000d77f3e04>] kstrdup_const+0x46/0x70 [<00000000e51fa804>] kobject_set_name_vargs+0x2f/0xb0 [<00000000247cd595>] kobject_init_and_add+0xb0/0x120 [<00000000f9139aaf>] xfs_mountfs+0x367/0xfc0 [<00000000250d3caf>] xfs_fs_fill_super+0xa16/0xdc0 [<000000008d873d38>] get_tree_bdev+0x256/0x390 [<000000004881f3fa>] vfs_get_tree+0x41/0xf0 [<000000008291ab52>] path_mount+0x9b3/0xdd0 [<0000000022ba8f2d>] __x64_sys_mount+0x190/0x1d0
As mentioned in kobject_init_and_add() comment, if this function returns an error, kobject_put() must be called to properly clean up the memory associated with the object. Apparently, xfs_sysfs_init() does not follow such a requirement. When kobject_init_and_add() returns an error, the space of kobj->kobject.name alloced by kstrdup_const() is unfree, which will cause the above stack.
Fix it by adding kobject_put() when kobject_init_and_add returns an error.
Fixes: a31b1d3d89e4 ("xfs: add xfs_mount sysfs kobject") Signed-off-by: Li Zetao lizetao1@huawei.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Leah Rumancik leah.rumancik@gmail.com Acked-by: Chandan Babu R chandanbabu@kernel.org --- fs/xfs/xfs_sysfs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 43585850f154..513095e353a5 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -33,10 +33,15 @@ xfs_sysfs_init( const char *name) { struct kobject *parent; + int err;
parent = parent_kobj ? &parent_kobj->kobject : NULL; init_completion(&kobj->complete); - return kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + err = kobject_init_and_add(&kobj->kobject, ktype, parent, "%s", name); + if (err) + kobject_put(&kobj->kobject); + + return err; }
static inline void
linux-stable-mirror@lists.linaro.org