Hi Greg,
This 5.4.y backport series contains XFS fixes from v5.6. The patchset has been acked by Darrick.
Christoph Hellwig (3): xfs: fix misuse of the XFS_ATTR_INCOMPLETE flag xfs: fix IOCB_NOWAIT handling in xfs_file_dio_aio_read xfs: move incore structures out of xfs_da_format.h
Darrick J. Wong (7): xfs: introduce XFS_MAX_FILEOFF xfs: truncate should remove all blocks, not just to the end of the page cache xfs: fix s_maxbytes computation on 32-bit kernels xfs: refactor remote attr value buffer invalidation xfs: fix memory corruption during remote attr value buffer invalidation xfs: streamline xfs_attr3_leaf_inactive xfs: fix uninitialized variable in xfs_attr3_leaf_inactive
YueHaibing (1): xfs: remove unused variable 'done'
fs/xfs/libxfs/xfs_attr.c | 2 +- fs/xfs/libxfs/xfs_attr_leaf.c | 4 +- fs/xfs/libxfs/xfs_attr_leaf.h | 26 ++++-- fs/xfs/libxfs/xfs_attr_remote.c | 85 +++++++++++++------ fs/xfs/libxfs/xfs_attr_remote.h | 2 + fs/xfs/libxfs/xfs_da_btree.h | 17 +++- fs/xfs/libxfs/xfs_da_format.c | 1 + fs/xfs/libxfs/xfs_da_format.h | 59 ------------- fs/xfs/libxfs/xfs_dir2.h | 2 + fs/xfs/libxfs/xfs_dir2_priv.h | 19 +++++ fs/xfs/libxfs/xfs_format.h | 7 ++ fs/xfs/xfs_attr_inactive.c | 146 +++++++++----------------------- fs/xfs/xfs_file.c | 7 +- fs/xfs/xfs_inode.c | 25 +++--- fs/xfs/xfs_reflink.c | 3 +- fs/xfs/xfs_super.c | 48 +++++------ 16 files changed, 212 insertions(+), 241 deletions(-)
From: Christoph Hellwig hch@lst.de
commit 780d29057781d986cd87dbbe232cd02876ad430f upstream.
XFS_ATTR_INCOMPLETE is a flag in the on-disk attribute format, and thus in a different namespace as the ATTR_* flags in xfs_da_args.flags. Switch to using a XFS_DA_OP_INCOMPLETE flag in op_flags instead. Without this users might be able to inject this flag into operations using the attr by handle ioctl.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/libxfs/xfs_attr.c | 2 +- fs/xfs/libxfs/xfs_attr_leaf.c | 4 ++-- fs/xfs/libxfs/xfs_da_btree.h | 4 +++- fs/xfs/libxfs/xfs_da_format.h | 2 -- 4 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 510ca6974604..c83ff610ecb6 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1007,7 +1007,7 @@ xfs_attr_node_addname( * The INCOMPLETE flag means that we will find the "old" * attr, not the "new" one. */ - args->flags |= XFS_ATTR_INCOMPLETE; + args->op_flags |= XFS_DA_OP_INCOMPLETE; state = xfs_da_state_alloc(); state->args = args; state->mp = mp; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 0c23127347ac..c86ddbf6d105 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2345,8 +2345,8 @@ xfs_attr3_leaf_lookup_int( * If we are looking for INCOMPLETE entries, show only those. * If we are looking for complete entries, show only those. */ - if ((args->flags & XFS_ATTR_INCOMPLETE) != - (entry->flags & XFS_ATTR_INCOMPLETE)) { + if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != + !!(entry->flags & XFS_ATTR_INCOMPLETE)) { continue; } if (entry->flags & XFS_ATTR_LOCAL) { diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ae0bbd20d9ca..eebbc66f4c05 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -82,6 +82,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ #define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ +#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */
#define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -89,7 +90,8 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" } + { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ + { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" }
/* * Storage for holding state during Btree searches and split/join ops. diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index ae654e06b2fb..cda10902df1e 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -740,8 +740,6 @@ struct xfs_attr3_icleaf_hdr {
/* * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. */ #define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ #define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
From: "Darrick J. Wong" darrick.wong@oracle.com
commit a5084865524dee1fe8ea1fee17c60b4369ad4f5e upstream.
Introduce a new #define for the maximum supported file block offset. We'll use this in the next patch to make it more obvious that we're doing some operation for all possible inode fork mappings after a given offset. We can't use ULLONG_MAX here because bunmapi uses that to detect when it's done.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/libxfs/xfs_format.h | 7 +++++++ fs/xfs/xfs_reflink.c | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index c968b60cee15..28203b626f6a 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1540,6 +1540,13 @@ typedef struct xfs_bmdr_block { #define BMBT_BLOCKCOUNT_BITLEN 21
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1) +#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1) + +/* + * bmbt records have a file offset (block) field that is 54 bits wide, so this + * is the largest xfs_fileoff_t that we ever expect to see. + */ +#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK)
typedef struct xfs_bmbt_rec { __be64 l0, l1; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 904d8285c226..dfbf3f8f1ec8 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1544,7 +1544,8 @@ xfs_reflink_clear_inode_flag( * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, + true); if (error) return error;
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 4bbb04abb4ee2e1f7d65e52557ba1c4038ea43ed upstream.
xfs_itruncate_extents_flags() is supposed to unmap every block in a file from EOF onwards. Oddly, it uses s_maxbytes as the upper limit to the bunmapi range, even though s_maxbytes reflects the highest offset the pagecache can support, not the highest offset that XFS supports.
The result of this confusion is that if you create a 20T file on a 64-bit machine, mount the filesystem on a 32-bit machine, and remove the file, we leak everything above 16T. Fix this by capping the bunmapi request at the maximum possible block offset, not s_maxbytes.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/xfs_inode.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7b72c189cff0..d4af6e44dd6f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1513,7 +1513,6 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; xfs_filblks_t unmap_len; int error = 0; int done = 0; @@ -1536,21 +1535,22 @@ xfs_itruncate_extents_flags( * the end of the file (in a crash where the space is allocated * but the inode size is not yet updated), simply remove any * blocks which show up between the new EOF and the maximum - * possible file size. If the first block to be removed is - * beyond the maximum file size (ie it is the same as last_block), - * then there is nothing to do. + * possible file size. + * + * We have to free all the blocks to the bmbt maximum offset, even if + * the page cache can't scale that far. */ first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - if (first_unmap_block == last_block) + if (first_unmap_block >= XFS_MAX_FILEOFF) { + WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF); return 0; + }
- ASSERT(first_unmap_block < last_block); - unmap_len = last_block - first_unmap_block + 1; - while (!done) { + unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; + while (unmap_len > 0) { ASSERT(tp->t_firstblock == NULLFSBLOCK); - error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, - XFS_ITRUNC_MAX_EXTENTS, &done); + error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, + flags, XFS_ITRUNC_MAX_EXTENTS); if (error) goto out;
@@ -1570,7 +1570,7 @@ xfs_itruncate_extents_flags( if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, - first_unmap_block, last_block, true); + first_unmap_block, XFS_MAX_FILEOFF, true); if (error) goto out;
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 932befe39ddea29cf47f4f1dc080d3dba668f0ca upstream.
I observed a hang in generic/308 while running fstests on a i686 kernel. The hang occurred when trying to purge the pagecache on a large sparse file that had a page created past MAX_LFS_FILESIZE, which caused an integer overflow in the pagecache xarray and resulted in an infinite loop.
I then noticed that Linus changed the definition of MAX_LFS_FILESIZE in commit 0cc3b0ec23ce ("Clarify (and fix) MAX_LFS_FILESIZE macros") so that it is now one page short of the maximum page index on 32-bit kernels. Because the XFS function to compute max offset open-codes the 2005-era MAX_LFS_FILESIZE computation and neither the vfs nor mm perform any sanity checking of s_maxbytes, the code in generic/308 can create a page above the pagecache's limit and kaboom.
Fix all this by setting s_maxbytes to MAX_LFS_FILESIZE directly and aborting the mount with a warning if our assumptions ever break. I have no answer for why this seems to have been broken for years and nobody noticed.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/xfs_super.c | 48 ++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 27 deletions(-)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8d1df9f8be07..a3a54a0fbffe 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -512,32 +512,6 @@ xfs_showargs( seq_puts(m, ",noquota"); }
-static uint64_t -xfs_max_file_offset( - unsigned int blockshift) -{ - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; - - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long long... - * page->index << (PAGE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). - */ - -#if BITS_PER_LONG == 32 - ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_SIZE; - bitshift = BITS_PER_LONG; -#endif - - return (((uint64_t)pagefactor) << bitshift) - 1; -} - /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically @@ -1650,6 +1624,26 @@ xfs_fs_fill_super( if (error) goto out_free_sb;
+ /* + * XFS block mappings use 54 bits to store the logical block offset. + * This should suffice to handle the maximum file size that the VFS + * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT + * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes + * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON + * to check this assertion. + * + * Avoid integer overflow by comparing the maximum bmbt offset to the + * maximum pagecache offset in units of fs blocks. + */ + if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) { + xfs_warn(mp, +"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!", + XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE), + XFS_MAX_FILEOFF); + error = -EINVAL; + goto out_free_sb; + } + error = xfs_filestream_mount(mp); if (error) goto out_free_sb; @@ -1661,7 +1655,7 @@ xfs_fs_fill_super( sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; sb->s_time_min = S32_MIN;
From: Christoph Hellwig hch@lst.de
commit 7b53b868a1812a9a6ab5e69249394bd37f29ce2c upstream.
Direct I/O reads can also be used with RWF_NOWAIT & co. Fix the inode locking in xfs_file_dio_aio_read to take IOCB_NOWAIT into account.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Carlos Maiolino cmaiolino@redhat.com Reviewed-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/xfs_file.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 203065a64765..e41c13ffa5a4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -187,7 +187,12 @@ xfs_file_dio_aio_read(
file_accessed(iocb->ki_filp);
- xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + return -EAGAIN; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + } ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); xfs_iunlock(ip, XFS_IOLOCK_SHARED);
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 8edbb26b06023de31ad7d4c9b984d99f66577929 upstream.
[Replaced XFS_IS_CORRUPT() calls with ASSERT() for 5.4.y backport]
Hoist the code that invalidates remote extended attribute value buffers into a separate helper function. This prepares us for a memory corruption fix in the next patch.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/libxfs/xfs_attr_remote.c | 48 ++++++++++++++++++++------------- fs/xfs/libxfs/xfs_attr_remote.h | 2 ++ 2 files changed, 31 insertions(+), 19 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 3e39b7d40f25..4e5579edcf8c 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -551,6 +551,32 @@ xfs_attr_rmtval_set( return 0; }
+/* Mark stale any incore buffers for the remote value. */ +int +xfs_attr_rmtval_stale( + struct xfs_inode *ip, + struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + ASSERT((map->br_startblock != DELAYSTARTBLOCK) && + (map->br_startblock != HOLESTARTBLOCK)); + + bp = xfs_buf_incore(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map->br_startblock), + XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + } + + return 0; +} + /* * Remove the value associated with an attribute by deleting the * out-of-line buffer that it is stored on. @@ -559,7 +585,6 @@ int xfs_attr_rmtval_remove( struct xfs_da_args *args) { - struct xfs_mount *mp = args->dp->i_mount; xfs_dablk_t lblkno; int blkcnt; int error; @@ -574,9 +599,6 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; while (blkcnt > 0) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; - int dblkcnt; int nmap;
/* @@ -588,21 +610,9 @@ xfs_attr_rmtval_remove( if (error) return error; ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); - bp = NULL; - } + error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); + if (error) + return error;
lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 9d20b66ad379..6fb4572845ce 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); +int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, + xfs_buf_flags_t incore_flags);
#endif /* __XFS_ATTR_REMOTE_H__ */
From: "Darrick J. Wong" darrick.wong@oracle.com
commit e8db2aafcedb7d88320ab83f1000f1606b26d4d7 upstream.
[Replaced XFS_IS_CORRUPT() calls with ASSERT() for 5.4.y backport]
While running generic/103, I observed what looks like memory corruption and (with slub debugging turned on) a slub redzone warning on i386 when inactivating an inode with a 64k remote attr value.
On a v5 filesystem, maximally sized remote attr values require one block more than 64k worth of space to hold both the remote attribute value header (64 bytes). On a 4k block filesystem this results in a 68k buffer; on a 64k block filesystem, this would be a 128k buffer. Note that even though we'll never use more than 65,600 bytes of this buffer, XFS_MAX_BLOCKSIZE is 64k.
This is a problem because the definition of struct xfs_buf_log_format allows for XFS_MAX_BLOCKSIZE worth of dirty bitmap (64k). On i386 when we invalidate a remote attribute, xfs_trans_binval zeroes all 68k worth of the dirty map, writing right off the end of the log item and corrupting memory. We've gotten away with this on x86_64 for years because the compiler inserts a u32 padding on the end of struct xfs_buf_log_format.
Fortunately for us, remote attribute values are written to disk with xfs_bwrite(), which is to say that they are not logged. Fix the problem by removing all places where we could end up creating a buffer log item for a remote attribute value and leave a note explaining why. Next, replace the open-coded buffer invalidation with a call to the helper we created in the previous patch that does better checking for bad metadata before marking the buffer stale.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/libxfs/xfs_attr_remote.c | 37 +++++++++++++++++++++----- fs/xfs/xfs_attr_inactive.c | 47 +++++++++------------------------ 2 files changed, 44 insertions(+), 40 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4e5579edcf8c..de9096b8a47c 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -24,6 +24,23 @@
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
+/* + * Remote Attribute Values + * ======================= + * + * Remote extended attribute values are conceptually simple -- they're written + * to data blocks mapped by an inode's attribute fork, and they have an upper + * size limit of 64k. Setting a value does not involve the XFS log. + * + * However, on a v5 filesystem, maximally sized remote attr values require one + * block more than 64k worth of space to hold both the remote attribute value + * header (64 bytes). On a 4k block filesystem this results in a 68k buffer; + * on a 64k block filesystem, this would be a 128k buffer. Note that the log + * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k). + * Therefore, we /must/ ensure that remote attribute value buffers never touch + * the logging system and therefore never have a log item. + */ + /* * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. @@ -400,17 +417,25 @@ xfs_attr_rmtval_get( (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_trans_read_buf(mp, args->trans, - mp->m_ddev_targp, - dblkno, dblkcnt, 0, &bp, - &xfs_attr3_rmt_buf_ops); - if (error) + bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, + &xfs_attr3_rmt_buf_ops); + if (!bp) + return -ENOMEM; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; return error; + }
error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, &offset, &valuelen, &dst); - xfs_trans_brelse(args->trans, bp); + xfs_buf_relse(bp); if (error) return error;
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 766b1386402a..9d5c27db1239 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -25,22 +25,20 @@ #include "xfs_error.h"
/* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. + * Invalidate any incore buffers associated with this remote attribute value + * extent. We never log remote attribute value buffers, which means that they + * won't be attached to a transaction and are therefore safe to mark stale. + * The actual bunmapi will be taken care of later. */ STATIC int -xfs_attr3_leaf_freextent( - struct xfs_trans **trans, +xfs_attr3_rmt_stale( struct xfs_inode *dp, xfs_dablk_t blkno, int blkcnt) { struct xfs_bmbt_irec map; - struct xfs_buf *bp; xfs_dablk_t tblkno; - xfs_daddr_t dblkno; int tblkcnt; - int dblkcnt; int nmap; int error;
@@ -57,35 +55,18 @@ xfs_attr3_leaf_freextent( nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); - if (error) { + if (error) return error; - } ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK);
/* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. + * Mark any incore buffers for the remote value as stale. We + * never log remote attr value buffers, so the buffer should be + * easy to kill. */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, 0); - if (!bp) - return -ENOMEM; - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. - */ - error = xfs_trans_roll_inode(trans, dp); - if (error) - return error; - } + error = xfs_attr_rmtval_stale(dp, &map, 0); + if (error) + return error;
tblkno += map.br_blockcount; tblkcnt -= map.br_blockcount; @@ -174,9 +155,7 @@ xfs_attr3_leaf_inactive( */ error = 0; for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr3_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); - + tmp = xfs_attr3_rmt_stale(dp, lp->valueblk, lp->valuelen); if (error == 0) error = tmp; /* save only the 1st errno */ }
From: Christoph Hellwig hch@lst.de
commit a39f089a25e75c3d17b955d8eb8bc781f23364f3 upstream.
Move the abstract in-memory version of various btree block headers out of xfs_da_format.h as they aren't on-disk formats.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/libxfs/xfs_attr_leaf.h | 23 ++++++++++++++ fs/xfs/libxfs/xfs_da_btree.h | 13 ++++++++ fs/xfs/libxfs/xfs_da_format.c | 1 + fs/xfs/libxfs/xfs_da_format.h | 57 ----------------------------------- fs/xfs/libxfs/xfs_dir2.h | 2 ++ fs/xfs/libxfs/xfs_dir2_priv.h | 19 ++++++++++++ 6 files changed, 58 insertions(+), 57 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 7b74e18becff..23dd84200e09 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -16,6 +16,29 @@ struct xfs_da_state_blk; struct xfs_inode; struct xfs_trans;
+/* + * Incore version of the attribute leaf header. + */ +struct xfs_attr3_icleaf_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t usedbytes; + /* + * Firstused is 32-bit here instead of 16-bit like the on-disk variant + * to support maximum fsb size of 64k without overflow issues throughout + * the attr code. Instead, the overflow condition is handled on + * conversion to/from disk. + */ + uint32_t firstused; + __u8 holes; + struct { + uint16_t base; + uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + /* * Used to keep a list of "remote value" extents when unlinking an inode. */ diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index eebbc66f4c05..588e4674e931 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -126,6 +126,19 @@ typedef struct xfs_da_state { /* for dirv2 extrablk is data */ } xfs_da_state_t;
+/* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t level; +}; + /* * Utility macros to aid in logging changed structure fields. */ diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index b1ae572496b6..31bb250c1899 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -13,6 +13,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_dir2.h" +#include "xfs_dir2_priv.h"
/* * Shortform directory ops diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index cda10902df1e..222ee48da5e8 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -93,19 +93,6 @@ struct xfs_da3_intnode { struct xfs_da_node_entry __btree[]; };
-/* - * In-core version of the node header to abstract the differences in the v2 and - * v3 disk format of the headers. Callers need to convert to/from disk format as - * appropriate. - */ -struct xfs_da3_icnode_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t level; -}; - /* * Directory version 2. * @@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr { __be32 pad; /* 64 bit alignment */ };
-struct xfs_dir3_icleaf_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t stale; -}; - /* * Leaf block entry. */ @@ -520,19 +499,6 @@ struct xfs_dir3_free {
#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
-/* - * In core version of the free block header, abstracted away from on-disk format - * differences. Use this in the code, and convert to/from the disk version using - * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. - */ -struct xfs_dir3_icfree_hdr { - uint32_t magic; - uint32_t firstdb; - uint32_t nvalid; - uint32_t nused; - -}; - /* * Single block format. * @@ -709,29 +675,6 @@ struct xfs_attr3_leafblock { */ };
-/* - * incore, neutral version of the attribute leaf header - */ -struct xfs_attr3_icleaf_hdr { - uint32_t forw; - uint32_t back; - uint16_t magic; - uint16_t count; - uint16_t usedbytes; - /* - * firstused is 32-bit here instead of 16-bit like the on-disk variant - * to support maximum fsb size of 64k without overflow issues throughout - * the attr code. Instead, the overflow condition is handled on - * conversion to/from disk. - */ - uint32_t firstused; - __u8 holes; - struct { - uint16_t base; - uint16_t size; - } freemap[XFS_ATTR_LEAF_MAPSIZE]; -}; - /* * Special value to represent fs block size in the leaf header firstused field. * Only used when block size overflows the 2-bytes available on disk. diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index f54244779492..e170792c0acc 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry; struct xfs_dir2_data_hdr; struct xfs_dir2_data_entry; struct xfs_dir2_data_unused; +struct xfs_dir3_icfree_hdr; +struct xfs_dir3_icleaf_hdr;
extern struct xfs_name xfs_name_dotdot;
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 59f9fb2241a5..d2eaea663e7f 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -8,6 +8,25 @@
struct dir_context;
+/* + * In-core version of the leaf and free block headers to abstract the + * differences in the v2 and v3 disk format of the headers. + */ +struct xfs_dir3_icleaf_hdr { + uint32_t forw; + uint32_t back; + uint16_t magic; + uint16_t count; + uint16_t stale; +}; + +struct xfs_dir3_icfree_hdr { + uint32_t magic; + uint32_t firstdb; + uint32_t nvalid; + uint32_t nused; +}; + /* xfs_dir2.c */ extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp);
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 0bb9d159bd018b271e783d3b2d3bc82fa0727321 upstream.
Now that we know we don't have to take a transaction to stale the incore buffers for a remote value, get rid of the unnecessary memory allocation in the leaf walker and call the rmt_stale function directly. Flatten the loop while we're at it.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/libxfs/xfs_attr_leaf.h | 9 --- fs/xfs/xfs_attr_inactive.c | 101 ++++++++++------------------------ 2 files changed, 29 insertions(+), 81 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 23dd84200e09..38c05d6ae2aa 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -39,15 +39,6 @@ struct xfs_attr3_icleaf_hdr { } freemap[XFS_ATTR_LEAF_MAPSIZE]; };
-/* - * Used to keep a list of "remote value" extents when unlinking an inode. - */ -typedef struct xfs_attr_inactive_list { - xfs_dablk_t valueblk; /* block number of value bytes */ - int valuelen; /* number of bytes in value */ -} xfs_attr_inactive_list_t; - - /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 9d5c27db1239..1f331d51a901 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -37,8 +37,6 @@ xfs_attr3_rmt_stale( int blkcnt) { struct xfs_bmbt_irec map; - xfs_dablk_t tblkno; - int tblkcnt; int nmap; int error;
@@ -46,14 +44,12 @@ xfs_attr3_rmt_stale( * Roll through the "value", invalidating the attribute value's * blocks. */ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { + while (blkcnt > 0) { /* * Try to remember where we decided to put the value. */ nmap = 1; - error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; @@ -68,8 +64,8 @@ xfs_attr3_rmt_stale( if (error) return error;
- tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; + blkno += map.br_blockcount; + blkcnt -= map.br_blockcount; }
return 0; @@ -83,84 +79,45 @@ xfs_attr3_rmt_stale( */ STATIC int xfs_attr3_leaf_inactive( - struct xfs_trans **trans, - struct xfs_inode *dp, - struct xfs_buf *bp) + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) { - struct xfs_attr_leafblock *leaf; - struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_attr_leaf_entry *entry; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_mount *mp = bp->b_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; - struct xfs_attr_inactive_list *list; - struct xfs_attr_inactive_list *lp; - int error; - int count; - int size; - int tmp; - int i; - struct xfs_mount *mp = bp->b_mount; + int error; + int i;
- leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
/* - * Count the number of "remote" value extents. + * Find the remote value extents for this leaf and invalidate their + * incore buffers. */ - count = 0; entry = xfs_attr3_leaf_entryp(leaf); for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) - count++; - } - } - - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_trans_brelse(*trans, bp); - return 0; - } + int blkcnt;
- /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = kmem_alloc(size, 0); + if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL)) + continue;
- /* - * Identify each of the "remote" value extents. - */ - lp = list; - entry = xfs_attr3_leaf_entryp(leaf); - for (i = 0; i < ichdr.count; entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = xfs_attr3_leaf_name_remote(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (!name_rmt->valueblk) + continue;
- /* - * Invalidate each of the "remote" value extents. - */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr3_rmt_stale(dp, lp->valueblk, lp->valuelen); - if (error == 0) - error = tmp; /* save only the 1st errno */ + blkcnt = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + error = xfs_attr3_rmt_stale(dp, + be32_to_cpu(name_rmt->valueblk), blkcnt); + if (error) + goto err; }
- kmem_free(list); + xfs_trans_brelse(*trans, bp); +err: return error; }
From: "Darrick J. Wong" darrick.wong@oracle.com
commit 54027a49938bbee1af62fad191139b14d4ee5cd2 upstream.
Dan Carpenter pointed out that error is uninitialized. While there never should be an attr leaf block with zero entries, let's not leave that logic bomb there.
Fixes: 0bb9d159bd01 ("xfs: streamline xfs_attr3_leaf_inactive") Reported-by: Dan Carpenter dan.carpenter@oracle.com Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Allison Collins allison.henderson@oracle.com Reviewed-by: Eric Sandeen sandeen@redhat.com Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/xfs_attr_inactive.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 1f331d51a901..9c88203b537b 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -88,7 +88,7 @@ xfs_attr3_leaf_inactive( struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *entry; struct xfs_attr_leaf_name_remote *name_rmt; - int error; + int error = 0; int i;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
From: YueHaibing yuehaibing@huawei.com
commit b3531f5fc16d4df2b12567bce48cd9f3ab5f9131 upstream.
fs/xfs/xfs_inode.c: In function 'xfs_itruncate_extents_flags': fs/xfs/xfs_inode.c:1523:8: warning: unused variable 'done' [-Wunused-variable]
commit 4bbb04abb4ee ("xfs: truncate should remove all blocks, not just to the end of the page cache") left behind this, so remove it.
Fixes: 4bbb04abb4ee ("xfs: truncate should remove all blocks, not just to the end of the page cache") Reported-by: Hulk Robot hulkci@huawei.com Reported-by: Stephen Rothwell sfr@canb.auug.org.au Signed-off-by: YueHaibing yuehaibing@huawei.com Reviewed-by: Darrick J. Wong darrick.wong@oracle.com Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Acked-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Chandan Babu R chandan.babu@oracle.com --- fs/xfs/xfs_inode.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d4af6e44dd6f..30202d8c25e4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1515,7 +1515,6 @@ xfs_itruncate_extents_flags( xfs_fileoff_t first_unmap_block; xfs_filblks_t unmap_len; int error = 0; - int done = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
linux-stable-mirror@lists.linaro.org