From: Darrick J. Wong darrick.wong@oracle.com
commit 5a9d929d6e13278df62bd9e3d3ceae8c87ad1eea upstream.
If two programs simultaneously try to write to the same part of a file via direct IO and buffered IO, there's a chance that the post-diowrite pagecache invalidation will fail on the dirty page. When this happens, the dio write succeeded, which means that the page cache is no longer coherent with the disk!
Programs are not supposed to mix IO types and this is a clear case of data corruption, so store an EIO which will be reflected to userspace during the next fsync. Replace the WARN_ON with a ratelimited pr_crit so that the developers have /some/ kind of breadcrumb to track down the offending program(s) and file(s) involved.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Liu Bo bo.li.liu@oracle.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Zubin Mithra zsm@chromium.org
--- fs/direct-io.c | 24 +++++++++++++++++++++++- fs/iomap.c | 12 ++++++++++-- include/linux/fs.h | 1 + 3 files changed, 34 insertions(+), 3 deletions(-)
--- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -219,6 +219,27 @@ static inline struct page *dio_get_page( return dio->pages[sdio->head]; }
+/* + * Warn about a page cache invalidation failure during a direct io write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + /** * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation @@ -290,7 +311,8 @@ static ssize_t dio_complete(struct dio * err = invalidate_inode_pages2_range(dio->inode->i_mapping, offset >> PAGE_SHIFT, (offset + ret - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(err); + if (err) + dio_warn_stale_pagecache(dio->iocb->ki_filp); }
if (!(dio->flags & DIO_SKIP_DIO_COUNT)) --- a/fs/iomap.c +++ b/fs/iomap.c @@ -753,7 +753,8 @@ static ssize_t iomap_dio_complete(struct err = invalidate_inode_pages2_range(inode->i_mapping, offset >> PAGE_SHIFT, (offset + dio->size - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(err); + if (err) + dio_warn_stale_pagecache(iocb->ki_filp); }
inode_dio_end(file_inode(iocb->ki_filp)); @@ -1010,9 +1011,16 @@ iomap_dio_rw(struct kiocb *iocb, struct if (ret) goto out_free_dio;
+ /* + * Try to invalidate cache pages for the range we're direct + * writing. If this invalidation fails, tough, the write will + * still work, but racing two incompatible write paths is a + * pretty crazy thing to do, so we don't support it 100%. + */ ret = invalidate_inode_pages2_range(mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); + if (ret) + dio_warn_stale_pagecache(iocb->ki_filp); ret = 0;
if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion && --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2965,6 +2965,7 @@ enum { };
void dio_end_io(struct bio *bio); +void dio_warn_stale_pagecache(struct file *filp);
ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter,