Syzbot reported a BUG_ON:
==================================================================
EXT4-fs (loop0): mounted filesystem without journal. Quota mode: none.
EXT4-fs error (device loop0): ext4_mb_generate_buddy:1098: group 0, block
bitmap and bg descriptor inconsistent: 25 vs 150994969 free clusters
------------[ cut here ]------------
kernel BUG at fs/ext4/ext4_jbd2.c:53!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 494 Comm: syz-executor.0 6.1.0-rc7-syzkaller-ga4412fdd49dc #0
RIP: 0010:__ext4_journal_stop+0x1b3/0x1c0
[...]
Call Trace:
ext4_write_inline_data_end+0xa39/0xdf0
ext4_da_write_end+0x1e2/0x950
generic_perform_write+0x401/0x5f0
ext4_buffered_write_iter+0x35f/0x640
ext4_file_write_iter+0x198/0x1cd0
vfs_write+0x8b5/0xef0
[...]
==================================================================
The above BUG_ON is triggered by the following race:
                cpu1                               cpu2
________________________|________________________
ksys_write
 vfs_write
  new_sync_write
   ext4_file_write_iter
    ext4_buffered_write_iter
     generic_perform_write
      ext4_da_write_begin
                                do_fault
                                 do_page_mkwrite
                                  ext4_page_mkwrite
                                   ext4_convert_inline_data
                                    ext4_convert_inline_data_nolock
                                     ext4_destroy_inline_data_nolock
                                      // clear EXT4_STATE_MAY_INLINE_DATA
                                     ext4_map_blocks --> return error
       ext4_test_inode_state(inode,
                EXT4_STATE_MAY_INLINE_DATA)
       ext4_block_write_begin
                                     ext4_restore_inline_data
                                      // set EXT4_STATE_MAY_INLINE_DATA
      ext4_da_write_end
       ext4_test_inode_state(inode,
                EXT4_STATE_MAY_INLINE_DATA)
       ext4_write_inline_data_end
        handle=NULL
        ext4_journal_stop(handle)
         __ext4_journal_stop
          ext4_put_nojournal(handle)
           ref_cnt = (unsigned long)handle
           BUG_ON(ref_cnt == 0) ---> BUG_ON
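For context, on a filesystem mounted without a journal the "handle" is not
a real transaction handle at all: ext4_get_nojournal()/ext4_put_nojournal()
just encode a small reference count in the pointer value, so stopping a
NULL handle hits the BUG_ON at fs/ext4/ext4_jbd2.c:53. A simplified sketch
from my reading of that file (not part of this patch):
static void ext4_put_nojournal(handle_t *handle)
{
	unsigned long ref_cnt = (unsigned long)handle;

	/* A NULL handle means ref_cnt == 0, which is exactly this report. */
	BUG_ON(ref_cnt == 0);

	ref_cnt--;
	handle = (handle_t *)ref_cnt;

	current->journal_info = handle;
}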
The root cause is that ext4_convert_inline_data() in ext4_page_mkwrite()
is called without i_rwsem, so it can race with ext4_buffered_write_iter()
and leave write_begin() and write_end() with inconsistent views of the
inline data state, triggering the BUG_ON.
We cannot fix this by simply taking inode_lock in ext4_page_mkwrite():
that would not only hurt performance but also introduce an ABBA deadlock
(see Link). Instead, move ext4_convert_inline_data() to ext4_file_mmap()
and do the conversion under i_rwsem, and only when the inline_data feature
is enabled and a writeable file is mmapped in shared mode. This avoids
both problems.
Link: https://lore.kernel.org/r/20230530102804.6t7np7om6tczscuo@quack3/
Reported-by: Jun Nie <jun.nie(a)linaro.org>
Closes: https://lore.kernel.org/lkml/63903521.5040307@huawei.com/t/
Reported-by: syzbot+a158d886ca08a3fecca4(a)syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?id=899b37f20ce4072bcdfecfe1647b39602e956e…
Fixes: 7b4cc9787fe3 ("ext4: evict inline data when writing to memory map")
CC: stable(a)vger.kernel.org # 4.12+
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
fs/ext4/file.c | 24 +++++++++++++++++++++++-
fs/ext4/inode.c | 4 ----
2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d101b3b0c7da..9df82d72eb90 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -795,7 +795,8 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct dax_device *dax_dev = sbi->s_daxdev;
if (unlikely(ext4_forced_shutdown(sbi)))
@@ -808,6 +809,27 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (!daxdev_mapping_supported(vma, dax_dev))
return -EOPNOTSUPP;
+ /*
+ * Writing via mmap has no logic to handle inline data, so we
+ * need to call ext4_convert_inline_data() to convert the inode
+ * to normal format before doing so, otherwise a BUG_ON will be
+ * triggered in ext4_writepages() due to the
+ * EXT4_STATE_MAY_INLINE_DATA flag. Moreover, we need to grab
+ * i_rwsem during conversion, since clearing and setting the
+ * inline data flag may race with ext4_buffered_write_iter()
+ * to trigger a BUG_ON.
+ */
+ if (ext4_has_feature_inline_data(sb) &&
+ vma->vm_flags & VM_SHARED && vma->vm_flags & VM_MAYWRITE) {
+ int err;
+
+ inode_lock(inode);
+ err = ext4_convert_inline_data(inode);
+ inode_unlock(inode);
+ if (err)
+ return err;
+ }
+
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ce5f21b6c2b3..31844c4ec9fe 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6043,10 +6043,6 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
filemap_invalidate_lock_shared(mapping);
- err = ext4_convert_inline_data(inode);
- if (err)
- goto out_ret;
-
/*
* On data journalling we skip straight to the transaction handle:
* there's no delalloc; page truncated will be checked later; the
--
2.31.1
Hi Linus
sorry for the unusual procedure of requesting a patch to be pulled myself.
For several months I have asked the maintainers (David: asymmetric keys,
Jarkko: keys subsystem) to pick up my patch, but without any luck.
I signed the tag, but that probably does not matter, since my key is not
among your trusted keys.
The following changes since commit 921bdc72a0d68977092d6a64855a1b8967acc1d9:
Merge tag 'mmc-v6.4-rc1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc (2023-06-02 08:35:13 -0400)
are available in the Git repository at:
https://github.com/robertosassu/linux.git tags/asym-keys-fix-for-linus-v6.4-rc5
for you to fetch changes up to c3d03e8e35e005e1a614e51bb59053eeb5857f76:
KEYS: asymmetric: Copy sig and digest in public_key_verify_signature() (2023-06-02 15:36:23 +0200)
----------------------------------------------------------------
Asymmetric keys fix for v6.4-rc5
Here is a small fix that makes an unconditional copy of the buffers passed
to crypto operations, to handle the case where the stack is not in the
linear mapping area.
It has been tested and verified to fix the bug.
Signed-off-by: Roberto Sassu <roberto.sassu(a)huawei.com>
----------------------------------------------------------------
Roberto Sassu (1):
KEYS: asymmetric: Copy sig and digest in public_key_verify_signature()
crypto/asymmetric_keys/public_key.c | 38 +++++++++++++++++++++-----------------
1 file changed, 21 insertions(+), 17 deletions(-)
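For reference, a minimal sketch of the approach described above (my own
illustration, not the upstream hunk; the helper name copy_for_sg() is
hypothetical): sg_set_buf()/virt_to_page() need linearly-mapped memory, so
a caller buffer that may live on a CONFIG_VMAP_STACK stack is duplicated
with kmemdup() before being attached to a scatterlist.
#include <linux/scatterlist.h>
#include <linux/slab.h>

/* Copy a caller buffer (possibly on a vmalloc'ed stack) into the linear
 * mapping before building a scatterlist entry for it.
 */
static void *copy_for_sg(struct scatterlist *sg, const void *src, size_t len)
{
	void *buf = kmemdup(src, len, GFP_KERNEL);

	if (buf)
		sg_init_one(sg, buf, len);
	return buf;	/* caller must kfree() after the crypto operation */
}
In public_key_verify_signature() this would be applied to both sig->s and
sig->digest before the verification request is set up.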
NULL the dangling pipe reference while clearing watch_queue.
If not done, a reference to a freed pipe remains in the watch_queue, since
watch_queue_clear() is called before the pipe is freed in free_pipe_info()
(see line 834 of fs/pipe.c).
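For illustration, a condensed sketch of that ordering in fs/pipe.c (not the
verbatim source):
void free_pipe_info(struct pipe_inode_info *pipe)
{
#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		watch_queue_clear(pipe->watch_queue); /* must drop wqueue->pipe here */
#endif
	/* ... release pipe buffers and the watch_queue reference ... */
	kfree(pipe);	/* any remaining wqueue->pipe pointer now dangles */
}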
The sole use of wqueue->defunct is to check whether the watch queue has
been cleared, but wqueue->pipe is also set to NULL while clearing.
Checking wqueue->pipe for NULL therefore makes wqueue->defunct superfluous,
so it can be removed.
Tested with keyutils testsuite.
Cc: stable(a)vger.kernel.org # 6.1
Signed-off-by: Siddh Raman Pant <code(a)siddh.me>
---
Changes in v5:
- Rebased to latest mainline.
- Added Cc to stable.
- Specify tests passing. Note that all tests in the keyutils testsuite
passed, except tests/features/builtin_trusted, which we should not
worry about as it requires some kernel preparation according to
David Howells in v4 discussion.
Changes in v4 (11 Jan 2023):
- Drop preceding kerneldoc-changes patch and change appropriately.
Changes in v3 (8 Jan 2023):
- Minor rephrase of comment before NULLing in watch_queue_clear().
Changes in v2 (6 Aug 2022):
- Merged the NULLing and removing defunct patches.
- Removed READ_ONCE barrier in lock_wqueue().
- Better commit messages.
include/linux/watch_queue.h | 3 +--
kernel/watch_queue.c | 12 ++++++------
2 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
index fc6bba20273b..45cd42f55d49 100644
--- a/include/linux/watch_queue.h
+++ b/include/linux/watch_queue.h
@@ -38,7 +38,7 @@ struct watch_filter {
struct watch_queue {
struct rcu_head rcu;
struct watch_filter __rcu *filter;
- struct pipe_inode_info *pipe; /* The pipe we're using as a buffer */
+ struct pipe_inode_info *pipe; /* Pipe we use as a buffer, NULL if queue closed */
struct hlist_head watches; /* Contributory watches */
struct page **notes; /* Preallocated notifications */
unsigned long *notes_bitmap; /* Allocation bitmap for notes */
@@ -46,7 +46,6 @@ struct watch_queue {
spinlock_t lock;
unsigned int nr_notes; /* Number of notes */
unsigned int nr_pages; /* Number of pages in notes[] */
- bool defunct; /* T when queues closed */
};
/*
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index e91cb4c2833f..d0b6b390ee42 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -42,7 +42,7 @@ MODULE_AUTHOR("Red Hat, Inc.");
static inline bool lock_wqueue(struct watch_queue *wqueue)
{
spin_lock_bh(&wqueue->lock);
- if (unlikely(wqueue->defunct)) {
+ if (unlikely(!wqueue->pipe)) {
spin_unlock_bh(&wqueue->lock);
return false;
}
@@ -104,9 +104,6 @@ static bool post_one_notification(struct watch_queue *wqueue,
unsigned int head, tail, mask, note, offset, len;
bool done = false;
- if (!pipe)
- return false;
-
spin_lock_irq(&pipe->rd_wait.lock);
mask = pipe->ring_size - 1;
@@ -603,8 +600,11 @@ void watch_queue_clear(struct watch_queue *wqueue)
rcu_read_lock();
spin_lock_bh(&wqueue->lock);
- /* Prevent new notifications from being stored. */
- wqueue->defunct = true;
+ /*
+ * This pipe can be freed by callers like free_pipe_info().
+ * Removing this reference also prevents new notifications.
+ */
+ wqueue->pipe = NULL;
while (!hlist_empty(&wqueue->watches)) {
watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
--
2.39.2
Hello, the following patch cherry-picked cleanly during my testing.
Subject of Patch: xfs: verify buffer contents when we skip log replay
Commit Hash: 22ed903eee23a5b174e240f1cdfa9acf393a5210
Reason why it should be applied: This fixes CVE-2023-2124.
Kernel Versions to be applied to: 6.1, 5.15, 5.10
Thank You,
Michael Kochera
From: Xiubo Li <xiubli(a)redhat.com>
Blindly expanding the readahead window causes unnecessary pagecache
thrashing and adds network load. Do not expand the window when readahead
is disabled, and do not expand it too far.
Expand forward first, rather than backward, to favour possible sequential
reads.
Bound `rreq->len` to the actual file size to restore the previous page
cache usage.
Note that posix_fadvise() may change a file's maximum readahead size.
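As a userspace illustration of that last point (my example, not part of
the patch; the mount path is hypothetical), an application can disable or
restore per-file readahead with posix_fadvise(), and the expansion logic
below honours the result:
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/cephfs/data", O_RDONLY);

	if (fd < 0)
		return 1;

	/* Random access: the kernel stops doing readahead for this file,
	 * so ceph_netfs_expand_readahead() must not expand the request. */
	posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);

	/* Sequential access: the readahead window is restored/enlarged,
	 * and expansion is capped by it rather than by the stripe size. */
	posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);

	close(fd);
	return 0;
}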
Cc: stable(a)vger.kernel.org
Fixes: 49870056005c ("ceph: convert ceph_readpages to ceph_readahead")
URL: https://lore.kernel.org/ceph-devel/20230504082510.247-1-sehuww@mail.scut.ed…
URL: https://www.spinics.net/lists/ceph-users/msg76183.html
Cc: Hu Weiwen <sehuww(a)mail.scut.edu.cn>
Reviewed-by: Hu Weiwen <sehuww(a)mail.scut.edu.cn>
Tested-by: Hu Weiwen <sehuww(a)mail.scut.edu.cn>
Signed-off-by: Xiubo Li <xiubli(a)redhat.com>
---
fs/ceph/addr.c | 40 +++++++++++++++++++++++++++++++++-------
1 file changed, 33 insertions(+), 7 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 93fff1a7373f..0c4fb3d23078 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -188,16 +188,42 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_layout *lo = &ci->i_layout;
+ unsigned long max_pages = inode->i_sb->s_bdi->ra_pages;
+ loff_t end = rreq->start + rreq->len, new_end;
+ struct ceph_netfs_request_data *priv = rreq->netfs_priv;
+ unsigned long max_len;
u32 blockoff;
- u64 blockno;
- /* Expand the start downward */
- blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
- rreq->start = blockno * lo->stripe_unit;
- rreq->len += blockoff;
+ if (priv) {
+ /* Readahead is disabled by posix_fadvise POSIX_FADV_RANDOM */
+ if (priv->file_ra_disabled)
+ max_pages = 0;
+ else
+ max_pages = priv->file_ra_pages;
+
+ }
+
+ /* Readahead is disabled */
+ if (!max_pages)
+ return;
- /* Now, round up the length to the next block */
- rreq->len = roundup(rreq->len, lo->stripe_unit);
+ max_len = max_pages << PAGE_SHIFT;
+
+ /*
+ * Try to expand the length forward by rounding up it to the next
+ * block, but do not exceed the file size, unless the original
+ * request already exceeds it.
+ */
+ new_end = min(round_up(end, lo->stripe_unit), rreq->i_size);
+ if (new_end > end && new_end <= rreq->start + max_len)
+ rreq->len = new_end - rreq->start;
+
+ /* Try to expand the start downward */
+ div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
+ if (rreq->len + blockoff <= max_len) {
+ rreq->start -= blockoff;
+ rreq->len += blockoff;
+ }
}
static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
--
2.40.1