From: Anshuman Khandual anshuman.khandual@arm.com
[ Upstream commit 1e5823c8e86de83a43d59a522b4de29066d3b306 ]
This asserts that HUGE_MAX_HSTATE is sufficient enough preventing potential hugetlb_max_hstate runtime overflow in hugetlb_add_hstate() thus triggering a BUG_ON() there after.
Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Ard Biesheuvel ardb@kernel.org Cc: Ryan Roberts ryan.roberts@arm.com Cc: Mark Rutland mark.rutland@arm.com Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Anshuman Khandual anshuman.khandual@arm.com Reviewed-by: Ryan Roberts ryan.roberts@arm.com Reviewed-by: Gavin Shan gshan@redhat.com Link: https://lore.kernel.org/r/20241202064407.53807-1-anshuman.khandual@arm.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm64/mm/hugetlbpage.c | 12 ++++++++++++ 1 file changed, 12 insertions(+)
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 3215adf48a1b6..98a2a0e64e255 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -519,6 +519,18 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
static int __init hugetlbpage_init(void) { + /* + * HugeTLB pages are supported on maximum four page table + * levels (PUD, CONT PMD, PMD, CONT PTE) for a given base + * page size, corresponding to hugetlb_add_hstate() calls + * here. + * + * HUGE_MAX_HSTATE should at least match maximum supported + * HugeTLB page sizes on the platform. Any new addition to + * supported HugeTLB page sizes will also require changing + * HUGE_MAX_HSTATE as well. + */ + BUILD_BUG_ON(HUGE_MAX_HSTATE < 4); if (pud_sect_supported()) hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
From: Kees Cook kees@kernel.org
[ Upstream commit 543841d1806029889c2f69f040e88b247aba8e22 ]
Zbigniew mentioned at Linux Plumber's that systemd is interested in switching to execveat() for service execution, but can't, because the contents of /proc/pid/comm are the file descriptor which was used, instead of the path to the binary[1]. This makes the output of tools like top and ps useless, especially in a world where most fds are opened CLOEXEC so the number is truly meaningless.
When the filename passed in is empty (e.g. with AT_EMPTY_PATH), use the dentry's filename for "comm" instead of using the useless numeral from the synthetic fdpath construction. This way the actual exec machinery is unchanged, but cosmetically the comm looks reasonable to admins investigating things.
Instead of adding TASK_COMM_LEN more bytes to bprm, use one of the unused flag bits to indicate that we need to set "comm" from the dentry.
Suggested-by: Zbigniew Jędrzejewski-Szmek zbyszek@in.waw.pl Suggested-by: Tycho Andersen tandersen@netflix.com Suggested-by: Al Viro viro@zeniv.linux.org.uk Suggested-by: Linus Torvalds torvalds@linux-foundation.org Link: https://github.com/uapi-group/kernel-features#set-comm-field-before-exec [1] Reviewed-by: Aleksa Sarai cyphar@cyphar.com Tested-by: Zbigniew Jędrzejewski-Szmek zbyszek@in.waw.pl Signed-off-by: Kees Cook kees@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/exec.c | 29 ++++++++++++++++++++++++++--- include/linux/binfmts.h | 4 +++- 2 files changed, 29 insertions(+), 4 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c index 98cb7ba9983c7..b1f6b47ad20e1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1341,7 +1341,28 @@ int begin_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, SUID_DUMP_USER);
perf_event_exec(); - __set_task_comm(me, kbasename(bprm->filename), true); + + /* + * If the original filename was empty, alloc_bprm() made up a path + * that will probably not be useful to admins running ps or similar. + * Let's fix it up to be something reasonable. + */ + if (bprm->comm_from_dentry) { + /* + * Hold RCU lock to keep the name from being freed behind our back. + * Use acquire semantics to make sure the terminating NUL from + * __d_alloc() is seen. + * + * Note, we're deliberately sloppy here. We don't need to care about + * detecting a concurrent rename and just want a terminated name. + */ + rcu_read_lock(); + __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), + true); + rcu_read_unlock(); + } else { + __set_task_comm(me, kbasename(bprm->filename), true); + }
/* An exec changes our domain. We are no longer part of the thread group */ @@ -1517,11 +1538,13 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { - if (filename->name[0] == '\0') + if (filename->name[0] == '\0') { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); - else + bprm->comm_from_dentry = 1; + } else { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); + } if (!bprm->fdpath) goto out_free;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index e6c00e860951a..3305c849abd66 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -42,7 +42,9 @@ struct linux_binprm { * Set when errors can no longer be returned to the * original userspace. */ - point_of_no_return:1; + point_of_no_return:1, + /* Set when "comm" must come from the dentry. */ + comm_from_dentry:1; struct file *executable; /* Executable to pass to the interpreter */ struct file *interpreter; struct file *file;
From: Christoph Hellwig hch@lst.de
[ Upstream commit 958148a6ac061a9a80a184ea678a5fa872d0c56f ]
Otherwise feature reconfiguration can race with I/O submission.
Also drop the bio_clear_polled in the error path, as the flag does not matter for instant error completions, it is a left over from when we allowed polled I/O to proceed unpolled in this case.
Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Ming Lei ming.lei@redhat.com Reviewed-by: Nilay Shroff nilay@linux.ibm.com Reviewed-by: Martin K. Petersen martin.petersen@oracle.com Link: https://lore.kernel.org/r/20250110054726.1499538-4-hch@lst.de Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- block/blk-core.c | 22 ++++++++++++---------- block/blk-mq.c | 12 ++++++++++-- 2 files changed, 22 insertions(+), 12 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 666efe8fa2020..6309b3f5a89dc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -629,8 +629,14 @@ static void __submit_bio(struct bio *bio) blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; - - disk->fops->submit_bio(bio); + + if ((bio->bi_opf & REQ_POLLED) && + !(disk->queue->limits.features & BLK_FEAT_POLL)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + } else { + disk->fops->submit_bio(bio); + } blk_queue_exit(disk->queue); }
@@ -805,12 +811,6 @@ void submit_bio_noacct(struct bio *bio) } }
- if (!(q->limits.features & BLK_FEAT_POLL) && - (bio->bi_opf & REQ_POLLED)) { - bio_clear_polled(bio); - goto not_supported; - } - switch (bio_op(bio)) { case REQ_OP_READ: break; @@ -935,7 +935,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) return 0;
q = bdev_get_queue(bdev); - if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL)) + if (cookie == BLK_QC_T_NONE) return 0;
blk_flush_plug(current->plug, false); @@ -951,7 +951,9 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) */ if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; - if (queue_is_mq(q)) { + if (!(q->limits.features & BLK_FEAT_POLL)) { + ret = 0; + } else if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); } else { struct gendisk *disk = q->disk; diff --git a/block/blk-mq.c b/block/blk-mq.c index 8ac19d4ae3c0a..0137a995b9f37 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3092,14 +3092,22 @@ void blk_mq_submit_bio(struct bio *bio) }
/* - * Device reconfiguration may change logical block size, so alignment - * check has to be done with queue usage counter held + * Device reconfiguration may change logical block size or reduce the + * number of poll queues, so the checks for alignment and poll support + * have to be done with queue usage counter held. */ if (unlikely(bio_unaligned(bio, q))) { bio_io_error(bio); goto queue_exit; }
+ if ((bio->bi_opf & REQ_POLLED) && + !(q->limits.features & BLK_FEAT_POLL)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + goto queue_exit; + } + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto queue_exit;
From: Sven Schnelle svens@linux.ibm.com
[ Upstream commit a88c26bb8e04ee5f2678225c0130a5fbc08eef85 ]
exrl is present in all machines currently supported, therefore prefer it over ex. This saves one instruction and doesn't need an additional register to hold the address of the target instruction.
Signed-off-by: Sven Schnelle svens@linux.ibm.com Reviewed-by: Heiko Carstens hca@linux.ibm.com Signed-off-by: Alexander Gordeev agordeev@linux.ibm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/s390/include/asm/processor.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 8761fd01a9f09..4f8d5592c2981 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -163,8 +163,7 @@ static __always_inline void __stackleak_poison(unsigned long erase_low, " la %[addr],256(%[addr])\n" " brctg %[tmp],0b\n" "1: stg %[poison],0(%[addr])\n" - " larl %[tmp],3f\n" - " ex %[count],0(%[tmp])\n" + " exrl %[count],3f\n" " j 4f\n" "2: stg %[poison],0(%[addr])\n" " j 4f\n"
From: Hao-ran Zheng zhenghaoran154@gmail.com
[ Upstream commit 5324c4e10e9c2ce307a037e904c0d9671d7137d9 ]
A data race occurs when the function `insert_ordered_extent_file_extent()` and the function `btrfs_inode_safe_disk_i_size_write()` are executed concurrently. The function `insert_ordered_extent_file_extent()` is not locked when reading inode->disk_i_size, causing `btrfs_inode_safe_disk_i_size_write()` to cause data competition when writing inode->disk_i_size, thus affecting the value of `modify_tree`.
The specific call stack that appears during testing is as follows:
============DATA_RACE============ btrfs_drop_extents+0x89a/0xa060 [btrfs] insert_reserved_file_extent+0xb54/0x2960 [btrfs] insert_ordered_extent_file_extent+0xff5/0x1760 [btrfs] btrfs_finish_one_ordered+0x1b85/0x36a0 [btrfs] btrfs_finish_ordered_io+0x37/0x60 [btrfs] finish_ordered_fn+0x3e/0x50 [btrfs] btrfs_work_helper+0x9c9/0x27a0 [btrfs] process_scheduled_works+0x716/0xf10 worker_thread+0xb6a/0x1190 kthread+0x292/0x330 ret_from_fork+0x4d/0x80 ret_from_fork_asm+0x1a/0x30 ============OTHER_INFO============ btrfs_inode_safe_disk_i_size_write+0x4ec/0x600 [btrfs] btrfs_finish_one_ordered+0x24c7/0x36a0 [btrfs] btrfs_finish_ordered_io+0x37/0x60 [btrfs] finish_ordered_fn+0x3e/0x50 [btrfs] btrfs_work_helper+0x9c9/0x27a0 [btrfs] process_scheduled_works+0x716/0xf10 worker_thread+0xb6a/0x1190 kthread+0x292/0x330 ret_from_fork+0x4d/0x80 ret_from_fork_asm+0x1a/0x30 =================================
The main purpose of the check of the inode's disk_i_size is to avoid taking write locks on a btree path when we have a write at or beyond EOF, since in these cases we don't expect to find extent items in the root to drop. However if we end up taking write locks due to a data race on disk_i_size, everything is still correct, we only add extra lock contention on the tree in case there's concurrency from other tasks. If the race causes us to not take write locks when we actually need them, then everything is functionally correct as well, since if we find out we have extent items to drop and we took read locks (modify_tree set to 0), we release the path and retry again with write locks.
Since this data race does not affect the correctness of the function, it is a harmless data race, use data_race() to check inode->disk_i_size.
Reviewed-by: Filipe Manana fdmanana@suse.com Signed-off-by: Hao-ran Zheng zhenghaoran154@gmail.com Signed-off-by: Filipe Manana fdmanana@suse.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 14e27473c5bce..4d7c7a296d2d1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -224,7 +224,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
- if (args->start >= inode->disk_i_size && !args->replace_extent) + if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0;
update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
From: Josef Bacik josef@toxicpanda.com
[ Upstream commit 6a4730b325aaa48f7a5d5ba97aff0a955e2d9cec ]
This BUG_ON is meant to catch backref cache problems, but these can arise from either bugs in the backref cache or corruption in the extent tree. Fix it to be a proper error.
Reviewed-by: Boris Burkov boris@bur.io Signed-off-by: Josef Bacik josef@toxicpanda.com Reviewed-by: David Sterba dsterba@suse.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/relocation.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index db8b42f674b7c..ab2de2d1b2bee 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4405,8 +4405,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, WARN_ON(!first_cow && level == 0);
node = rc->backref_cache.path[level]; - BUG_ON(node->bytenr != buf->start && - node->new_bytenr != buf->start); + + /* + * If node->bytenr != buf->start and node->new_bytenr != + * buf->start then we've got the wrong backref node for what we + * expected to see here and the cache is incorrect. + */ + if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) { + btrfs_err(fs_info, +"bytenr %llu was found but our backref cache was expecting %llu or %llu", + buf->start, node->bytenr, node->new_bytenr); + return -EUCLEAN; + }
btrfs_backref_drop_node_buffer(node); atomic_inc(&cow->refs);
From: Johannes Thumshirn johannes.thumshirn@wdc.com
[ Upstream commit dc14ba10781bd2629835696b7cc1febf914768e9 ]
Don't use btrfs_set_item_key_safe() to modify the keys in the RAID stripe-tree, as this can lead to corruption of the tree, which is caught by the checks in btrfs_set_item_key_safe():
BTRFS info (device nvme1n1): leaf 49168384 gen 15 total ptrs 194 free space 8329 owner 12 BTRFS info (device nvme1n1): refs 2 lock_owner 1030 current 1030 [ snip ] item 105 key (354549760 230 20480) itemoff 14587 itemsize 16 stride 0 devid 5 physical 67502080 item 106 key (354631680 230 4096) itemoff 14571 itemsize 16 stride 0 devid 1 physical 88559616 item 107 key (354631680 230 32768) itemoff 14555 itemsize 16 stride 0 devid 1 physical 88555520 item 108 key (354717696 230 28672) itemoff 14539 itemsize 16 stride 0 devid 2 physical 67604480 [ snip ] BTRFS critical (device nvme1n1): slot 106 key (354631680 230 32768) new key (354635776 230 4096) ------------[ cut here ]------------ kernel BUG at fs/btrfs/ctree.c:2602! Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 1 UID: 0 PID: 1055 Comm: fsstress Not tainted 6.13.0-rc1+ #1464 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 RIP: 0010:btrfs_set_item_key_safe+0xf7/0x270 Code: <snip> RSP: 0018:ffffc90001337ab0 EFLAGS: 00010287 RAX: 0000000000000000 RBX: ffff8881115fd000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00000000ffffffff RBP: ffff888110ed6f50 R08: 00000000ffffefff R09: ffffffff8244c500 R10: 00000000ffffefff R11: 00000000ffffffff R12: ffff888100586000 R13: 00000000000000c9 R14: ffffc90001337b1f R15: ffff888110f23b58 FS: 00007f7d75c72740(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa811652c60 CR3: 0000000111398001 CR4: 0000000000370eb0 Call Trace: <TASK> ? __die_body.cold+0x14/0x1a ? die+0x2e/0x50 ? do_trap+0xca/0x110 ? do_error_trap+0x65/0x80 ? btrfs_set_item_key_safe+0xf7/0x270 ? exc_invalid_op+0x50/0x70 ? btrfs_set_item_key_safe+0xf7/0x270 ? asm_exc_invalid_op+0x1a/0x20 ? btrfs_set_item_key_safe+0xf7/0x270 btrfs_partially_delete_raid_extent+0xc4/0xe0 btrfs_delete_raid_extent+0x227/0x240 __btrfs_free_extent.isra.0+0x57f/0x9c0 ? exc_coproc_segment_overrun+0x40/0x40 __btrfs_run_delayed_refs+0x2fa/0xe80 btrfs_run_delayed_refs+0x81/0xe0 btrfs_commit_transaction+0x2dd/0xbe0 ? preempt_count_add+0x52/0xb0 btrfs_sync_file+0x375/0x4c0 do_fsync+0x39/0x70 __x64_sys_fsync+0x13/0x20 do_syscall_64+0x54/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f7d7550ef90 Code: <snip> RSP: 002b:00007ffd70237248 EFLAGS: 00000202 ORIG_RAX: 000000000000004a RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f7d7550ef90 RDX: 000000000000013a RSI: 000000000040eb28 RDI: 0000000000000004 RBP: 000000000000001b R08: 0000000000000078 R09: 00007ffd7023725c R10: 00007f7d75400390 R11: 0000000000000202 R12: 028f5c28f5c28f5c R13: 8f5c28f5c28f5c29 R14: 000000000040b520 R15: 00007f7d75c726c8 </TASK>
While the root cause of the tree order corruption isn't clear, using btrfs_duplicate_item() to copy the item and then adjusting both the key and the per-device physical addresses is a safe way to counter this problem.
Signed-off-by: Johannes Thumshirn johannes.thumshirn@wdc.com Signed-off-by: David Sterba dsterba@suse.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/btrfs/raid-stripe-tree.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 9ffc79f250fbb..10781c015ee8d 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -13,12 +13,13 @@ #include "volumes.h" #include "print-tree.h"
-static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, +static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *oldkey, u64 newlen, u64 frontpad) { - struct btrfs_stripe_extent *extent; + struct btrfs_root *stripe_root = trans->fs_info->stripe_root; + struct btrfs_stripe_extent *extent, *newitem; struct extent_buffer *leaf; int slot; size_t item_size; @@ -27,23 +28,38 @@ static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, .type = BTRFS_RAID_STRIPE_KEY, .offset = newlen, }; + int ret;
ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);
leaf = path->nodes[0]; slot = path->slots[0]; item_size = btrfs_item_size(leaf, slot); + + newitem = kzalloc(item_size, GFP_NOFS); + if (!newitem) + return -ENOMEM; + extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) { struct btrfs_raid_stride *stride = &extent->strides[i]; u64 phys;
- phys = btrfs_raid_stride_physical(leaf, stride); - btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad); + phys = btrfs_raid_stride_physical(leaf, stride) + frontpad; + btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys); }
- btrfs_set_item_key_safe(trans, path, &newkey); + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + goto out; + + btrfs_release_path(path); + ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); + +out: + kfree(newitem); + return ret; }
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
linux-stable-mirror@lists.linaro.org