From: Linus Walleij linus.walleij@linaro.org
[ Upstream commit d6d51a96c7d63b7450860a3037f2d62388286a52 ]
Functions like memset()/memmove()/memcpy() do a lot of memory accesses.
If a bad pointer is passed to one of these functions it is important to catch this. Compiler instrumentation cannot do this since these functions are written in assembly.
KASan replaces these memory functions with instrumented variants.
The original functions are declared as weak symbols so that the strong definitions in mm/kasan/kasan.c can replace them.
The original functions have aliases with a '__' prefix in their name, so we can call the non-instrumented variant if needed.
We must use __memcpy()/__memset() in place of memcpy()/memset() when we copy .data to RAM and when we clear .bss, because kasan_early_init cannot be called before the initialization of .data and .bss.
For the kernel compression and EFI libstub's custom string libraries we need a special quirk: even if these are built without KASan enabled, they rely on the global headers for their custom string libraries, which means that e.g. memcpy() will be defined to __memcpy() and we get link failures. Since these implementations are written i C rather than assembly we use e.g. __alias(memcpy) to redirected any users back to the local implementation.
Cc: Andrey Ryabinin aryabinin@virtuozzo.com Cc: Alexander Potapenko glider@google.com Cc: Dmitry Vyukov dvyukov@google.com Cc: kasan-dev@googlegroups.com Reviewed-by: Ard Biesheuvel ardb@kernel.org Tested-by: Ard Biesheuvel ardb@kernel.org # QEMU/KVM/mach-virt/LPAE/8G Tested-by: Florian Fainelli f.fainelli@gmail.com # Brahma SoCs Tested-by: Ahmad Fatoum a.fatoum@pengutronix.de # i.MX6Q Reported-by: Russell King - ARM Linux rmk+kernel@armlinux.org.uk Signed-off-by: Ahmad Fatoum a.fatoum@pengutronix.de Signed-off-by: Abbott Liu liuwenliang@huawei.com Signed-off-by: Florian Fainelli f.fainelli@gmail.com Signed-off-by: Linus Walleij linus.walleij@linaro.org Signed-off-by: Russell King rmk+kernel@armlinux.org.uk Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm/boot/compressed/string.c | 19 +++++++++++++++++++ arch/arm/include/asm/string.h | 26 ++++++++++++++++++++++++++ arch/arm/kernel/head-common.S | 4 ++-- arch/arm/lib/memcpy.S | 3 +++ arch/arm/lib/memmove.S | 5 ++++- arch/arm/lib/memset.S | 3 +++ 6 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/arch/arm/boot/compressed/string.c b/arch/arm/boot/compressed/string.c index ade5079bebbf9..8c0fa276d9946 100644 --- a/arch/arm/boot/compressed/string.c +++ b/arch/arm/boot/compressed/string.c @@ -7,6 +7,25 @@
#include <linux/string.h>
+/* + * The decompressor is built without KASan but uses the same redirects as the + * rest of the kernel when CONFIG_KASAN is enabled, defining e.g. memcpy() + * to __memcpy() but since we are not linking with the main kernel string + * library in the decompressor, that will lead to link failures. + * + * Undefine KASan's versions, define the wrapped functions and alias them to + * the right names so that when e.g. __memcpy() appear in the code, it will + * still be linked to this local version of memcpy(). + */ +#ifdef CONFIG_KASAN +#undef memcpy +#undef memmove +#undef memset +void *__memcpy(void *__dest, __const void *__src, size_t __n) __alias(memcpy); +void *__memmove(void *__dest, __const void *__src, size_t count) __alias(memmove); +void *__memset(void *s, int c, size_t count) __alias(memset); +#endif + void *memcpy(void *__dest, __const void *__src, size_t __n) { int i = 0; diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h index 111a1d8a41ddf..6c607c68f3ad7 100644 --- a/arch/arm/include/asm/string.h +++ b/arch/arm/include/asm/string.h @@ -5,6 +5,9 @@ /* * We don't do inline string functions, since the * optimised inline asm versions are not small. + * + * The __underscore versions of some functions are for KASan to be able + * to replace them with instrumented versions. */
#define __HAVE_ARCH_STRRCHR @@ -15,15 +18,18 @@ extern char * strchr(const char * s, int c);
#define __HAVE_ARCH_MEMCPY extern void * memcpy(void *, const void *, __kernel_size_t); +extern void *__memcpy(void *dest, const void *src, __kernel_size_t n);
#define __HAVE_ARCH_MEMMOVE extern void * memmove(void *, const void *, __kernel_size_t); +extern void *__memmove(void *dest, const void *src, __kernel_size_t n);
#define __HAVE_ARCH_MEMCHR extern void * memchr(const void *, int, __kernel_size_t);
#define __HAVE_ARCH_MEMSET extern void * memset(void *, int, __kernel_size_t); +extern void *__memset(void *s, int c, __kernel_size_t n);
#define __HAVE_ARCH_MEMSET32 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); @@ -39,4 +45,24 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) return __memset64(p, v, n * 8, v >> 32); }
+/* + * For files that are not instrumented (e.g. mm/slub.c) we + * must use non-instrumented versions of the mem* + * functions named __memcpy() etc. All such kernel code has + * been tagged with KASAN_SANITIZE_file.o = n, which means + * that the address sanitization argument isn't passed to the + * compiler, and __SANITIZE_ADDRESS__ is not set. As a result + * these defines kick in. + */ +#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) +#define memcpy(dst, src, len) __memcpy(dst, src, len) +#define memmove(dst, src, len) __memmove(dst, src, len) +#define memset(s, c, n) __memset(s, c, n) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif + +#endif + #endif diff --git a/arch/arm/kernel/head-common.S b/arch/arm/kernel/head-common.S index 4a3982812a401..6840c7c60a858 100644 --- a/arch/arm/kernel/head-common.S +++ b/arch/arm/kernel/head-common.S @@ -95,7 +95,7 @@ __mmap_switched: THUMB( ldmia r4!, {r0, r1, r2, r3} ) THUMB( mov sp, r3 ) sub r2, r2, r1 - bl memcpy @ copy .data to RAM + bl __memcpy @ copy .data to RAM #endif
ARM( ldmia r4!, {r0, r1, sp} ) @@ -103,7 +103,7 @@ __mmap_switched: THUMB( mov sp, r3 ) sub r2, r1, r0 mov r1, #0 - bl memset @ clear .bss + bl __memset @ clear .bss
ldmia r4, {r0, r1, r2, r3} str r9, [r0] @ Save processor ID diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index 09a333153dc66..ad4625d16e117 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -58,6 +58,8 @@
/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+.weak memcpy +ENTRY(__memcpy) ENTRY(mmiocpy) ENTRY(memcpy)
@@ -65,3 +67,4 @@ ENTRY(memcpy)
ENDPROC(memcpy) ENDPROC(mmiocpy) +ENDPROC(__memcpy) diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S index b50e5770fb44d..fd123ea5a5a4a 100644 --- a/arch/arm/lib/memmove.S +++ b/arch/arm/lib/memmove.S @@ -24,12 +24,14 @@ * occurring in the opposite direction. */
+.weak memmove +ENTRY(__memmove) ENTRY(memmove) UNWIND( .fnstart )
subs ip, r0, r1 cmphi r2, ip - bls memcpy + bls __memcpy
stmfd sp!, {r0, r4, lr} UNWIND( .fnend ) @@ -222,3 +224,4 @@ ENTRY(memmove) 18: backward_copy_shift push=24 pull=8
ENDPROC(memmove) +ENDPROC(__memmove) diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index 6ca4535c47fb6..0e7ff0423f50b 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -13,6 +13,8 @@ .text .align 5
+.weak memset +ENTRY(__memset) ENTRY(mmioset) ENTRY(memset) UNWIND( .fnstart ) @@ -132,6 +134,7 @@ UNWIND( .fnstart ) UNWIND( .fnend ) ENDPROC(memset) ENDPROC(mmioset) +ENDPROC(__memset)
ENTRY(__memset32) UNWIND( .fnstart )
From: Dinghao Liu dinghao.liu@zju.edu.cn
[ Upstream commit 28d211919e422f58c1e6c900e5810eee4f1ce4c8 ]
When clk_hw_register_fixed_rate_with_accuracy() fails, clk_data should be freed. It's the same for the subsequent two error paths, but we should also unregister the already registered clocks in them.
Signed-off-by: Dinghao Liu dinghao.liu@zju.edu.cn Signed-off-by: Alexandre Belloni alexandre.belloni@bootlin.com Link: https://lore.kernel.org/r/20201020061226.6572-1-dinghao.liu@zju.edu.cn Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/rtc/rtc-sun6i.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/rtc/rtc-sun6i.c b/drivers/rtc/rtc-sun6i.c index e2b8b150bcb44..f2818cdd11d82 100644 --- a/drivers/rtc/rtc-sun6i.c +++ b/drivers/rtc/rtc-sun6i.c @@ -272,7 +272,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node, 300000000); if (IS_ERR(rtc->int_osc)) { pr_crit("Couldn't register the internal oscillator\n"); - return; + goto err; }
parents[0] = clk_hw_get_name(rtc->int_osc); @@ -290,7 +290,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node, rtc->losc = clk_register(NULL, &rtc->hw); if (IS_ERR(rtc->losc)) { pr_crit("Couldn't register the LOSC clock\n"); - return; + goto err_register; }
of_property_read_string_index(node, "clock-output-names", 1, @@ -301,7 +301,7 @@ static void __init sun6i_rtc_clk_init(struct device_node *node, &rtc->lock); if (IS_ERR(rtc->ext_losc)) { pr_crit("Couldn't register the LOSC external gate\n"); - return; + goto err_register; }
clk_data->num = 2; @@ -314,6 +314,8 @@ static void __init sun6i_rtc_clk_init(struct device_node *node, of_clk_add_hw_provider(node, of_clk_hw_onecell_get, clk_data); return;
+err_register: + clk_hw_unregister_fixed_rate(rtc->int_osc); err: kfree(clk_data); }
From: Miroslav Benes mbenes@suse.cz
[ Upstream commit 5e8ed280dab9eeabc1ba0b2db5dbe9fe6debb6b5 ]
If a module fails to load due to an error in prepare_coming_module(), the following error handling in load_module() runs with MODULE_STATE_COMING in module's state. Fix it by correctly setting MODULE_STATE_GOING under "bug_cleanup" label.
Signed-off-by: Miroslav Benes mbenes@suse.cz Signed-off-by: Jessica Yu jeyu@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/module.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/kernel/module.c b/kernel/module.c index a4fa44a652a75..b34235082394b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3991,6 +3991,7 @@ static int load_module(struct load_info *info, const char __user *uargs, MODULE_STATE_GOING, mod); klp_module_going(mod); bug_cleanup: + mod->state = MODULE_STATE_GOING; /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod);
From: Jan Kara jack@suse.cz
[ Upstream commit 10f04d40a9fa29785206c619f80d8beedb778837 ]
The on-disk quota format supports quota files with upto 2^32 blocks. Be careful when computing quota file offsets in the quota files from block numbers as they can overflow 32-bit types. Since quota files larger than 4GB would require ~26 millions of quota users, this is mostly a theoretical concern now but better be careful, fuzzers would find the problem sooner or later anyway...
Reviewed-by: Andreas Dilger adilger@dilger.ca Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Sasha Levin sashal@kernel.org --- fs/quota/quota_tree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index a6f856f341dc7..c5562c871c8be 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -62,7 +62,7 @@ static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
memset(buf, 0, info->dqi_usable_bs); return sb->s_op->quota_read(sb, info->dqi_type, buf, - info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); }
static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) @@ -71,7 +71,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) ssize_t ret;
ret = sb->s_op->quota_write(sb, info->dqi_type, buf, - info->dqi_usable_bs, blk << info->dqi_blocksize_bits); + info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits); if (ret != info->dqi_usable_bs) { quota_error(sb, "dquota write failed"); if (ret >= 0) @@ -284,7 +284,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, blk); goto out_buf; } - dquot->dq_off = (blk << info->dqi_blocksize_bits) + + dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; kfree(buf); @@ -559,7 +559,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, ret = -EIO; goto out_buf; } else { - ret = (blk << info->dqi_blocksize_bits) + sizeof(struct + ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct qt_disk_dqdbheader) + i * info->dqi_entry_size; } out_buf:
From: Zheng Liang zhengliang6@huawei.com
[ Upstream commit 1eab0fea2514b269e384c117f5b5772b882761f0 ]
When devm_rtc_allocate_device is failed in pl031_probe, it should release mem regions with device.
Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Zheng Liang zhengliang6@huawei.com Signed-off-by: Alexandre Belloni alexandre.belloni@bootlin.com Acked-by: Linus Walleij linus.walleij@linaro.org Link: https://lore.kernel.org/r/20201112093139.32566-1-zhengliang6@huawei.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/rtc/rtc-pl031.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/rtc/rtc-pl031.c b/drivers/rtc/rtc-pl031.c index c6b89273feba8..d4b2ab7861266 100644 --- a/drivers/rtc/rtc-pl031.c +++ b/drivers/rtc/rtc-pl031.c @@ -361,8 +361,10 @@ static int pl031_probe(struct amba_device *adev, const struct amba_id *id)
device_init_wakeup(&adev->dev, true); ldata->rtc = devm_rtc_allocate_device(&adev->dev); - if (IS_ERR(ldata->rtc)) - return PTR_ERR(ldata->rtc); + if (IS_ERR(ldata->rtc)) { + ret = PTR_ERR(ldata->rtc); + goto out; + }
ldata->rtc->ops = ops; ldata->rtc->range_min = vendor->range_min;
From: Qinglang Miao miaoqinglang@huawei.com
[ Upstream commit ffa1797040c5da391859a9556be7b735acbe1242 ]
I noticed that iounmap() of msgr_block_addr before return from mpic_msgr_probe() in the error handling case is missing. So use devm_ioremap() instead of just ioremap() when remapping the message register block, so the mapping will be automatically released on probe failure.
Signed-off-by: Qinglang Miao miaoqinglang@huawei.com Signed-off-by: Michael Ellerman mpe@ellerman.id.au Link: https://lore.kernel.org/r/20201028091551.136400-1-miaoqinglang@huawei.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/powerpc/sysdev/mpic_msgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/powerpc/sysdev/mpic_msgr.c b/arch/powerpc/sysdev/mpic_msgr.c index f6b253e2be409..36ec0bdd8b63c 100644 --- a/arch/powerpc/sysdev/mpic_msgr.c +++ b/arch/powerpc/sysdev/mpic_msgr.c @@ -191,7 +191,7 @@ static int mpic_msgr_probe(struct platform_device *dev)
/* IO map the message register block. */ of_address_to_resource(np, 0, &rsrc); - msgr_block_addr = ioremap(rsrc.start, resource_size(&rsrc)); + msgr_block_addr = devm_ioremap(&dev->dev, rsrc.start, resource_size(&rsrc)); if (!msgr_block_addr) { dev_err(&dev->dev, "Failed to iomap MPIC message registers"); return -EFAULT;
From: Qinglang Miao miaoqinglang@huawei.com
[ Upstream commit 59165d16c699182b86b5c65181013f1fd88feb62 ]
Add the missing destroy_workqueue() before return from i3c_master_register in the error handling case.
Signed-off-by: Qinglang Miao miaoqinglang@huawei.com Signed-off-by: Boris Brezillon boris.brezillon@collabora.com Link: https://lore.kernel.org/linux-i3c/20201028091543.136167-1-miaoqinglang@huawe... Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/i3c/master.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 1c6b78ad5ade4..b61bf53ec07af 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -2537,7 +2537,7 @@ int i3c_master_register(struct i3c_master_controller *master,
ret = i3c_master_bus_init(master); if (ret) - goto err_put_dev; + goto err_destroy_wq;
ret = device_add(&master->dev); if (ret) @@ -2568,6 +2568,9 @@ int i3c_master_register(struct i3c_master_controller *master, err_cleanup_bus: i3c_master_bus_cleanup(master);
+err_destroy_wq: + destroy_workqueue(master->wq); + err_put_dev: put_device(&master->dev);
From: Rustam Kovhaev rkovhaev@gmail.com
[ Upstream commit d24396c5290ba8ab04ba505176874c4e04a2d53c ]
when directory item has an invalid value set for ih_entry_count it might trigger use-after-free or out-of-bounds read in bin_search_in_dir_item()
ih_entry_count * IH_SIZE for directory item should not be larger than ih_item_len
Link: https://lore.kernel.org/r/20201101140958.3650143-1-rkovhaev@gmail.com Reported-and-tested-by: syzbot+83b6f7cf9922cae5c4d7@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=83b6f7cf9922cae5c4d7 Signed-off-by: Rustam Kovhaev rkovhaev@gmail.com Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Sasha Levin sashal@kernel.org --- fs/reiserfs/stree.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 8bf88d690729e..476a7ff494822 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -454,6 +454,12 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) "(second one): %h", ih); return 0; } + if (is_direntry_le_ih(ih) && (ih_item_len(ih) < (ih_entry_count(ih) * IH_SIZE))) { + reiserfs_warning(NULL, "reiserfs-5093", + "item entry count seems wrong %h", + ih); + return 0; + } prev_location = ih_location(ih); }
From: Trond Myklebust trond.myklebust@hammerspace.com
[ Upstream commit b6d49ecd1081740b6e632366428b960461f8158b ]
When returning the layout in nfs4_evict_inode(), we need to ensure that the layout is actually done being freed before we can proceed to free the inode itself.
Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/nfs/nfs4super.c | 2 +- fs/nfs/pnfs.c | 33 +++++++++++++++++++++++++++++++-- fs/nfs/pnfs.h | 5 +++++ 3 files changed, 37 insertions(+), 3 deletions(-)
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 93f5c1678ec29..984cc42ee54d8 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -67,7 +67,7 @@ static void nfs4_evict_inode(struct inode *inode) nfs_inode_evict_delegation(inode); /* Note that above delegreturn would trigger pnfs return-on-close */ pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); + pnfs_destroy_layout_final(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); nfs4_xattr_cache_zap(inode); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0e50b9d45c320..07f59dc8cb2e7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -294,6 +294,7 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { struct inode *inode; + unsigned long i_state;
if (!lo) return; @@ -304,8 +305,12 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); + i_state = inode->i_state; spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); + /* Notify pnfs_destroy_layout_final() that we're done */ + if (i_state & (I_FREEING | I_CLEAR)) + wake_up_var(lo); } }
@@ -734,8 +739,7 @@ pnfs_free_lseg_list(struct list_head *free_me) } }
-void -pnfs_destroy_layout(struct nfs_inode *nfsi) +static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi) { struct pnfs_layout_hdr *lo; LIST_HEAD(tmp_list); @@ -753,9 +757,34 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); + return lo; +} + +void pnfs_destroy_layout(struct nfs_inode *nfsi) +{ + __pnfs_destroy_layout(nfsi); } EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
+static bool pnfs_layout_removed(struct nfs_inode *nfsi, + struct pnfs_layout_hdr *lo) +{ + bool ret; + + spin_lock(&nfsi->vfs_inode.i_lock); + ret = nfsi->layout != lo; + spin_unlock(&nfsi->vfs_inode.i_lock); + return ret; +} + +void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ + struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); + + if (lo) + wait_var_event(lo, pnfs_layout_removed(nfsi, lo)); +} + static bool pnfs_layout_add_bulk_destroy_list(struct inode *inode, struct list_head *layout_list) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 2661c44c62db4..78c3893918486 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -266,6 +266,7 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_layoutget_free(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); +void pnfs_destroy_layout_final(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, struct nfs_fsid *fsid, @@ -710,6 +711,10 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) { }
+static inline void pnfs_destroy_layout_final(struct nfs_inode *nfsi) +{ +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) {
From: Daniel Rosenberg drosen@google.com
[ Upstream commit 7ad08a58bf67594057362e45cbddd3e27e53e557 ]
Expand f2fs's casefolding support to include encrypted directories. To index casefolded+encrypted directories, we use the SipHash of the casefolded name, keyed by a key derived from the directory's fscrypt master key. This ensures that the dirhash doesn't leak information about the plaintext filenames.
Encryption keys are unavailable during roll-forward recovery, so we can't compute the dirhash when recovering a new dentry in an encrypted + casefolded directory. To avoid having to force a checkpoint when a new file is fsync'ed, store the dirhash on-disk appended to i_name.
This patch incorporates work by Eric Biggers ebiggers@google.com and Jaegeuk Kim jaegeuk@kernel.org.
Co-developed-by: Eric Biggers ebiggers@google.com Signed-off-by: Eric Biggers ebiggers@google.com Signed-off-by: Daniel Rosenberg drosen@google.com Reviewed-by: Eric Biggers ebiggers@google.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/dir.c | 98 +++++++++++++++++++++++++++++++++++----------- fs/f2fs/f2fs.h | 8 ++-- fs/f2fs/hash.c | 11 +++++- fs/f2fs/inline.c | 4 ++ fs/f2fs/recovery.c | 12 +++++- fs/f2fs/super.c | 6 --- 6 files changed, 106 insertions(+), 33 deletions(-)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4b9ef8bbfa4a9..4ee0b7070c157 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include <asm/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/sched/signal.h> @@ -206,30 +207,55 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, /* * Test whether a case-insensitive directory entry matches the filename * being searched for. + * + * Returns 1 for a match, 0 for no match, and -errno on an error. */ -static bool f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, +static int f2fs_match_ci_name(const struct inode *dir, const struct qstr *name, const u8 *de_name, u32 de_name_len) { const struct super_block *sb = dir->i_sb; const struct unicode_map *um = sb->s_encoding; + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); struct qstr entry = QSTR_INIT(de_name, de_name_len); int res;
+ if (IS_ENCRYPTED(dir)) { + const struct fscrypt_str encrypted_name = + FSTR_INIT((u8 *)de_name, de_name_len); + + if (WARN_ON_ONCE(!fscrypt_has_encryption_key(dir))) + return -EINVAL; + + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); + if (!decrypted_name.name) + return -ENOMEM; + res = fscrypt_fname_disk_to_usr(dir, 0, 0, &encrypted_name, + &decrypted_name); + if (res < 0) + goto out; + entry.name = decrypted_name.name; + entry.len = decrypted_name.len; + } + res = utf8_strncasecmp_folded(um, name, &entry); - if (res < 0) { - /* - * In strict mode, ignore invalid names. In non-strict mode, - * fall back to treating them as opaque byte sequences. - */ - if (sb_has_strict_encoding(sb) || name->len != entry.len) - return false; - return !memcmp(name->name, entry.name, name->len); + /* + * In strict mode, ignore invalid names. In non-strict mode, + * fall back to treating them as opaque byte sequences. + */ + if (res < 0 && !sb_has_strict_encoding(sb)) { + res = name->len == entry.len && + memcmp(name->name, entry.name, name->len) == 0; + } else { + /* utf8_strncasecmp_folded returns 0 on match */ + res = (res == 0); } - return res == 0; +out: + kfree(decrypted_name.name); + return res; } #endif /* CONFIG_UNICODE */
-static inline bool f2fs_match_name(const struct inode *dir, +static inline int f2fs_match_name(const struct inode *dir, const struct f2fs_filename *fname, const u8 *de_name, u32 de_name_len) { @@ -256,6 +282,7 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; + int res = 0;
if (max_slots) *max_slots = 0; @@ -273,10 +300,15 @@ struct f2fs_dir_entry *f2fs_find_target_dentry(const struct f2fs_dentry_ptr *d, continue; }
- if (de->hash_code == fname->hash && - f2fs_match_name(d->inode, fname, d->filename[bit_pos], - le16_to_cpu(de->name_len))) - goto found; + if (de->hash_code == fname->hash) { + res = f2fs_match_name(d->inode, fname, + d->filename[bit_pos], + le16_to_cpu(de->name_len)); + if (res < 0) + return ERR_PTR(res); + if (res) + goto found; + }
if (max_slots && max_len > *max_slots) *max_slots = max_len; @@ -326,7 +358,11 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, }
de = find_in_block(dir, dentry_page, fname, &max_slots); - if (de) { + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; + break; + } else if (de) { *res_page = dentry_page; break; } @@ -448,17 +484,39 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_put_page(page, 1); }
-static void init_dent_inode(const struct f2fs_filename *fname, +static void init_dent_inode(struct inode *dir, struct inode *inode, + const struct f2fs_filename *fname, struct page *ipage) { struct f2fs_inode *ri;
+ if (!fname) /* tmpfile case? */ + return; + f2fs_wait_on_page_writeback(ipage, NODE, true, true);
/* copy name info. to this inode page */ ri = F2FS_INODE(ipage); ri->i_namelen = cpu_to_le32(fname->disk_name.len); memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); + if (IS_ENCRYPTED(dir)) { + file_set_enc_name(inode); + /* + * Roll-forward recovery doesn't have encryption keys available, + * so it can't compute the dirhash for encrypted+casefolded + * filenames. Append it to i_name if possible. Else, disable + * roll-forward recovery of the dentry (i.e., make fsync'ing the + * file force a checkpoint) by setting LOST_PINO. + */ + if (IS_CASEFOLDED(dir)) { + if (fname->disk_name.len + sizeof(f2fs_hash_t) <= + F2FS_NAME_LEN) + put_unaligned(fname->hash, (f2fs_hash_t *) + &ri->i_name[fname->disk_name.len]); + else + file_lost_pino(inode); + } + } set_page_dirty(ipage); }
@@ -541,11 +599,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, return page; }
- if (fname) { - init_dent_inode(fname, page); - if (IS_ENCRYPTED(dir)) - file_set_enc_name(inode); - } + init_dent_inode(dir, inode, fname, page);
/* * This file should be checkpointed during fsync. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9a321c52facec..8cba43eb0b24a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -533,9 +533,11 @@ struct f2fs_filename { #ifdef CONFIG_UNICODE /* * For casefolded directories: the casefolded name, but it's left NULL - * if the original name is not valid Unicode or if the filesystem is - * doing an internal operation where usr_fname is also NULL. In these - * cases we fall back to treating the name as an opaque byte sequence. + * if the original name is not valid Unicode, if the directory is both + * casefolded and encrypted and its encryption key is unavailable, or if + * the filesystem is doing an internal operation where usr_fname is also + * NULL. In all these cases we fall back to treating the name as an + * opaque byte sequence. */ struct fscrypt_str cf_name; #endif diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index de841aaf3c439..e3beac546c63a 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -111,7 +111,9 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) * If the casefolded name is provided, hash it instead of the * on-disk name. If the casefolded name is *not* provided, that * should only be because the name wasn't valid Unicode, so fall - * back to treating the name as an opaque byte sequence. + * back to treating the name as an opaque byte sequence. Note + * that to handle encrypted directories, the fallback must use + * usr_fname (plaintext) rather than disk_name (ciphertext). */ WARN_ON_ONCE(!fname->usr_fname->name); if (fname->cf_name.name) { @@ -121,6 +123,13 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname) name = fname->usr_fname->name; len = fname->usr_fname->len; } + if (IS_ENCRYPTED(dir)) { + struct qstr tmp = QSTR_INIT(name, len); + + fname->hash = + cpu_to_le32(fscrypt_fname_siphash(dir, &tmp)); + return; + } } #endif fname->hash = cpu_to_le32(TEA_hash_name(name, len)); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 70384e31788db..92e9852d316ad 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -332,6 +332,10 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, make_dentry_ptr_inline(dir, &d, inline_dentry); de = f2fs_find_target_dentry(&d, fname, NULL); unlock_page(ipage); + if (IS_ERR(de)) { + *res_page = ERR_CAST(de); + de = NULL; + } if (de) *res_page = ipage; else diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4f12ade6410a1..0947d36af1a8e 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include <asm/unaligned.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> #include "f2fs.h" @@ -128,7 +129,16 @@ static int init_recovered_filename(const struct inode *dir, }
/* Compute the hash of the filename */ - if (IS_CASEFOLDED(dir)) { + if (IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir)) { + /* + * In this case the hash isn't computable without the key, so it + * was saved on-disk. + */ + if (fname->disk_name.len + sizeof(f2fs_hash_t) > F2FS_NAME_LEN) + return -EINVAL; + fname->hash = get_unaligned((f2fs_hash_t *) + &raw_inode->i_name[fname->disk_name.len]); + } else if (IS_CASEFOLDED(dir)) { err = f2fs_init_casefolded_name(dir, fname); if (err) return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 00eff2f518079..ba859eeca69a3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3399,12 +3399,6 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi) struct unicode_map *encoding; __u16 encoding_flags;
- if (f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, - "Can't mount with encoding and encryption"); - return -EINVAL; - } - if (f2fs_sb_read_encoding(sbi->raw_super, &encoding_info, &encoding_flags)) { f2fs_err(sbi,
On Wed, Dec 30, 2020 at 08:02:52AM -0500, Sasha Levin wrote:
From: Daniel Rosenberg drosen@google.com
[ Upstream commit 7ad08a58bf67594057362e45cbddd3e27e53e557 ]
Expand f2fs's casefolding support to include encrypted directories. To index casefolded+encrypted directories, we use the SipHash of the casefolded name, keyed by a key derived from the directory's fscrypt master key. This ensures that the dirhash doesn't leak information about the plaintext filenames.
Encryption keys are unavailable during roll-forward recovery, so we can't compute the dirhash when recovering a new dentry in an encrypted + casefolded directory. To avoid having to force a checkpoint when a new file is fsync'ed, store the dirhash on-disk appended to i_name.
This patch incorporates work by Eric Biggers ebiggers@google.com and Jaegeuk Kim jaegeuk@kernel.org.
Co-developed-by: Eric Biggers ebiggers@google.com Signed-off-by: Eric Biggers ebiggers@google.com Signed-off-by: Daniel Rosenberg drosen@google.com Reviewed-by: Eric Biggers ebiggers@google.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org
Please don't backport this to the LTS kernels. This is a new feature, not a fix, and you missed prerequisite patches...
- Eric
On Wed, Dec 30, 2020 at 10:01:56AM -0800, Eric Biggers wrote:
On Wed, Dec 30, 2020 at 08:02:52AM -0500, Sasha Levin wrote:
From: Daniel Rosenberg drosen@google.com
[ Upstream commit 7ad08a58bf67594057362e45cbddd3e27e53e557 ]
Expand f2fs's casefolding support to include encrypted directories. To index casefolded+encrypted directories, we use the SipHash of the casefolded name, keyed by a key derived from the directory's fscrypt master key. This ensures that the dirhash doesn't leak information about the plaintext filenames.
Encryption keys are unavailable during roll-forward recovery, so we can't compute the dirhash when recovering a new dentry in an encrypted + casefolded directory. To avoid having to force a checkpoint when a new file is fsync'ed, store the dirhash on-disk appended to i_name.
This patch incorporates work by Eric Biggers ebiggers@google.com and Jaegeuk Kim jaegeuk@kernel.org.
Co-developed-by: Eric Biggers ebiggers@google.com Signed-off-by: Eric Biggers ebiggers@google.com Signed-off-by: Daniel Rosenberg drosen@google.com Reviewed-by: Eric Biggers ebiggers@google.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org
Please don't backport this to the LTS kernels. This is a new feature, not a fix, and you missed prerequisite patches...
Sure, I'l drop it. Thanks!
From: Jaegeuk Kim jaegeuk@kernel.org
[ Upstream commit a95ba66ac1457b76fe472c8e092ab1006271f16c ]
Light reported sometimes shinker gets nat_cnt < dirty_nat_cnt resulting in wrong do_shinker work. Let's avoid to return insane overflowed value by adding single tracking value.
Reported-by: Light Hsieh Light.Hsieh@mediatek.com Reviewed-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/debug.c | 11 ++++++----- fs/f2fs/f2fs.h | 10 ++++++++-- fs/f2fs/node.c | 29 ++++++++++++++++++----------- fs/f2fs/node.h | 4 ++-- fs/f2fs/shrinker.c | 4 +--- 6 files changed, 36 insertions(+), 24 deletions(-)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 023462e80e58d..b39bf416d5114 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1600,7 +1600,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) goto out; }
- if (NM_I(sbi)->dirty_nat_cnt == 0 && + if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { f2fs_flush_sit_entries(sbi, cpc); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a8357fd4f5fab..197c914119da8 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -145,8 +145,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->node_pages = NODE_MAPPING(sbi)->nrpages; if (sbi->meta_inode) si->meta_pages = META_MAPPING(sbi)->nrpages; - si->nats = NM_I(sbi)->nat_cnt; - si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; + si->nats = NM_I(sbi)->nat_cnt[TOTAL_NAT]; + si->dirty_nats = NM_I(sbi)->nat_cnt[DIRTY_NAT]; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID]; @@ -278,9 +278,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] + NM_I(sbi)->nid_cnt[PREALLOC_NID]) * sizeof(struct free_nid); - si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); - si->cache_mem += NM_I(sbi)->dirty_nat_cnt * - sizeof(struct nat_entry_set); + si->cache_mem += NM_I(sbi)->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->nat_cnt[DIRTY_NAT] * + sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); for (i = 0; i < MAX_INO_ENTRY; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8cba43eb0b24a..14ace0e21d389 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -896,6 +896,13 @@ enum nid_state { MAX_NID_STATE, };
+enum nat_state { + TOTAL_NAT, + DIRTY_NAT, + RECLAIMABLE_NAT, + MAX_NAT_STATE, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -911,8 +918,7 @@ struct f2fs_nm_info { struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ - unsigned int nat_cnt; /* the # of cached nat entries */ - unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ unsigned int nat_blocks; /* # of nat blocks */
/* free node ids management */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d5d8ce077f295..56505b8e2b8d3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -62,8 +62,8 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { - mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_SHIFT; + mem_size = (nm_i->nat_cnt[TOTAL_NAT] * + sizeof(struct nat_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); if (excess_cached_nats(sbi)) res = false; @@ -177,7 +177,8 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, list_add_tail(&ne->list, &nm_i->nat_entries); spin_unlock(&nm_i->nat_list_lock);
- nm_i->nat_cnt++; + nm_i->nat_cnt[TOTAL_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; return ne; }
@@ -207,7 +208,8 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) { radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); - nm_i->nat_cnt--; + nm_i->nat_cnt[TOTAL_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; __free_nat_entry(e); }
@@ -253,7 +255,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, if (get_nat_flag(ne, IS_DIRTY)) goto refresh_list;
- nm_i->dirty_nat_cnt++; + nm_i->nat_cnt[DIRTY_NAT]++; + nm_i->nat_cnt[RECLAIMABLE_NAT]--; set_nat_flag(ne, IS_DIRTY, true); refresh_list: spin_lock(&nm_i->nat_list_lock); @@ -273,7 +276,8 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
set_nat_flag(ne, IS_DIRTY, false); set->entry_cnt--; - nm_i->dirty_nat_cnt--; + nm_i->nat_cnt[DIRTY_NAT]--; + nm_i->nat_cnt[RECLAIMABLE_NAT]++; }
static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -2944,14 +2948,17 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) LIST_HEAD(sets); int err = 0;
- /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ + /* + * during unmount, let's flush nat_bits before checking + * nat_cnt[DIRTY_NAT]. + */ if (enabled_nat_bits(sbi, cpc)) { down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); up_write(&nm_i->nat_tree_lock); }
- if (!nm_i->dirty_nat_cnt) + if (!nm_i->nat_cnt[DIRTY_NAT]) return 0;
down_write(&nm_i->nat_tree_lock); @@ -2962,7 +2969,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * into nat entry set. */ if (enabled_nat_bits(sbi, cpc) || - !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + !__has_cursum_space(journal, + nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi);
while ((found = __gang_lookup_nat_set(nm_i, @@ -3086,7 +3094,6 @@ static int init_node_manager(struct f2fs_sb_info *sbi) F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID] = 0; nm_i->nid_cnt[PREALLOC_NID] = 0; - nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; @@ -3220,7 +3227,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) __del_from_nat_cache(nm_i, natvec[idx]); } } - f2fs_bug_on(sbi, nm_i->nat_cnt); + f2fs_bug_on(sbi, nm_i->nat_cnt[TOTAL_NAT]);
/* destroy nat set cache */ nid = 0; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 69e5859e993cf..f84541b57acbb 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -126,13 +126,13 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + return NM_I(sbi)->nat_cnt[DIRTY_NAT] >= NM_I(sbi)->max_nid * NM_I(sbi)->dirty_nats_ratio / 100; }
static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; + return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; }
static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index d66de5999a26d..dd3c3c7a90ec8 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -18,9 +18,7 @@ static unsigned int shrinker_run_no;
static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; - - return count > 0 ? count : 0; + return NM_I(sbi)->nat_cnt[RECLAIMABLE_NAT]; }
static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
From: Daeho Jeong daehojeong@google.com
[ Upstream commit 6422a71ef40e4751d59b8c9412e7e2dafe085878 ]
I found out f2fs_free_dic() is invoked in a wrong timing, but f2fs_verify_bio() still needed the dic info and it triggered the below kernel panic. It has been caused by the race condition of pending_pages value between decompression and verity logic, when the same compression cluster had been split in different bios. By split bios, f2fs_verify_bio() ended up with decreasing pending_pages value before it is reset to nr_cpages by f2fs_decompress_pages() and caused the kernel panic.
[ 4416.564763] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 ... [ 4416.896016] Workqueue: fsverity_read_queue f2fs_verity_work [ 4416.908515] pc : fsverity_verify_page+0x20/0x78 [ 4416.913721] lr : f2fs_verify_bio+0x11c/0x29c [ 4416.913722] sp : ffffffc019533cd0 [ 4416.913723] x29: ffffffc019533cd0 x28: 0000000000000402 [ 4416.913724] x27: 0000000000000001 x26: 0000000000000100 [ 4416.913726] x25: 0000000000000001 x24: 0000000000000004 [ 4416.913727] x23: 0000000000001000 x22: 0000000000000000 [ 4416.913728] x21: 0000000000000000 x20: ffffffff2076f9c0 [ 4416.913729] x19: ffffffff2076f9c0 x18: ffffff8a32380c30 [ 4416.913731] x17: ffffffc01f966d97 x16: 0000000000000298 [ 4416.913732] x15: 0000000000000000 x14: 0000000000000000 [ 4416.913733] x13: f074faec89ffffff x12: 0000000000000000 [ 4416.913734] x11: 0000000000001000 x10: 0000000000001000 [ 4416.929176] x9 : ffffffff20d1f5c7 x8 : 0000000000000000 [ 4416.929178] x7 : 626d7464ff286b6b x6 : ffffffc019533ade [ 4416.929179] x5 : 000000008049000e x4 : ffffffff2793e9e0 [ 4416.929180] x3 : 000000008049000e x2 : ffffff89ecfa74d0 [ 4416.929181] x1 : 0000000000000c40 x0 : ffffffff2076f9c0 [ 4416.929184] Call trace: [ 4416.929187] fsverity_verify_page+0x20/0x78 [ 4416.929189] f2fs_verify_bio+0x11c/0x29c [ 4416.929192] f2fs_verity_work+0x58/0x84 [ 4417.050667] process_one_work+0x270/0x47c [ 4417.055354] worker_thread+0x27c/0x4d8 [ 4417.059784] kthread+0x13c/0x320 [ 4417.063693] ret_from_fork+0x10/0x18
Chao pointed this can happen by the below race condition.
Thread A f2fs_post_read_wq fsverity_wq - f2fs_read_multi_pages() - f2fs_alloc_dic - dic->pending_pages = 2 - submit_bio() - submit_bio() - f2fs_post_read_work() handle first bio - f2fs_decompress_work() - __read_end_io() - f2fs_decompress_pages() - dic->pending_pages-- - enqueue f2fs_verity_work() - f2fs_verity_work() handle first bio - f2fs_verify_bio() - dic->pending_pages-- - f2fs_post_read_work() handle second bio - f2fs_decompress_work() - enqueue f2fs_verity_work() - f2fs_verify_pages() - f2fs_free_dic()
- f2fs_verity_work() handle second bio - f2fs_verfy_bio() - use-after-free on dic
Signed-off-by: Daeho Jeong daehojeong@google.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/compress.c | 2 -- fs/f2fs/data.c | 58 +++++++++++++++++++++++++++++++++++++--------- fs/f2fs/f2fs.h | 1 + 3 files changed, 48 insertions(+), 13 deletions(-)
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 14262e0f1cd60..c5fee4d7ea72f 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -798,8 +798,6 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) if (cops->destroy_decompress_ctx) cops->destroy_decompress_ctx(dic); out_free_dic: - if (verity) - atomic_set(&dic->pending_pages, dic->nr_cpages); if (!verity) f2fs_decompress_end_io(dic->rpages, dic->cluster_size, ret, false); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index be4da52604edc..b29243ee1c3e5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -202,7 +202,7 @@ static void f2fs_verify_bio(struct bio *bio) dic = (struct decompress_io_ctx *)page_private(page);
if (dic) { - if (atomic_dec_return(&dic->pending_pages)) + if (atomic_dec_return(&dic->verity_pages)) continue; f2fs_verify_pages(dic->rpages, dic->cluster_size); @@ -1027,7 +1027,8 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx, bool for_write) + pgoff_t first_idx, bool for_write, + bool for_verity) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; @@ -1049,7 +1050,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, post_read_steps |= 1 << STEP_DECRYPT; if (f2fs_compressed_file(inode)) post_read_steps |= 1 << STEP_DECOMPRESS_NOWQ; - if (f2fs_need_verity(inode, first_idx)) + if (for_verity && f2fs_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY;
if (post_read_steps) { @@ -1079,7 +1080,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page, struct bio *bio;
bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, - page->index, for_write); + page->index, for_write, true); if (IS_ERR(bio)) return PTR_ERR(bio);
@@ -2133,7 +2134,7 @@ static int f2fs_read_single_page(struct inode *inode, struct page *page, if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, is_readahead ? REQ_RAHEAD : 0, page->index, - false); + false, true); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2180,6 +2181,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, const unsigned blkbits = inode->i_blkbits; const unsigned blocksize = 1 << blkbits; struct decompress_io_ctx *dic = NULL; + struct bio_post_read_ctx *ctx; + bool for_verity = false; int i; int ret = 0;
@@ -2245,10 +2248,29 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; }
+ /* + * It's possible to enable fsverity on the fly when handling a cluster, + * which requires complicated error handling. Instead of adding more + * complexity, let's give a rule where end_io post-processes fsverity + * per cluster. In order to do that, we need to submit bio, if previous + * bio sets a different post-process policy. + */ + if (fsverity_active(cc->inode)) { + atomic_set(&dic->verity_pages, cc->nr_cpages); + for_verity = true; + + if (bio) { + ctx = bio->bi_private; + if (!(ctx->enabled_steps & (1 << STEP_VERITY))) { + __submit_bio(sbi, bio, DATA); + bio = NULL; + } + } + } + for (i = 0; i < dic->nr_cpages; i++) { struct page *page = dic->cpages[i]; block_t blkaddr; - struct bio_post_read_ctx *ctx;
blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); @@ -2264,17 +2286,31 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, - page->index, for_write); + page->index, for_write, for_verity); if (IS_ERR(bio)) { + unsigned int remained = dic->nr_cpages - i; + bool release = false; + ret = PTR_ERR(bio); dic->failed = true; - if (!atomic_sub_return(dic->nr_cpages - i, - &dic->pending_pages)) { + + if (for_verity) { + if (!atomic_sub_return(remained, + &dic->verity_pages)) + release = true; + } else { + if (!atomic_sub_return(remained, + &dic->pending_pages)) + release = true; + } + + if (release) { f2fs_decompress_end_io(dic->rpages, - cc->cluster_size, true, - false); + cc->cluster_size, true, + false); f2fs_free_dic(dic); } + f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 14ace0e21d389..72a25387f49bc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1412,6 +1412,7 @@ struct decompress_io_ctx { size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ atomic_t pending_pages; /* in-flight compressed page count */ + atomic_t verity_pages; /* in-flight page count for verity */ bool failed; /* indicate IO error during decompression */ void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */
From: Jessica Yu jeyu@kernel.org
[ Upstream commit 38dc717e97153e46375ee21797aa54777e5498f3 ]
Apparently there has been a longstanding race between udev/systemd and the module loader. Currently, the module loader sends a uevent right after sysfs initialization, but before the module calls its init function. However, some udev rules expect that the module has initialized already upon receiving the uevent.
This race has been triggered recently (see link in references) in some systemd mount unit files. For instance, the configfs module creates the /sys/kernel/config mount point in its init function, however the module loader issues the uevent before this happens. sys-kernel-config.mount expects to be able to mount /sys/kernel/config upon receipt of the module loading uevent, but if the configfs module has not called its init function yet, then this directory will not exist and the mount unit fails. A similar situation exists for sys-fs-fuse-connections.mount, as the fuse sysfs mount point is created during the fuse module's init function. If udev is faster than module initialization then the mount unit would fail in a similar fashion.
To fix this race, delay the module KOBJ_ADD uevent until after the module has finished calling its init routine.
References: https://github.com/systemd/systemd/issues/17586 Reviewed-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Tested-By: Nicolas Morey-Chaisemartin nmoreychaisemartin@suse.com Signed-off-by: Jessica Yu jeyu@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/module.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/module.c b/kernel/module.c index b34235082394b..e20499309b2af 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1895,7 +1895,6 @@ static int mod_sysfs_init(struct module *mod) if (err) mod_kobject_put(mod);
- /* delay uevent until full sysfs population */ out: return err; } @@ -1932,7 +1931,6 @@ static int mod_sysfs_setup(struct module *mod, add_sect_attrs(mod, info); add_notes_attrs(mod, info);
- kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0;
out_unreg_modinfo_attrs: @@ -3639,6 +3637,9 @@ static noinline int do_init_module(struct module *mod) blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod);
+ /* Delay uevent until module has finished its init routine */ + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); + /* * We need to finish all async code before the module init sequence * is done. This has potential to deadlock. For example, a newly
From: Nicholas Piggin npiggin@gmail.com
[ Upstream commit 59d512e4374b2d8a6ad341475dc94c4a4bdec7d3 ]
This is way to catch some cases of decrementer overflow, when the decrementer has underflowed an odd number of times, while MSR[EE] was disabled.
With a typical small decrementer, a timer that fires when MSR[EE] is disabled will be "lost" if MSR[EE] remains disabled for between 4.3 and 8.6 seconds after the timer expires. In any case, the decrementer interrupt would be taken at 8.6 seconds and the timer would be found at that point.
So this check is for catching extreme latency events, and it prevents those latencies from being a further few seconds long. It's not obvious this is a good tradeoff. This is already a watchdog magnitude event and that situation is not improved a significantly with this check. For large decrementers, it's useless.
Therefore remove this check, which avoids a mftb when enabling hard disabled interrupts (e.g., when enabling after coming from hardware interrupt handlers). Perhaps more importantly, it also removes the clunky MSR[EE] vs PACA_IRQ_HARD_DIS incoherency in soft-interrupt replay which simplifies the code.
Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Michael Ellerman mpe@ellerman.id.au Link: https://lore.kernel.org/r/20201107014336.2337337-1-npiggin@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/powerpc/kernel/irq.c | 53 ++------------------------- arch/powerpc/kernel/time.c | 9 ++--- arch/powerpc/platforms/powernv/opal.c | 2 +- 3 files changed, 8 insertions(+), 56 deletions(-)
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 7d0f7682d01df..6b1eca53e36cc 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -102,14 +102,6 @@ static inline notrace unsigned long get_irq_happened(void) return happened; }
-static inline notrace int decrementer_check_overflow(void) -{ - u64 now = get_tb(); - u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); - - return now >= *next_tb; -} - #ifdef CONFIG_PPC_BOOK3E
/* This is called whenever we are re-enabling interrupts @@ -142,35 +134,6 @@ notrace unsigned int __check_irq_replay(void) trace_hardirqs_on(); trace_hardirqs_off();
- /* - * We are always hard disabled here, but PACA_IRQ_HARD_DIS may - * not be set, which means interrupts have only just been hard - * disabled as part of the local_irq_restore or interrupt return - * code. In that case, skip the decrementr check becaus it's - * expensive to read the TB. - * - * HARD_DIS then gets cleared here, but it's reconciled later. - * Either local_irq_disable will replay the interrupt and that - * will reconcile state like other hard interrupts. Or interrupt - * retur will replay the interrupt and in that case it sets - * PACA_IRQ_HARD_DIS by hand (see comments in entry_64.S). - */ - if (happened & PACA_IRQ_HARD_DIS) { - local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; - - /* - * We may have missed a decrementer interrupt if hard disabled. - * Check the decrementer register in case we had a rollover - * while hard disabled. - */ - if (!(happened & PACA_IRQ_DEC)) { - if (decrementer_check_overflow()) { - local_paca->irq_happened |= PACA_IRQ_DEC; - happened |= PACA_IRQ_DEC; - } - } - } - if (happened & PACA_IRQ_DEC) { local_paca->irq_happened &= ~PACA_IRQ_DEC; return 0x900; @@ -186,6 +149,9 @@ notrace unsigned int __check_irq_replay(void) return 0x280; }
+ if (happened & PACA_IRQ_HARD_DIS) + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + /* There should be nothing left ! */ BUG_ON(local_paca->irq_happened != 0);
@@ -229,18 +195,6 @@ void replay_soft_interrupts(void) if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) WARN_ON_ONCE(mfmsr() & MSR_EE);
- if (happened & PACA_IRQ_HARD_DIS) { - /* - * We may have missed a decrementer interrupt if hard disabled. - * Check the decrementer register in case we had a rollover - * while hard disabled. - */ - if (!(happened & PACA_IRQ_DEC)) { - if (decrementer_check_overflow()) - happened |= PACA_IRQ_DEC; - } - } - /* * Force the delivery of pending soft-disabled interrupts on PS3. * Any HV call will have this side effect. @@ -345,6 +299,7 @@ notrace void arch_local_irq_restore(unsigned long mask) if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) WARN_ON_ONCE(!(mfmsr() & MSR_EE)); __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; } else { /* * We should already be hard disabled here. We had bugs diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 74efe46f55327..7d372ff3504b2 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -552,14 +552,11 @@ void timer_interrupt(struct pt_regs *regs) struct pt_regs *old_regs; u64 now;
- /* Some implementations of hotplug will get timer interrupts while - * offline, just ignore these and we also need to set - * decrementers_next_tb as MAX to make sure __check_irq_replay - * don't replay timer interrupt when return, otherwise we'll trap - * here infinitely :( + /* + * Some implementations of hotplug will get timer interrupts while + * offline, just ignore these. */ if (unlikely(!cpu_online(smp_processor_id()))) { - *next_tb = ~(u64)0; set_dec(decrementer_max); return; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index d95954ad4c0af..c61c3b62c8c62 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -731,7 +731,7 @@ int opal_hmi_exception_early2(struct pt_regs *regs) return 1; }
-/* HMI exception handler called in virtual mode during check_irq_replay. */ +/* HMI exception handler called in virtual mode when irqs are next enabled. */ int opal_handle_hmi_exception(struct pt_regs *regs) { /*
From: Chao Yu yuchao0@huawei.com
[ Upstream commit e584bbe821229a3e7cc409eecd51df66f9268c21 ]
syzbot reported a bug which could cause shift-out-of-bounds issue, fix it.
Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395 sanity_check_raw_super fs/f2fs/super.c:2812 [inline] read_raw_super_block fs/f2fs/super.c:3267 [inline] f2fs_fill_super.cold+0x16c9/0x16f6 fs/f2fs/super.c:3519 mount_bdev+0x34d/0x410 fs/super.c:1366 legacy_get_tree+0x105/0x220 fs/fs_context.c:592 vfs_get_tree+0x89/0x2f0 fs/super.c:1496 do_new_mount fs/namespace.c:2896 [inline] path_mount+0x12ae/0x1e70 fs/namespace.c:3227 do_mount fs/namespace.c:3240 [inline] __do_sys_mount fs/namespace.c:3448 [inline] __se_sys_mount fs/namespace.c:3425 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3425 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9
Reported-by: syzbot+ca9a785f8ac472085994@syzkaller.appspotmail.com Signed-off-by: Anant Thazhemadam anant.thazhemadam@gmail.com Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/super.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ba859eeca69a3..822ae8c8fa39a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2744,7 +2744,6 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, block_t total_sections, blocks_per_seg; struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); - unsigned int blocksize; size_t crc_offset = 0; __u32 crc = 0;
@@ -2778,10 +2777,10 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, }
/* Currently, support only 4KB block size */ - blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != F2FS_BLKSIZE) { - f2fs_info(sbi, "Invalid blocksize (%u), supports only 4KB", - blocksize); + if (le32_to_cpu(raw_super->log_blocksize) != F2FS_BLKSIZE_BITS) { + f2fs_info(sbi, "Invalid log_blocksize (%u), supports only %u", + le32_to_cpu(raw_super->log_blocksize), + F2FS_BLKSIZE_BITS); return -EFSCORRUPTED; }
From: Eric Biggers ebiggers@google.com
[ Upstream commit edf7ddbf1c5eb98b720b063b73e20e8a4a1ce673 ]
Missing calls to mntget() (or equivalently, too many calls to mntput()) are hard to detect because mntput() delays freeing mounts using task_work_add(), then again using call_rcu(). As a result, mnt_count can often be decremented to -1 without getting a KASAN use-after-free report. Such cases are still bugs though, and they point to real use-after-frees being possible.
For an example of this, see the bug fixed by commit 1b0b9cc8d379 ("vfs: fsmount: add missing mntget()"), discussed at https://lkml.kernel.org/linux-fsdevel/20190605135401.GB30925@xxxxxxxxxxxxxxx.... This bug *should* have been trivial to find. But actually, it wasn't found until syzkaller happened to use fchdir() to manipulate the reference count just right for the bug to be noticeable.
Address this by making mntput_no_expire() issue a WARN if mnt_count has become negative.
Suggested-by: Miklos Szeredi miklos@szeredi.hu Signed-off-by: Eric Biggers ebiggers@google.com Signed-off-by: Al Viro viro@zeniv.linux.org.uk Signed-off-by: Sasha Levin sashal@kernel.org --- fs/namespace.c | 9 ++++++--- fs/pnode.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c index cebaa3e817940..93006abe7946a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -156,10 +156,10 @@ static inline void mnt_add_count(struct mount *mnt, int n) /* * vfsmount lock must be held for write */ -unsigned int mnt_get_count(struct mount *mnt) +int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP - unsigned int count = 0; + int count = 0; int cpu;
for_each_possible_cpu(cpu) { @@ -1139,6 +1139,7 @@ static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { LIST_HEAD(list); + int count;
rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { @@ -1162,7 +1163,9 @@ static void mntput_no_expire(struct mount *mnt) */ smp_mb(); mnt_add_count(mnt, -1); - if (mnt_get_count(mnt)) { + count = mnt_get_count(mnt); + if (count != 0) { + WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; diff --git a/fs/pnode.h b/fs/pnode.h index 49a058c73e4c7..26f74e092bd98 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -44,7 +44,7 @@ int propagate_mount_busy(struct mount *, int); void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); -unsigned int mnt_get_count(struct mount *mnt); +int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
From: Zhang Qilong zhangqilong3@huawei.com
[ Upstream commit 8711071e9700b67045fe5518161d63f7a03e3c9e ]
pm_runtime_get_sync() will increment pm usage counter even it failed. Forgetting to call pm_runtime_put_noidle will result in reference leak in rti_wdt_probe, so we should fix it.
Signed-off-by: Zhang Qilong zhangqilong3@huawei.com Reviewed-by: Guenter Roeck linux@roeck-us.net Link: https://lore.kernel.org/r/20201030154909.100023-1-zhangqilong3@huawei.com Signed-off-by: Guenter Roeck linux@roeck-us.net Signed-off-by: Wim Van Sebroeck wim@linux-watchdog.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/watchdog/rti_wdt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/watchdog/rti_wdt.c b/drivers/watchdog/rti_wdt.c index 836319cbaca9d..359302f71f7ef 100644 --- a/drivers/watchdog/rti_wdt.c +++ b/drivers/watchdog/rti_wdt.c @@ -227,8 +227,10 @@ static int rti_wdt_probe(struct platform_device *pdev)
pm_runtime_enable(dev); ret = pm_runtime_get_sync(dev); - if (ret) + if (ret) { + pm_runtime_put_noidle(dev); return dev_err_probe(dev, ret, "runtime pm failed\n"); + }
platform_set_drvdata(pdev, wdt);
From: Christopher Obbard chris.obbard@collabora.com
[ Upstream commit 72d3e093afae79611fa38f8f2cfab9a888fe66f2 ]
The UML random driver creates a dummy device under the guest, /dev/hw_random. When this file is read from the guest, the driver reads from the host machine's /dev/random, in-turn reading from the host kernel's entropy pool. This entropy pool could have been filled by a hardware random number generator or just the host kernel's internal software entropy generator.
Currently the driver does not fill the guests kernel entropy pool, this requires a userspace tool running inside the guest (like rng-tools) to read from the dummy device provided by this driver, which then would fill the guest's internal entropy pool.
This all seems quite pointless when we are already reading from an entropy pool, so this patch aims to register the device as a hwrng device using the hwrng-core framework. This not only improves and cleans up the driver, but also fills the guest's entropy pool without having to resort to using extra userspace tools in the guest.
This is typically a nuisance when booting a guest: the random pool takes a long time (~200s) to build up enough entropy since the dummy hwrng is not used to fill the guest's pool.
This port was originally attempted by Alexander Neville "dark" (in CC, discussion in Link), but the conversation there stalled since the handling of -EAGAIN errors were no removed and longer handled by the driver. This patch attempts to use the existing method of error handling but utilises the new hwrng core.
The issue can be noticed when booting a UML guest:
[ 2.560000] random: fast init done [ 214.000000] random: crng init done
With the patch applied, filling the pool becomes a lot quicker:
[ 2.560000] random: fast init done [ 12.000000] random: crng init done
Cc: Alexander Neville dark@volatile.bz Link: https://lore.kernel.org/lkml/20190828204609.02a7ff70@TheDarkness/ Link: https://lore.kernel.org/lkml/20190829135001.6a5ff940@TheDarkness.local/ Cc: Sjoerd Simons sjoerd.simons@collabora.co.uk Signed-off-by: Christopher Obbard chris.obbard@collabora.com Acked-by: Anton Ivanov anton.ivanov@cambridgegreys.com Signed-off-by: Richard Weinberger richard@nod.at Signed-off-by: Sasha Levin sashal@kernel.org --- arch/um/drivers/random.c | 101 ++++++++------------------------- drivers/char/hw_random/Kconfig | 16 +++--- 2 files changed, 33 insertions(+), 84 deletions(-)
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c index ce115fce52f02..e4b9b2ce9abf4 100644 --- a/arch/um/drivers/random.c +++ b/arch/um/drivers/random.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/interrupt.h> #include <linux/miscdevice.h> +#include <linux/hw_random.h> #include <linux/delay.h> #include <linux/uaccess.h> #include <init.h> @@ -18,9 +19,8 @@ #include <os.h>
/* - * core module and version information + * core module information */ -#define RNG_VERSION "1.0.0" #define RNG_MODULE_NAME "hw_random"
/* Changed at init time, in the non-modular case, and at module load @@ -28,88 +28,36 @@ * protects against a module being loaded twice at the same time. */ static int random_fd = -1; -static DECLARE_WAIT_QUEUE_HEAD(host_read_wait); +static struct hwrng hwrng = { 0, }; +static DECLARE_COMPLETION(have_data);
-static int rng_dev_open (struct inode *inode, struct file *filp) +static int rng_dev_read(struct hwrng *rng, void *buf, size_t max, bool block) { - /* enforce read-only access to this chrdev */ - if ((filp->f_mode & FMODE_READ) == 0) - return -EINVAL; - if ((filp->f_mode & FMODE_WRITE) != 0) - return -EINVAL; + int ret;
- return 0; -} - -static atomic_t host_sleep_count = ATOMIC_INIT(0); - -static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size, - loff_t *offp) -{ - u32 data; - int n, ret = 0, have_data; - - while (size) { - n = os_read_file(random_fd, &data, sizeof(data)); - if (n > 0) { - have_data = n; - while (have_data && size) { - if (put_user((u8) data, buf++)) { - ret = ret ? : -EFAULT; - break; - } - size--; - ret++; - have_data--; - data >>= 8; - } - } - else if (n == -EAGAIN) { - DECLARE_WAITQUEUE(wait, current); - - if (filp->f_flags & O_NONBLOCK) - return ret ? : -EAGAIN; - - atomic_inc(&host_sleep_count); + for (;;) { + ret = os_read_file(random_fd, buf, max); + if (block && ret == -EAGAIN) { add_sigio_fd(random_fd);
- add_wait_queue(&host_read_wait, &wait); - set_current_state(TASK_INTERRUPTIBLE); + ret = wait_for_completion_killable(&have_data);
- schedule(); - remove_wait_queue(&host_read_wait, &wait); + ignore_sigio_fd(random_fd); + deactivate_fd(random_fd, RANDOM_IRQ);
- if (atomic_dec_and_test(&host_sleep_count)) { - ignore_sigio_fd(random_fd); - deactivate_fd(random_fd, RANDOM_IRQ); - } + if (ret < 0) + break; + } else { + break; } - else - return n; - - if (signal_pending (current)) - return ret ? : -ERESTARTSYS; } - return ret; -}
-static const struct file_operations rng_chrdev_ops = { - .owner = THIS_MODULE, - .open = rng_dev_open, - .read = rng_dev_read, - .llseek = noop_llseek, -}; - -/* rng_init shouldn't be called more than once at boot time */ -static struct miscdevice rng_miscdev = { - HWRNG_MINOR, - RNG_MODULE_NAME, - &rng_chrdev_ops, -}; + return ret != -EAGAIN ? ret : 0; +}
static irqreturn_t random_interrupt(int irq, void *data) { - wake_up(&host_read_wait); + complete(&have_data);
return IRQ_HANDLED; } @@ -126,18 +74,19 @@ static int __init rng_init (void) goto out;
random_fd = err; - err = um_request_irq(RANDOM_IRQ, random_fd, IRQ_READ, random_interrupt, 0, "random", NULL); if (err) goto err_out_cleanup_hw;
sigio_broken(random_fd, 1); + hwrng.name = RNG_MODULE_NAME; + hwrng.read = rng_dev_read; + hwrng.quality = 1024;
- err = misc_register (&rng_miscdev); + err = hwrng_register(&hwrng); if (err) { - printk (KERN_ERR RNG_MODULE_NAME ": misc device register " - "failed\n"); + pr_err(RNG_MODULE_NAME " registering failed (%d)\n", err); goto err_out_cleanup_hw; } out: @@ -161,8 +110,8 @@ static void cleanup(void)
static void __exit rng_cleanup(void) { + hwrng_unregister(&hwrng); os_close_file(random_fd); - misc_deregister (&rng_miscdev); }
module_init (rng_init); diff --git a/drivers/char/hw_random/Kconfig b/drivers/char/hw_random/Kconfig index e92c4d9469d82..5952210526aaa 100644 --- a/drivers/char/hw_random/Kconfig +++ b/drivers/char/hw_random/Kconfig @@ -540,15 +540,15 @@ endif # HW_RANDOM
config UML_RANDOM depends on UML - tristate "Hardware random number generator" + select HW_RANDOM + tristate "UML Random Number Generator support" help This option enables UML's "hardware" random number generator. It attaches itself to the host's /dev/random, supplying as much entropy as the host has, rather than the small amount the UML gets from its - own drivers. It registers itself as a standard hardware random number - generator, major 10, minor 183, and the canonical device name is - /dev/hwrng. - The way to make use of this is to install the rng-tools package - (check your distro, or download from - http://sourceforge.net/projects/gkernel/). rngd periodically reads - /dev/hwrng and injects the entropy into /dev/random. + own drivers. It registers itself as a rng-core driver thus providing + a device which is usually called /dev/hwrng. This hardware random + number generator does feed into the kernel's random number generator + entropy pool. + + If unsure, say Y.
From: Gabriel Krisman Bertazi krisman@collabora.com
[ Upstream commit fc6b6a872dcd48c6f39c7975836d75113db67d37 ]
Internally, UBD treats each physical IO segment as a separate command to be submitted in the execution pipe. If the pipe returns a transient error after a few segments have already been written, UBD will tell the block layer to requeue the request, but there is no way to reclaim the segments already submitted. When a new attempt to dispatch the request is done, those segments already submitted will get duplicated, causing the WARN_ON below in the best case, and potentially data corruption.
In my system, running a UML instance with 2GB of RAM and a 50M UBD disk, I can reproduce the WARN_ON by simply running mkfs.fvat against the disk on a freshly booted system.
There are a few ways to around this, like reducing the pressure on the pipe by reducing the queue depth, which almost eliminates the occurrence of the problem, increasing the pipe buffer size on the host system, or by limiting the request to one physical segment, which causes the block layer to submit way more requests to resolve a single operation.
Instead, this patch modifies the format of a UBD command, such that all segments are sent through a single element in the communication pipe, turning the command submission atomic from the point of view of the block layer. The new format has a variable size, depending on the number of elements, and looks like this:
+------------+-----------+-----------+------------ | cmd_header | segment 0 | segment 1 | segment ... +------------+-----------+-----------+------------
With this format, we push a pointer to cmd_header in the submission pipe.
This has the advantage of reducing the memory footprint of executing a single request, since it allow us to merge some fields in the header. It is possible to reduce even further each segment memory footprint, by merging bitmap_words and cow_offset, for instance, but this is not the focus of this patch and is left as future work. One issue with the patch is that for a big number of segments, we now perform one big memory allocation instead of multiple small ones, but I wasn't able to trigger any real issues or -ENOMEM because of this change, that wouldn't be reproduced otherwise.
This was tested using fio with the verify-crc32 option, and by running an ext4 filesystem over this UBD device.
The original WARN_ON was:
------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at lib/refcount.c:28 refcount_warn_saturate+0x13f/0x141 refcount_t: underflow; use-after-free. Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.5.0-rc6-00002-g2a5bb2cf75c8 #346 Stack: 6084eed0 6063dc77 00000009 6084ef60 00000000 604b8d9f 6084eee0 6063dcbc 6084ef40 6006ab8d e013d780 1c00000000 Call Trace: [<600a0c1c>] ? printk+0x0/0x94 [<6004a888>] show_stack+0x13b/0x155 [<6063dc77>] ? dump_stack_print_info+0xdf/0xe8 [<604b8d9f>] ? refcount_warn_saturate+0x13f/0x141 [<6063dcbc>] dump_stack+0x2a/0x2c [<6006ab8d>] __warn+0x107/0x134 [<6008da6c>] ? wake_up_process+0x17/0x19 [<60487628>] ? blk_queue_max_discard_sectors+0x0/0xd [<6006b05f>] warn_slowpath_fmt+0xd1/0xdf [<6006af8e>] ? warn_slowpath_fmt+0x0/0xdf [<600acc14>] ? raw_read_seqcount_begin.constprop.0+0x0/0x15 [<600619ae>] ? os_nsecs+0x1d/0x2b [<604b8d9f>] refcount_warn_saturate+0x13f/0x141 [<6048bc8f>] refcount_sub_and_test.constprop.0+0x2f/0x37 [<6048c8de>] blk_mq_free_request+0xf1/0x10d [<6048ca06>] __blk_mq_end_request+0x10c/0x114 [<6005ac0f>] ubd_intr+0xb5/0x169 [<600a1a37>] __handle_irq_event_percpu+0x6b/0x17e [<600a1b70>] handle_irq_event_percpu+0x26/0x69 [<600a1bd9>] handle_irq_event+0x26/0x34 [<600a1bb3>] ? handle_irq_event+0x0/0x34 [<600a5186>] ? unmask_irq+0x0/0x37 [<600a57e6>] handle_edge_irq+0xbc/0xd6 [<600a131a>] generic_handle_irq+0x21/0x29 [<60048f6e>] do_IRQ+0x39/0x54 [...] ---[ end trace c6e7444e55386c0f ]---
Cc: Christopher Obbard chris.obbard@collabora.com Reported-by: Martyn Welch martyn@collabora.com Signed-off-by: Gabriel Krisman Bertazi krisman@collabora.com Tested-by: Christopher Obbard chris.obbard@collabora.com Acked-by: Anton Ivanov anton.ivanov@cambridgegreys.com Signed-off-by: Richard Weinberger richard@nod.at Signed-off-by: Sasha Levin sashal@kernel.org --- arch/um/drivers/ubd_kern.c | 191 ++++++++++++++++++++++--------------- 1 file changed, 115 insertions(+), 76 deletions(-)
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index eae8c83364f71..b12c1b0d3e1d0 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -47,18 +47,25 @@ /* Max request size is determined by sector mask - 32K */ #define UBD_MAX_REQUEST (8 * sizeof(long))
+struct io_desc { + char *buffer; + unsigned long length; + unsigned long sector_mask; + unsigned long long cow_offset; + unsigned long bitmap_words[2]; +}; + struct io_thread_req { struct request *req; int fds[2]; unsigned long offsets[2]; unsigned long long offset; - unsigned long length; - char *buffer; int sectorsize; - unsigned long sector_mask; - unsigned long long cow_offset; - unsigned long bitmap_words[2]; int error; + + int desc_cnt; + /* io_desc has to be the last element of the struct */ + struct io_desc io_desc[]; };
@@ -525,12 +532,7 @@ static void ubd_handler(void) blk_queue_max_write_zeroes_sectors(io_req->req->q, 0); blk_queue_flag_clear(QUEUE_FLAG_DISCARD, io_req->req->q); } - if ((io_req->error) || (io_req->buffer == NULL)) - blk_mq_end_request(io_req->req, io_req->error); - else { - if (!blk_update_request(io_req->req, io_req->error, io_req->length)) - __blk_mq_end_request(io_req->req, io_req->error); - } + blk_mq_end_request(io_req->req, io_req->error); kfree(io_req); } } @@ -946,6 +948,7 @@ static int ubd_add(int n, char **error_out) blk_queue_write_cache(ubd_dev->queue, true, false);
blk_queue_max_segments(ubd_dev->queue, MAX_SG); + blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); if(err){ *error_out = "Failed to register device"; @@ -1289,37 +1292,74 @@ static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, *cow_offset += bitmap_offset; }
-static void cowify_req(struct io_thread_req *req, unsigned long *bitmap, +static void cowify_req(struct io_thread_req *req, struct io_desc *segment, + unsigned long offset, unsigned long *bitmap, __u64 bitmap_offset, __u64 bitmap_len) { - __u64 sector = req->offset >> SECTOR_SHIFT; + __u64 sector = offset >> SECTOR_SHIFT; int i;
- if (req->length > (sizeof(req->sector_mask) * 8) << SECTOR_SHIFT) + if (segment->length > (sizeof(segment->sector_mask) * 8) << SECTOR_SHIFT) panic("Operation too long");
if (req_op(req->req) == REQ_OP_READ) { - for (i = 0; i < req->length >> SECTOR_SHIFT; i++) { + for (i = 0; i < segment->length >> SECTOR_SHIFT; i++) { if(ubd_test_bit(sector + i, (unsigned char *) bitmap)) ubd_set_bit(i, (unsigned char *) - &req->sector_mask); + &segment->sector_mask); + } + } else { + cowify_bitmap(offset, segment->length, &segment->sector_mask, + &segment->cow_offset, bitmap, bitmap_offset, + segment->bitmap_words, bitmap_len); + } +} + +static void ubd_map_req(struct ubd *dev, struct io_thread_req *io_req, + struct request *req) +{ + struct bio_vec bvec; + struct req_iterator iter; + int i = 0; + unsigned long byte_offset = io_req->offset; + int op = req_op(req); + + if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) { + io_req->io_desc[0].buffer = NULL; + io_req->io_desc[0].length = blk_rq_bytes(req); + } else { + rq_for_each_segment(bvec, req, iter) { + BUG_ON(i >= io_req->desc_cnt); + + io_req->io_desc[i].buffer = + page_address(bvec.bv_page) + bvec.bv_offset; + io_req->io_desc[i].length = bvec.bv_len; + i++; + } + } + + if (dev->cow.file) { + for (i = 0; i < io_req->desc_cnt; i++) { + cowify_req(io_req, &io_req->io_desc[i], byte_offset, + dev->cow.bitmap, dev->cow.bitmap_offset, + dev->cow.bitmap_len); + byte_offset += io_req->io_desc[i].length; } + } - else cowify_bitmap(req->offset, req->length, &req->sector_mask, - &req->cow_offset, bitmap, bitmap_offset, - req->bitmap_words, bitmap_len); }
-static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, - u64 off, struct bio_vec *bvec) +static struct io_thread_req *ubd_alloc_req(struct ubd *dev, struct request *req, + int desc_cnt) { - struct ubd *dev = hctx->queue->queuedata; struct io_thread_req *io_req; - int ret; + int i;
- io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC); + io_req = kmalloc(sizeof(*io_req) + + (desc_cnt * sizeof(struct io_desc)), + GFP_ATOMIC); if (!io_req) - return -ENOMEM; + return NULL;
io_req->req = req; if (dev->cow.file) @@ -1327,26 +1367,41 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, else io_req->fds[0] = dev->fd; io_req->error = 0; - - if (bvec != NULL) { - io_req->buffer = page_address(bvec->bv_page) + bvec->bv_offset; - io_req->length = bvec->bv_len; - } else { - io_req->buffer = NULL; - io_req->length = blk_rq_bytes(req); - } - io_req->sectorsize = SECTOR_SIZE; io_req->fds[1] = dev->fd; - io_req->cow_offset = -1; - io_req->offset = off; - io_req->sector_mask = 0; + io_req->offset = (u64) blk_rq_pos(req) << SECTOR_SHIFT; io_req->offsets[0] = 0; io_req->offsets[1] = dev->cow.data_offset;
- if (dev->cow.file) - cowify_req(io_req, dev->cow.bitmap, - dev->cow.bitmap_offset, dev->cow.bitmap_len); + for (i = 0 ; i < desc_cnt; i++) { + io_req->io_desc[i].sector_mask = 0; + io_req->io_desc[i].cow_offset = -1; + } + + return io_req; +} + +static int ubd_submit_request(struct ubd *dev, struct request *req) +{ + int segs = 0; + struct io_thread_req *io_req; + int ret; + int op = req_op(req); + + if (op == REQ_OP_FLUSH) + segs = 0; + else if (op == REQ_OP_WRITE_ZEROES || op == REQ_OP_DISCARD) + segs = 1; + else + segs = blk_rq_nr_phys_segments(req); + + io_req = ubd_alloc_req(dev, req, segs); + if (!io_req) + return -ENOMEM; + + io_req->desc_cnt = segs; + if (segs) + ubd_map_req(dev, io_req, req);
ret = os_write_file(thread_fd, &io_req, sizeof(io_req)); if (ret != sizeof(io_req)) { @@ -1357,22 +1412,6 @@ static int ubd_queue_one_vec(struct blk_mq_hw_ctx *hctx, struct request *req, return ret; }
-static int queue_rw_req(struct blk_mq_hw_ctx *hctx, struct request *req) -{ - struct req_iterator iter; - struct bio_vec bvec; - int ret; - u64 off = (u64)blk_rq_pos(req) << SECTOR_SHIFT; - - rq_for_each_segment(bvec, req, iter) { - ret = ubd_queue_one_vec(hctx, req, off, &bvec); - if (ret < 0) - return ret; - off += bvec.bv_len; - } - return 0; -} - static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -1385,17 +1424,12 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, spin_lock_irq(&ubd_dev->lock);
switch (req_op(req)) { - /* operations with no lentgth/offset arguments */ case REQ_OP_FLUSH: - ret = ubd_queue_one_vec(hctx, req, 0, NULL); - break; case REQ_OP_READ: case REQ_OP_WRITE: - ret = queue_rw_req(hctx, req); - break; case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: - ret = ubd_queue_one_vec(hctx, req, (u64)blk_rq_pos(req) << 9, NULL); + ret = ubd_submit_request(ubd_dev, req); break; default: WARN_ON_ONCE(1); @@ -1483,22 +1517,22 @@ static int map_error(int error_code) * will result in unpredictable behaviour and/or crashes. */
-static int update_bitmap(struct io_thread_req *req) +static int update_bitmap(struct io_thread_req *req, struct io_desc *segment) { int n;
- if(req->cow_offset == -1) + if (segment->cow_offset == -1) return map_error(0);
- n = os_pwrite_file(req->fds[1], &req->bitmap_words, - sizeof(req->bitmap_words), req->cow_offset); - if (n != sizeof(req->bitmap_words)) + n = os_pwrite_file(req->fds[1], &segment->bitmap_words, + sizeof(segment->bitmap_words), segment->cow_offset); + if (n != sizeof(segment->bitmap_words)) return map_error(-n);
return map_error(0); }
-static void do_io(struct io_thread_req *req) +static void do_io(struct io_thread_req *req, struct io_desc *desc) { char *buf = NULL; unsigned long len; @@ -1513,21 +1547,20 @@ static void do_io(struct io_thread_req *req) return; }
- nsectors = req->length / req->sectorsize; + nsectors = desc->length / req->sectorsize; start = 0; do { - bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask); + bit = ubd_test_bit(start, (unsigned char *) &desc->sector_mask); end = start; while((end < nsectors) && - (ubd_test_bit(end, (unsigned char *) - &req->sector_mask) == bit)) + (ubd_test_bit(end, (unsigned char *) &desc->sector_mask) == bit)) end++;
off = req->offset + req->offsets[bit] + start * req->sectorsize; len = (end - start) * req->sectorsize; - if (req->buffer != NULL) - buf = &req->buffer[start * req->sectorsize]; + if (desc->buffer != NULL) + buf = &desc->buffer[start * req->sectorsize];
switch (req_op(req->req)) { case REQ_OP_READ: @@ -1567,7 +1600,8 @@ static void do_io(struct io_thread_req *req) start = end; } while(start < nsectors);
- req->error = update_bitmap(req); + req->offset += len; + req->error = update_bitmap(req, desc); }
/* Changed in start_io_thread, which is serialized by being called only @@ -1600,8 +1634,13 @@ int io_thread(void *arg) }
for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { + struct io_thread_req *req = (*io_req_buffer)[count]; + int i; + io_count++; - do_io((*io_req_buffer)[count]); + for (i = 0; !req->error && i < req->desc_cnt; i++) + do_io(req, &(req->io_desc[i])); + }
written = 0;
From: Johannes Berg johannes.berg@intel.com
[ Upstream commit ef4459a6da0955b533ebfc97a7d756ac090f50c9 ]
We've been running into stack overflows in helper threads corrupting memory (e.g. because somebody put printf() or os_info() there), so to avoid those causing hard-to-debug issues later on, allocate a guard page for helper thread stacks and mark it read-only.
Unfortunately, the crash dump at that point is useless as the stack tracer will try to backtrace the *kernel* thread, not the helper thread, but at least we don't survive to a random issue caused by corruption.
Signed-off-by: Johannes Berg johannes.berg@intel.com Signed-off-by: Richard Weinberger richard@nod.at Signed-off-by: Sasha Levin sashal@kernel.org --- arch/um/drivers/ubd_kern.c | 2 +- arch/um/include/shared/kern_util.h | 2 +- arch/um/kernel/process.c | 11 +++++++---- arch/um/os-Linux/helper.c | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index b12c1b0d3e1d0..e337702c30c78 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1195,7 +1195,7 @@ static int __init ubd_driver_init(void){ /* Letting ubd=sync be like using ubd#s= instead of ubd#= is * enough. So use anyway the io thread. */ } - stack = alloc_stack(0, 0); + stack = alloc_stack(0); io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *), &thread_fd); if(io_pid < 0){ diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index ccafb62e8ccec..c3ccbc2bbd2ce 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -19,7 +19,7 @@ extern int kmalloc_ok; #define UML_ROUND_UP(addr) \ ((((unsigned long) addr) + PAGE_SIZE - 1) & PAGE_MASK)
-extern unsigned long alloc_stack(int order, int atomic); +extern unsigned long alloc_stack(int atomic); extern void free_stack(unsigned long stack, int order);
struct pt_regs; diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 9505a7e87396a..bb352dc446870 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -32,6 +32,7 @@ #include <os.h> #include <skas.h> #include <linux/time-internal.h> +#include <asm/set_memory.h>
/* * This is a per-cpu array. A processor only modifies its entry and it only @@ -62,16 +63,18 @@ void free_stack(unsigned long stack, int order) free_pages(stack, order); }
-unsigned long alloc_stack(int order, int atomic) +unsigned long alloc_stack(int atomic) { - unsigned long page; + unsigned long addr; gfp_t flags = GFP_KERNEL;
if (atomic) flags = GFP_ATOMIC; - page = __get_free_pages(flags, order); + addr = __get_free_pages(flags, 1);
- return page; + set_memory_ro(addr, 1); + + return addr + PAGE_SIZE; }
static inline void set_current(struct task_struct *task) diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c index 9fa6e4187d4fb..feb48d796e005 100644 --- a/arch/um/os-Linux/helper.c +++ b/arch/um/os-Linux/helper.c @@ -45,7 +45,7 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) unsigned long stack, sp; int pid, fds[2], ret, n;
- stack = alloc_stack(0, __cant_sleep()); + stack = alloc_stack(__cant_sleep()); if (stack == 0) return -ENOMEM;
@@ -116,7 +116,7 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, unsigned long stack, sp; int pid, status, err;
- stack = alloc_stack(0, __cant_sleep()); + stack = alloc_stack(__cant_sleep()); if (stack == 0) return -ENOMEM;
On Wed, 2020-12-30 at 08:03 -0500, Sasha Levin wrote:
We've been running into stack overflows in helper threads corrupting memory
For the record, that was mostly referring to "while development", so while this change makes a few things a bit safer, I don't think there's all that much point in backporting to stable; presumably things are working OK there for people, and developers will (hopefully) be working on mainline anyway.
johannes
On Wed, Dec 30, 2020 at 03:48:57PM +0100, Johannes Berg wrote:
On Wed, 2020-12-30 at 08:03 -0500, Sasha Levin wrote:
We've been running into stack overflows in helper threads corrupting memory
For the record, that was mostly referring to "while development", so while this change makes a few things a bit safer, I don't think there's all that much point in backporting to stable; presumably things are working OK there for people, and developers will (hopefully) be working on mainline anyway.
I'll drop it, thanks!
From: Trond Myklebust trond.myklebust@hammerspace.com
[ Upstream commit 503b934a752f7e789a5f33217520e0a79f3096ac ]
Expanding the READ_PLUS extents can cause the read buffer to overflow. If it does, then don't error, but just exit early.
Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/nfs/nfs42xdr.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-)
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 8432bd6b95f08..c078f88552695 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1019,29 +1019,24 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re return decode_op_hdr(xdr, OP_DEALLOCATE); }
-static int decode_read_plus_data(struct xdr_stream *xdr, struct nfs_pgio_res *res, - uint32_t *eof) +static int decode_read_plus_data(struct xdr_stream *xdr, + struct nfs_pgio_res *res) { uint32_t count, recvd; uint64_t offset; __be32 *p;
p = xdr_inline_decode(xdr, 8 + 4); - if (unlikely(!p)) - return -EIO; + if (!p) + return 1;
p = xdr_decode_hyper(p, &offset); count = be32_to_cpup(p); recvd = xdr_align_data(xdr, res->count, count); res->count += recvd;
- if (count > recvd) { - dprintk("NFS: server cheating in read reply: " - "count %u > recvd %u\n", count, recvd); - *eof = 0; + if (count > recvd) return 1; - } - return 0; }
@@ -1052,18 +1047,16 @@ static int decode_read_plus_hole(struct xdr_stream *xdr, struct nfs_pgio_res *re __be32 *p;
p = xdr_inline_decode(xdr, 8 + 8); - if (unlikely(!p)) - return -EIO; + if (!p) + return 1;
p = xdr_decode_hyper(p, &offset); p = xdr_decode_hyper(p, &length); recvd = xdr_expand_hole(xdr, res->count, length); res->count += recvd;
- if (recvd < length) { - *eof = 0; + if (recvd < length) return 1; - } return 0; }
@@ -1088,12 +1081,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
for (i = 0; i < segments; i++) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - return -EIO; + if (!p) + goto early_out;
type = be32_to_cpup(p++); if (type == NFS4_CONTENT_DATA) - status = decode_read_plus_data(xdr, res, &eof); + status = decode_read_plus_data(xdr, res); else if (type == NFS4_CONTENT_HOLE) status = decode_read_plus_hole(xdr, res, &eof); else @@ -1102,12 +1095,17 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) if (status < 0) return status; if (status > 0) - break; + goto early_out; }
out: res->eof = eof; return 0; +early_out: + if (unlikely(!i)) + return -EIO; + res->eof = 0; + return 0; }
static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
From: Jeff Layton jlayton@kernel.org
[ Upstream commit 68cbb8056a4c24c6a38ad2b79e0a9764b235e8fa ]
Signed-off-by: Jeff Layton jlayton@kernel.org Reviewed-by: Ilya Dryomov idryomov@gmail.com Signed-off-by: Ilya Dryomov idryomov@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/ceph/inode.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 526faf4778ce4..2462a9a84b956 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1335,6 +1335,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) in, ceph_vinop(in)); if (in->i_state & I_NEW) discard_new_inode(in); + else + iput(in); goto done; } req->r_target_inode = in;
From: Jake Wang haonan.wang2@amd.com
[ Upstream commit 410066d24cfc1071be25e402510367aca9db5cb6 ]
[Why] For certain timings, Renoir may underflow due to sr exit latency being too slow.
[How] Updated wm table for renoir.
Signed-off-by: Jake Wang haonan.wang2@amd.com Reviewed-by: Yongqiang Sun yongqiang.sun@amd.com Acked-by: Qingqing Zhuo qingqing.zhuo@amd.com Signed-off-by: Alex Deucher alexander.deucher@amd.com Signed-off-by: Sasha Levin sashal@kernel.org --- .../drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c index 6b431db146cd9..1c6e401dd4cce 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c @@ -704,24 +704,24 @@ static struct wm_table ddr4_wm_table_rn = { .wm_inst = WM_B, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 10.12, - .sr_enter_plus_exit_time_us = 11.48, + .sr_exit_time_us = 11.12, + .sr_enter_plus_exit_time_us = 12.48, .valid = true, }, { .wm_inst = WM_C, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 10.12, - .sr_enter_plus_exit_time_us = 11.48, + .sr_exit_time_us = 11.12, + .sr_enter_plus_exit_time_us = 12.48, .valid = true, }, { .wm_inst = WM_D, .wm_type = WM_TYPE_PSTATE_CHG, .pstate_latency_us = 11.72, - .sr_exit_time_us = 10.12, - .sr_enter_plus_exit_time_us = 11.48, + .sr_exit_time_us = 11.12, + .sr_enter_plus_exit_time_us = 12.48, .valid = true, }, }
From: Thomas Gleixner tglx@linutronix.de
[ Upstream commit ba8ea8e7dd6e1662e34e730eadfc52aa6816f9dd ]
can_stop_idle_tick() checks whether the do_timer() duty has been taken over by a CPU on boot. That's silly because the boot CPU always takes over with the initial clockevent device.
But even if no CPU would have installed a clockevent and taken over the duty then the question whether the tick on the current CPU can be stopped or not is moot. In that case the current CPU would have no clockevent either, so there would be nothing to keep ticking.
Remove it.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Acked-by: Frederic Weisbecker frederic@kernel.org Link: https://lore.kernel.org/r/20201206212002.725238293@linutronix.de Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/time/tick-sched.c | 7 ------- 1 file changed, 7 deletions(-)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81632cd5e3b72..e8d351b7f9b03 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -941,13 +941,6 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) */ if (tick_do_timer_cpu == cpu) return false; - /* - * Boot safety: make sure the timekeeping duty has been - * assigned before entering dyntick-idle mode, - * tick_do_timer_cpu is TICK_DO_TIMER_BOOT - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) - return false;
/* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
From: Heiko Carstens hca@linux.ibm.com
[ Upstream commit 9365965db0c7ca7fc81eee27c21d8522d7102c32 ]
Clear the kernel stack backchain before potentially calling the lockdep trace_hardirqs_off/on functions. Without this walking the kernel backchain, e.g. during a panic, might stop too early.
Signed-off-by: Heiko Carstens hca@linux.ibm.com Signed-off-by: Sasha Levin sashal@kernel.org --- arch/s390/kernel/entry.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 92beb14446449..9bdf3dbd04247 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -390,6 +390,7 @@ ENTRY(system_call) mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC stg %r14,__PT_FLAGS(%r11) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) ENABLE_INTS .Lsysc_do_svc: # clear user controlled register to prevent speculative use @@ -406,7 +407,6 @@ ENTRY(system_call) jnl .Lsysc_nr_ok slag %r8,%r1,3 .Lsysc_nr_ok: - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stg %r2,__PT_ORIG_GPR2(%r11) stg %r7,STACK_FRAME_OVERHEAD(%r15) lg %r9,0(%r8,%r10) # get system call add. @@ -696,8 +696,8 @@ ENTRY(pgm_check_handler) mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID -6: RESTORE_SM_CLEAR_PER - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) +6: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + RESTORE_SM_CLEAR_PER larl %r1,pgm_check_table llgh %r10,__PT_INT_CODE+2(%r11) nill %r10,0x007f @@ -718,8 +718,8 @@ ENTRY(pgm_check_handler) # PER event in supervisor state, must be kprobes # .Lpgm_kprobe: - RESTORE_SM_CLEAR_PER xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) + RESTORE_SM_CLEAR_PER lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,do_per_trap j .Lpgm_return @@ -761,10 +761,10 @@ ENTRY(io_int_handler) stmg %r8,%r9,__PT_PSW(%r11) mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ jo .Lio_restore TRACE_IRQS_OFF - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) .Lio_loop: lgr %r2,%r11 # pass pointer to pt_regs lghi %r3,IO_INTERRUPT @@ -964,10 +964,10 @@ ENTRY(ext_int_handler) mvc __PT_INT_PARM(4,%r11),__LC_EXT_PARAMS mvc __PT_INT_PARM_LONG(8,%r11),0(%r1) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ jo .Lio_restore TRACE_IRQS_OFF - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs lghi %r3,EXT_INTERRUPT brasl %r14,do_IRQ
From: Pavel Begunkov asml.silence@gmail.com
[ Upstream commit 9cd2be519d05ee78876d55e8e902b7125f78b74f ]
list_empty_careful() is not racy only if some conditions are met, i.e. no re-adds after del_init. io_cqring_overflow_flush() does list_move(), so it's actually racy.
Remove those checks, we have ->cq_check_overflow for the fast path.
Signed-off-by: Pavel Begunkov asml.silence@gmail.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- fs/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 86dac2b2e2763..4b3dbe588d111 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1632,8 +1632,6 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, LIST_HEAD(list);
if (!force) { - if (list_empty_careful(&ctx->cq_overflow_list)) - return true; if ((ctx->cached_cq_tail - READ_ONCE(rings->cq.head) == rings->cq_ring_entries)) return false; @@ -6548,8 +6546,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
/* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { - if (!list_empty(&ctx->cq_overflow_list) && - !io_cqring_overflow_flush(ctx, false, NULL, NULL)) + if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) return -EBUSY; }
From: Theodore Ts'o tytso@mit.edu
[ Upstream commit c9200760da8a728eb9767ca41a956764b28c1310 ]
Check for valid block size directly by validating s_log_block_size; we were doing this in two places. First, by calculating blocksize via BLOCK_SIZE << s_log_block_size, and then checking that the blocksize was valid. And then secondly, by checking s_log_block_size directly.
The first check is not reliable, and can trigger an UBSAN warning if s_log_block_size on a maliciously corrupted superblock is greater than 22. This is harmless, since the second test will correctly reject the maliciously fuzzed file system, but to make syzbot shut up, and because the two checks are duplicative in any case, delete the blocksize check, and move the s_log_block_size earlier in ext4_fill_super().
Signed-off-by: Theodore Ts'o tytso@mit.edu Reported-by: syzbot+345b75652b1d24227443@syzkaller.appspotmail.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/ext4/super.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94472044f4c1d..950872c9c6891 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4188,18 +4188,25 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); - - if (blocksize == PAGE_SIZE) - set_opt(sb, DIOREAD_NOLOCK); - - if (blocksize < EXT4_MIN_BLOCK_SIZE || - blocksize > EXT4_MAX_BLOCK_SIZE) { + if (le32_to_cpu(es->s_log_block_size) > + (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { ext4_msg(sb, KERN_ERR, - "Unsupported filesystem blocksize %d (%d log_block_size)", - blocksize, le32_to_cpu(es->s_log_block_size)); + "Invalid log block size: %u", + le32_to_cpu(es->s_log_block_size)); goto failed_mount; } + if (le32_to_cpu(es->s_log_cluster_size) > + (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Invalid log cluster size: %u", + le32_to_cpu(es->s_log_cluster_size)); + goto failed_mount; + } + + blocksize = EXT4_MIN_BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize == PAGE_SIZE) + set_opt(sb, DIOREAD_NOLOCK);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -4418,21 +4425,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) goto failed_mount;
- if (le32_to_cpu(es->s_log_block_size) > - (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log block size: %u", - le32_to_cpu(es->s_log_block_size)); - goto failed_mount; - } - if (le32_to_cpu(es->s_log_cluster_size) > - (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) { - ext4_msg(sb, KERN_ERR, - "Invalid log cluster size: %u", - le32_to_cpu(es->s_log_cluster_size)); - goto failed_mount; - } - if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { ext4_msg(sb, KERN_ERR, "Number of reserved GDT blocks insanely large: %d",
From: Takashi Iwai tiwai@suse.de
[ Upstream commit 618de0f4ef11acd8cf26902e65493d46cc20cc89 ]
The PCM hw_params core function tries to clear up the PCM buffer before actually using for avoiding the information leak from the previous usages or the usage before a new allocation. It performs the memset() with runtime->dma_bytes, but this might still leave some remaining bytes untouched; namely, the PCM buffer size is aligned in page size for mmap, hence runtime->dma_bytes doesn't necessarily cover all PCM buffer pages, and the remaining bytes are exposed via mmap.
This patch changes the memory clearance to cover the all buffer pages if the stream is supposed to be mmap-ready (that guarantees that the buffer size is aligned in page size).
Reviewed-by: Lars-Peter Clausen lars@metafoo.de Link: https://lore.kernel.org/r/20201218145625.2045-3-tiwai@suse.de Signed-off-by: Takashi Iwai tiwai@suse.de Signed-off-by: Sasha Levin sashal@kernel.org --- sound/core/pcm_native.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 47b155a49226f..9f3f8e953ff04 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -755,8 +755,13 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream, runtime->boundary *= 2;
/* clear the buffer for avoiding possible kernel info leaks */ - if (runtime->dma_area && !substream->ops->copy_user) - memset(runtime->dma_area, 0, runtime->dma_bytes); + if (runtime->dma_area && !substream->ops->copy_user) { + size_t size = runtime->dma_bytes; + + if (runtime->info & SNDRV_PCM_INFO_MMAP) + size = PAGE_ALIGN(size); + memset(runtime->dma_area, 0, size); + }
snd_pcm_timer_resolution_change(substream); snd_pcm_set_state(substream, SNDRV_PCM_STATE_SETUP);
From: Hyeongseok Kim hyeongseok@gmail.com
[ Upstream commit 252bd1256396cebc6fc3526127fdb0b317601318 ]
If emergency system shutdown is called, like by thermal shutdown, a dm device could be alive when the block device couldn't process I/O requests anymore. In this state, the handling of I/O errors by new dm I/O requests or by those already in-flight can lead to a verity corruption state, which is a misjudgment.
So, skip verity work in response to I/O error when system is shutting down.
Signed-off-by: Hyeongseok Kim hyeongseok@gmail.com Reviewed-by: Sami Tolvanen samitolvanen@google.com Signed-off-by: Mike Snitzer snitzer@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/md/dm-verity-target.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index f74982dcbea0d..6b8e5bdd8526d 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -537,6 +537,15 @@ static int verity_verify_io(struct dm_verity_io *io) return 0; }
+/* + * Skip verity work in response to I/O error when system is shutting down. + */ +static inline bool verity_is_system_shutting_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + /* * End one "io" structure with a given error. */ @@ -564,7 +573,8 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private;
- if (bio->bi_status && !verity_fec_is_enabled(io->v)) { + if (bio->bi_status && + (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) { verity_finish_io(io, bio->bi_status); return; }
From: Chunguang Xu brookxu@tencent.com
[ Upstream commit 82ef1370b0c1757ab4ce29f34c52b4e93839b0aa ]
Commit cfd732377221 ("ext4: add prefetching for block allocation bitmaps") introduced block bitmap prefetch, and expects to read block bitmaps of flex_bg through an IO. However, it seems to ignore the value range of s_log_groups_per_flex. In the scenario where the value of s_log_groups_per_flex is greater than 27, s_mb_prefetch or s_mb_prefetch_limit will overflow, cause a divide zero exception.
In addition, the logic of calculating nr is also flawed, because the size of flexbg is fixed during a single mount, but s_mb_prefetch can be modified, which causes nr to fail to meet the value condition of [1, flexbg_size].
To solve this problem, we need to set the upper limit of s_mb_prefetch. Since we expect to load block bitmaps of a flex_bg through an IO, we can consider determining a reasonable upper limit among the IO limit parameters. After consideration, we chose BLK_MAX_SEGMENT_SIZE. This is a good choice to solve divide zero problem and avoiding performance degradation.
[ Some minor code simplifications to make the changes easy to follow -- TYT ]
Reported-by: Tosk Robot tencent_os_robot@tencent.com Signed-off-by: Chunguang Xu brookxu@tencent.com Reviewed-by: Samuel Liao samuelliao@tencent.com Reviewed-by: Andreas Dilger adilger@dilger.ca Link: https://lore.kernel.org/r/1607051143-24508-1-git-send-email-brookxu@tencent.... Signed-off-by: Theodore Ts'o tytso@mit.edu Signed-off-by: Sasha Levin sashal@kernel.org --- fs/ext4/mballoc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 24af9ed5c3e52..ca57c6bfee224 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2395,9 +2395,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { - nr = (group / sbi->s_mb_prefetch) * - sbi->s_mb_prefetch; - nr = nr + sbi->s_mb_prefetch - group; + nr = 1 << sbi->s_log_groups_per_flex; + nr -= group & (nr - 1); + nr = min(nr, sbi->s_mb_prefetch); } prefetch_grp = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); @@ -2733,7 +2733,8 @@ static int ext4_mb_init_backend(struct super_block *sb)
if (ext4_has_feature_flex_bg(sb)) { /* a single flex group is supposed to be read by a single IO */ - sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex; + sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex, + BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { sbi->s_mb_prefetch = 32;
From: Dan Williams dan.j.williams@intel.com
[ Upstream commit 6268d7da4d192af339f4d688942b9ccb45a65e04 ]
There are multiple locations that open-code the release of the last range in a device-dax instance. Consolidate this into a new dev_dax_trim_range() helper.
This also addresses a kmemleak report:
# cat /sys/kernel/debug/kmemleak [..] unreferenced object 0xffff976bd46f6240 (size 64): comm "ndctl", pid 23556, jiffies 4299514316 (age 5406.733s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 20 c3 37 00 00 00 .......... .7... ff ff ff 7f 38 00 00 00 00 00 00 00 00 00 00 00 ....8........... backtrace: [<00000000064003cf>] __kmalloc_track_caller+0x136/0x379 [<00000000d85e3c52>] krealloc+0x67/0x92 [<00000000d7d3ba8a>] __alloc_dev_dax_range+0x73/0x25c [<0000000027d58626>] devm_create_dev_dax+0x27d/0x416 [<00000000434abd43>] __dax_pmem_probe+0x1c9/0x1000 [dax_pmem_core] [<0000000083726c1c>] dax_pmem_probe+0x10/0x1f [dax_pmem] [<00000000b5f2319c>] nvdimm_bus_probe+0x9d/0x340 [libnvdimm] [<00000000c055e544>] really_probe+0x230/0x48d [<000000006cabd38e>] driver_probe_device+0x122/0x13b [<0000000029c7b95a>] device_driver_attach+0x5b/0x60 [<0000000053e5659b>] bind_store+0xb7/0xc3 [<00000000d3bdaadc>] drv_attr_store+0x27/0x31 [<00000000949069c5>] sysfs_kf_write+0x4a/0x57 [<000000004a8b5adf>] kernfs_fop_write+0x150/0x1e5 [<00000000bded60f0>] __vfs_write+0x1b/0x34 [<00000000b92900f0>] vfs_write+0xd8/0x1d1
Reported-by: Jane Chu jane.chu@oracle.com Cc: Zhen Lei thunder.leizhen@huawei.com Link: https://lore.kernel.org/r/160834570161.1791850.14911670304441510419.stgit@dw... Signed-off-by: Dan Williams dan.j.williams@intel.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/dax/bus.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-)
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index 27513d311242e..de7b74505e75e 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -367,19 +367,28 @@ void kill_dev_dax(struct dev_dax *dev_dax) } EXPORT_SYMBOL_GPL(kill_dev_dax);
-static void free_dev_dax_ranges(struct dev_dax *dev_dax) +static void trim_dev_dax_range(struct dev_dax *dev_dax) { + int i = dev_dax->nr_range - 1; + struct range *range = &dev_dax->ranges[i].range; struct dax_region *dax_region = dev_dax->region; - int i;
device_lock_assert(dax_region->dev); - for (i = 0; i < dev_dax->nr_range; i++) { - struct range *range = &dev_dax->ranges[i].range; - - __release_region(&dax_region->res, range->start, - range_len(range)); + dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i, + (unsigned long long)range->start, + (unsigned long long)range->end); + + __release_region(&dax_region->res, range->start, range_len(range)); + if (--dev_dax->nr_range == 0) { + kfree(dev_dax->ranges); + dev_dax->ranges = NULL; } - dev_dax->nr_range = 0; +} + +static void free_dev_dax_ranges(struct dev_dax *dev_dax) +{ + while (dev_dax->nr_range) + trim_dev_dax_range(dev_dax); }
static void unregister_dev_dax(void *dev) @@ -804,15 +813,10 @@ static int alloc_dev_dax_range(struct dev_dax *dev_dax, u64 start, return 0;
rc = devm_register_dax_mapping(dev_dax, dev_dax->nr_range - 1); - if (rc) { - dev_dbg(dev, "delete range[%d]: %pa:%pa\n", dev_dax->nr_range - 1, - &alloc->start, &alloc->end); - dev_dax->nr_range--; - __release_region(res, alloc->start, resource_size(alloc)); - return rc; - } + if (rc) + trim_dev_dax_range(dev_dax);
- return 0; + return rc; }
static int adjust_dev_dax_range(struct dev_dax *dev_dax, struct resource *res, resource_size_t size) @@ -885,12 +889,7 @@ static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size) if (shrink >= range_len(range)) { devm_release_action(dax_region->dev, unregister_dax_mapping, &mapping->dev); - __release_region(&dax_region->res, range->start, - range_len(range)); - dev_dax->nr_range--; - dev_dbg(dev, "delete range[%d]: %#llx:%#llx\n", i, - (unsigned long long) range->start, - (unsigned long long) range->end); + trim_dev_dax_range(dev_dax); to_shrink -= shrink; if (!to_shrink) break; @@ -1274,7 +1273,6 @@ static void dev_dax_release(struct device *dev) put_dax(dax_dev); free_dev_dax_id(dev_dax); dax_region_put(dax_region); - kfree(dev_dax->ranges); kfree(dev_dax->pgmap); kfree(dev_dax); }
Hello Sasha,
On 30.12.20 14:02, Sasha Levin wrote:
From: Linus Walleij linus.walleij@linaro.org
[ Upstream commit d6d51a96c7d63b7450860a3037f2d62388286a52 ]
Functions like memset()/memmove()/memcpy() do a lot of memory accesses.
If a bad pointer is passed to one of these functions it is important to catch this. Compiler instrumentation cannot do this since these functions are written in assembly.
KASan replaces these memory functions with instrumented variants.
Unless someone actually wants this, I suggest dropping it.
It's a prerequisite patch for KASan support on ARM32, which is new in v5.11-rc1. Backporting it on its own doesn't add any value IMO.
Cheers Ahmad
The original functions are declared as weak symbols so that the strong definitions in mm/kasan/kasan.c can replace them.
The original functions have aliases with a '__' prefix in their name, so we can call the non-instrumented variant if needed.
We must use __memcpy()/__memset() in place of memcpy()/memset() when we copy .data to RAM and when we clear .bss, because kasan_early_init cannot be called before the initialization of .data and .bss.
For the kernel compression and EFI libstub's custom string libraries we need a special quirk: even if these are built without KASan enabled, they rely on the global headers for their custom string libraries, which means that e.g. memcpy() will be defined to __memcpy() and we get link failures. Since these implementations are written i C rather than assembly we use e.g. __alias(memcpy) to redirected any users back to the local implementation.
Cc: Andrey Ryabinin aryabinin@virtuozzo.com Cc: Alexander Potapenko glider@google.com Cc: Dmitry Vyukov dvyukov@google.com Cc: kasan-dev@googlegroups.com Reviewed-by: Ard Biesheuvel ardb@kernel.org Tested-by: Ard Biesheuvel ardb@kernel.org # QEMU/KVM/mach-virt/LPAE/8G Tested-by: Florian Fainelli f.fainelli@gmail.com # Brahma SoCs Tested-by: Ahmad Fatoum a.fatoum@pengutronix.de # i.MX6Q Reported-by: Russell King - ARM Linux rmk+kernel@armlinux.org.uk Signed-off-by: Ahmad Fatoum a.fatoum@pengutronix.de Signed-off-by: Abbott Liu liuwenliang@huawei.com Signed-off-by: Florian Fainelli f.fainelli@gmail.com Signed-off-by: Linus Walleij linus.walleij@linaro.org Signed-off-by: Russell King rmk+kernel@armlinux.org.uk Signed-off-by: Sasha Levin sashal@kernel.org
arch/arm/boot/compressed/string.c | 19 +++++++++++++++++++ arch/arm/include/asm/string.h | 26 ++++++++++++++++++++++++++ arch/arm/kernel/head-common.S | 4 ++-- arch/arm/lib/memcpy.S | 3 +++ arch/arm/lib/memmove.S | 5 ++++- arch/arm/lib/memset.S | 3 +++ 6 files changed, 57 insertions(+), 3 deletions(-)
diff --git a/arch/arm/boot/compressed/string.c b/arch/arm/boot/compressed/string.c index ade5079bebbf9..8c0fa276d9946 100644 --- a/arch/arm/boot/compressed/string.c +++ b/arch/arm/boot/compressed/string.c @@ -7,6 +7,25 @@ #include <linux/string.h> +/*
- The decompressor is built without KASan but uses the same redirects as the
- rest of the kernel when CONFIG_KASAN is enabled, defining e.g. memcpy()
- to __memcpy() but since we are not linking with the main kernel string
- library in the decompressor, that will lead to link failures.
- Undefine KASan's versions, define the wrapped functions and alias them to
- the right names so that when e.g. __memcpy() appear in the code, it will
- still be linked to this local version of memcpy().
- */
+#ifdef CONFIG_KASAN +#undef memcpy +#undef memmove +#undef memset +void *__memcpy(void *__dest, __const void *__src, size_t __n) __alias(memcpy); +void *__memmove(void *__dest, __const void *__src, size_t count) __alias(memmove); +void *__memset(void *s, int c, size_t count) __alias(memset); +#endif
void *memcpy(void *__dest, __const void *__src, size_t __n) { int i = 0; diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h index 111a1d8a41ddf..6c607c68f3ad7 100644 --- a/arch/arm/include/asm/string.h +++ b/arch/arm/include/asm/string.h @@ -5,6 +5,9 @@ /*
- We don't do inline string functions, since the
- optimised inline asm versions are not small.
- The __underscore versions of some functions are for KASan to be able
*/
- to replace them with instrumented versions.
#define __HAVE_ARCH_STRRCHR @@ -15,15 +18,18 @@ extern char * strchr(const char * s, int c); #define __HAVE_ARCH_MEMCPY extern void * memcpy(void *, const void *, __kernel_size_t); +extern void *__memcpy(void *dest, const void *src, __kernel_size_t n); #define __HAVE_ARCH_MEMMOVE extern void * memmove(void *, const void *, __kernel_size_t); +extern void *__memmove(void *dest, const void *src, __kernel_size_t n); #define __HAVE_ARCH_MEMCHR extern void * memchr(const void *, int, __kernel_size_t); #define __HAVE_ARCH_MEMSET extern void * memset(void *, int, __kernel_size_t); +extern void *__memset(void *s, int c, __kernel_size_t n); #define __HAVE_ARCH_MEMSET32 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); @@ -39,4 +45,24 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) return __memset64(p, v, n * 8, v >> 32); } +/*
- For files that are not instrumented (e.g. mm/slub.c) we
- must use non-instrumented versions of the mem*
- functions named __memcpy() etc. All such kernel code has
- been tagged with KASAN_SANITIZE_file.o = n, which means
- that the address sanitization argument isn't passed to the
- compiler, and __SANITIZE_ADDRESS__ is not set. As a result
- these defines kick in.
- */
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) +#define memcpy(dst, src, len) __memcpy(dst, src, len) +#define memmove(dst, src, len) __memmove(dst, src, len) +#define memset(s, c, n) __memset(s, c, n)
+#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif
+#endif
#endif diff --git a/arch/arm/kernel/head-common.S b/arch/arm/kernel/head-common.S index 4a3982812a401..6840c7c60a858 100644 --- a/arch/arm/kernel/head-common.S +++ b/arch/arm/kernel/head-common.S @@ -95,7 +95,7 @@ __mmap_switched: THUMB( ldmia r4!, {r0, r1, r2, r3} ) THUMB( mov sp, r3 ) sub r2, r2, r1
- bl memcpy @ copy .data to RAM
- bl __memcpy @ copy .data to RAM
#endif ARM( ldmia r4!, {r0, r1, sp} ) @@ -103,7 +103,7 @@ __mmap_switched: THUMB( mov sp, r3 ) sub r2, r1, r0 mov r1, #0
- bl memset @ clear .bss
- bl __memset @ clear .bss
ldmia r4, {r0, r1, r2, r3} str r9, [r0] @ Save processor ID diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S index 09a333153dc66..ad4625d16e117 100644 --- a/arch/arm/lib/memcpy.S +++ b/arch/arm/lib/memcpy.S @@ -58,6 +58,8 @@ /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ +.weak memcpy +ENTRY(__memcpy) ENTRY(mmiocpy) ENTRY(memcpy) @@ -65,3 +67,4 @@ ENTRY(memcpy) ENDPROC(memcpy) ENDPROC(mmiocpy) +ENDPROC(__memcpy) diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S index b50e5770fb44d..fd123ea5a5a4a 100644 --- a/arch/arm/lib/memmove.S +++ b/arch/arm/lib/memmove.S @@ -24,12 +24,14 @@
- occurring in the opposite direction.
*/ +.weak memmove +ENTRY(__memmove) ENTRY(memmove) UNWIND( .fnstart ) subs ip, r0, r1 cmphi r2, ip
bls memcpy
bls __memcpy
stmfd sp!, {r0, r4, lr} UNWIND( .fnend ) @@ -222,3 +224,4 @@ ENTRY(memmove) 18: backward_copy_shift push=24 pull=8 ENDPROC(memmove) +ENDPROC(__memmove) diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index 6ca4535c47fb6..0e7ff0423f50b 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -13,6 +13,8 @@ .text .align 5 +.weak memset +ENTRY(__memset) ENTRY(mmioset) ENTRY(memset) UNWIND( .fnstart ) @@ -132,6 +134,7 @@ UNWIND( .fnstart ) UNWIND( .fnend ) ENDPROC(memset) ENDPROC(mmioset) +ENDPROC(__memset) ENTRY(__memset32) UNWIND( .fnstart )
On Wed, Dec 30, 2020 at 03:18:13PM +0100, Ahmad Fatoum wrote:
Hello Sasha,
On 30.12.20 14:02, Sasha Levin wrote:
From: Linus Walleij linus.walleij@linaro.org
[ Upstream commit d6d51a96c7d63b7450860a3037f2d62388286a52 ]
Functions like memset()/memmove()/memcpy() do a lot of memory accesses.
If a bad pointer is passed to one of these functions it is important to catch this. Compiler instrumentation cannot do this since these functions are written in assembly.
KASan replaces these memory functions with instrumented variants.
Unless someone actually wants this, I suggest dropping it.
It's a prerequisite patch for KASan support on ARM32, which is new in v5.11-rc1. Backporting it on its own doesn't add any value IMO.
I'll drop it, thanks.
linux-stable-mirror@lists.linaro.org