From: Bob Peterson rpeterso@redhat.com
[ Upstream commit 428f651cb80b227af47fc302e4931791f2fb4741 ]
Before this patch, function read_rindex_entry called compute_bitstructs before it allocated a glock for the rgrp. But if compute_bitstructs found a problem with the rgrp, it called gfs2_consist_rgrpd, and that called gfs2_dump_glock for rgd->rd_gl which had not yet been assigned.
read_rindex_entry compute_bitstructs gfs2_consist_rgrpd gfs2_dump_glock <---------rgd->rd_gl was not set.
This patch changes read_rindex_entry so it assigns an rgrp glock before calling compute_bitstructs so gfs2_dump_glock does not reference an unassigned pointer. If an error is discovered, the glock must also be put, so a new goto and label were added.
Reported-by: syzbot+c6fd14145e2f62ca0784@syzkaller.appspotmail.com Signed-off-by: Bob Peterson rpeterso@redhat.com Signed-off-by: Andreas Gruenbacher agruenba@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/gfs2/rgrp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c056ed5c6df3..767d188e5e50 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -925,15 +925,15 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); spin_lock_init(&rgd->rd_rsspin);
- error = compute_bitstructs(rgd); - if (error) - goto fail; - error = gfs2_glock_get(sdp, rgd->rd_addr, &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); if (error) goto fail;
+ error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -950,6 +950,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) }
error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: gfs2_glock_put(rgd->rd_gl);
fail:
From: Vincent Whitchurch vincent.whitchurch@axis.com
[ Upstream commit c8fa17d9f08a448184f03d352145099b5beb618e ]
If the irqwork is still scheduled or running while the RTC device is removed, a use-after-free occurs in rtc_timer_do_work(). Cleanup the timerqueue and ensure the work is stopped to fix this.
BUG: KASAN: use-after-free in mutex_lock+0x94/0x110 Write of size 8 at addr ffffff801d846338 by task kworker/3:1/41
Workqueue: events rtc_timer_do_work Call trace: mutex_lock+0x94/0x110 rtc_timer_do_work+0xec/0x630 process_one_work+0x5fc/0x1344 ...
Allocated by task 551: kmem_cache_alloc_trace+0x384/0x6e0 devm_rtc_allocate_device+0xf0/0x574 devm_rtc_device_register+0x2c/0x12c ...
Freed by task 572: kfree+0x114/0x4d0 rtc_device_release+0x64/0x80 device_release+0x8c/0x1f4 kobject_put+0x1c4/0x4b0 put_device+0x20/0x30 devm_rtc_release_device+0x1c/0x30 devm_action_release+0x54/0x90 release_nodes+0x124/0x310 devres_release_group+0x170/0x240 i2c_device_remove+0xd8/0x314 ...
Last potentially related work creation: insert_work+0x5c/0x330 queue_work_on+0xcc/0x154 rtc_set_time+0x188/0x5bc rtc_dev_ioctl+0x2ac/0xbd0 ...
Signed-off-by: Vincent Whitchurch vincent.whitchurch@axis.com Signed-off-by: Alexandre Belloni alexandre.belloni@bootlin.com Link: https://lore.kernel.org/r/20211210160951.7718-1-vincent.whitchurch@axis.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/rtc/class.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index 9458e6d6686a..8b434213bc7a 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -26,6 +26,15 @@ struct class *rtc_class; static void rtc_device_release(struct device *dev) { struct rtc_device *rtc = to_rtc_device(dev); + struct timerqueue_head *head = &rtc->timerqueue; + struct timerqueue_node *node; + + mutex_lock(&rtc->ops_lock); + while ((node = timerqueue_getnext(head))) + timerqueue_del(head, node); + mutex_unlock(&rtc->ops_lock); + + cancel_work_sync(&rtc->irqwork);
ida_simple_remove(&rtc_ida, rtc->id); kfree(rtc);
From: David Gow davidgow@google.com
[ Upstream commit f4f03f299a56ce4d73c5431e0327b3b6cb55ebb9 ]
The syscall_handler_t type for x86_64 was defined as 'long (*)(void)', but always cast to 'long (*)(long, long, long, long, long, long)' before use. This now triggers a warning (see below).
Define syscall_handler_t as the latter instead, and remove the cast. This simplifies the code, and fixes the warning.
Warning: In file included from ../arch/um/include/asm/processor-generic.h:13 from ../arch/x86/um/asm/processor.h:41 from ../include/linux/rcupdate.h:30 from ../include/linux/rculist.h:11 from ../include/linux/pid.h:5 from ../include/linux/sched.h:14 from ../include/linux/ptrace.h:6 from ../arch/um/kernel/skas/syscall.c:7: ../arch/um/kernel/skas/syscall.c: In function ‘handle_syscall’: ../arch/x86/um/shared/sysdep/syscalls_64.h:18:11: warning: cast between incompatible function types from ‘long int (*)(void)’ to ‘long int (*)(long int, long int, long int, long int, long int, long int)’ [ -Wcast-function-type] 18 | (((long (*)(long, long, long, long, long, long)) \ | ^ ../arch/x86/um/asm/ptrace.h:36:62: note: in definition of macro ‘PT_REGS_SET_SYSCALL_RETURN’ 36 | #define PT_REGS_SET_SYSCALL_RETURN(r, res) (PT_REGS_AX(r) = (res)) | ^~~ ../arch/um/kernel/skas/syscall.c:46:33: note: in expansion of macro ‘EXECUTE_SYSCALL’ 46 | EXECUTE_SYSCALL(syscall, regs)); | ^~~~~~~~~~~~~~~
Signed-off-by: David Gow davidgow@google.com Signed-off-by: Richard Weinberger richard@nod.at Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/um/shared/sysdep/syscalls_64.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h index 8a7d5e1da98e..1e6875b4ffd8 100644 --- a/arch/x86/um/shared/sysdep/syscalls_64.h +++ b/arch/x86/um/shared/sysdep/syscalls_64.h @@ -10,13 +10,12 @@ #include <linux/msg.h> #include <linux/shm.h>
-typedef long syscall_handler_t(void); +typedef long syscall_handler_t(long, long, long, long, long, long);
extern syscall_handler_t *sys_call_table[];
#define EXECUTE_SYSCALL(syscall, regs) \ - (((long (*)(long, long, long, long, long, long)) \ - (*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(®s->regs), \ + (((*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(®s->regs), \ UPT_SYSCALL_ARG2(®s->regs), \ UPT_SYSCALL_ARG3(®s->regs), \ UPT_SYSCALL_ARG4(®s->regs), \
From: Glenn Washburn development@efficientek.com
[ Upstream commit 3cb5a7f167c620a8b0e38b0446df2e024d2243dc ]
Check if port-helper exists and is executable. If not, write an error message to the kernel log with information to help the user diagnose the issue and exit with an error. If UML_PORT_HELPER was not set, write a message suggesting that the user set it. This makes it easier to understand why telneting to the UML instance is failing and what can be done to fix it.
Signed-off-by: Glenn Washburn development@efficientek.com Signed-off-by: Richard Weinberger richard@nod.at Signed-off-by: Sasha Levin sashal@kernel.org --- arch/um/drivers/port_user.c | 12 ++++++++++++ 1 file changed, 12 insertions(+)
diff --git a/arch/um/drivers/port_user.c b/arch/um/drivers/port_user.c index 5b5b64cb1071..133ca7bf2d91 100644 --- a/arch/um/drivers/port_user.c +++ b/arch/um/drivers/port_user.c @@ -5,6 +5,7 @@
#include <stdio.h> #include <stdlib.h> +#include <string.h> #include <errno.h> #include <termios.h> #include <unistd.h> @@ -175,6 +176,17 @@ int port_connection(int fd, int *socket, int *pid_out) if (new < 0) return -errno;
+ err = os_access(argv[2], X_OK); + if (err < 0) { + printk(UM_KERN_ERR "port_connection : error accessing port-helper " + "executable at %s: %s\n", argv[2], strerror(-err)); + if (env == NULL) + printk(UM_KERN_ERR "Set UML_PORT_HELPER environment " + "variable to path to uml-utilities port-helper " + "binary\n"); + goto out_close; + } + err = os_pipe(socket, 0, 0); if (err < 0) goto out_close;
From: Jeff LaBundy jeff@labundy.com
[ Upstream commit 409353cbe9fe48f6bc196114c442b1cff05a39bc ]
Update input_set_capability() to prevent kernel panic in case the event code exceeds the bitmap for the given event type.
Suggested-by: Tomasz Moń tomasz.mon@camlingroup.com Signed-off-by: Jeff LaBundy jeff@labundy.com Reviewed-by: Tomasz Moń tomasz.mon@camlingroup.com Link: https://lore.kernel.org/r/20220320032537.545250-1-jeff@labundy.com Signed-off-by: Dmitry Torokhov dmitry.torokhov@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/input/input.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+)
diff --git a/drivers/input/input.c b/drivers/input/input.c index f7398b996bac..5304262dbfa5 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -47,6 +47,17 @@ static DEFINE_MUTEX(input_mutex);
static const struct input_value input_value_sync = { EV_SYN, SYN_REPORT, 1 };
+static const unsigned int input_max_code[EV_CNT] = { + [EV_KEY] = KEY_MAX, + [EV_REL] = REL_MAX, + [EV_ABS] = ABS_MAX, + [EV_MSC] = MSC_MAX, + [EV_SW] = SW_MAX, + [EV_LED] = LED_MAX, + [EV_SND] = SND_MAX, + [EV_FF] = FF_MAX, +}; + static inline int is_event_supported(unsigned int code, unsigned long *bm, unsigned int max) { @@ -1978,6 +1989,14 @@ EXPORT_SYMBOL(input_get_timestamp); */ void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code) { + if (type < EV_CNT && input_max_code[type] && + code > input_max_code[type]) { + pr_err("%s: invalid code %u for type %u\n", __func__, code, + type); + dump_stack(); + return; + } + switch (type) { case EV_KEY: __set_bit(code, dev->keybit);
From: Zheng Yongjun zhengyongjun3@huawei.com
[ Upstream commit 26623eea0da3476446909af96c980768df07bbd9 ]
pm_runtime_get_sync() will increment pm usage counter even it failed. Forgetting to call pm_runtime_put_noidle will result in reference leak in stmfts_input_open, so we should fix it.
Signed-off-by: Zheng Yongjun zhengyongjun3@huawei.com Link: https://lore.kernel.org/r/20220317131604.53538-1-zhengyongjun3@huawei.com Signed-off-by: Dmitry Torokhov dmitry.torokhov@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/input/touchscreen/stmfts.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/input/touchscreen/stmfts.c b/drivers/input/touchscreen/stmfts.c index cd8805d71d97..be1dd504d5b1 100644 --- a/drivers/input/touchscreen/stmfts.c +++ b/drivers/input/touchscreen/stmfts.c @@ -339,11 +339,11 @@ static int stmfts_input_open(struct input_dev *dev)
err = pm_runtime_get_sync(&sdata->client->dev); if (err < 0) - return err; + goto out;
err = i2c_smbus_write_byte(sdata->client, STMFTS_MS_MT_SENSE_ON); if (err) - return err; + goto out;
mutex_lock(&sdata->mutex); sdata->running = true; @@ -366,7 +366,9 @@ static int stmfts_input_open(struct input_dev *dev) "failed to enable touchkey\n"); }
- return 0; +out: + pm_runtime_put_noidle(&sdata->client->dev); + return err; }
static void stmfts_input_close(struct input_dev *dev)
From: Zheng Yongjun zhengyongjun3@huawei.com
[ Upstream commit e9a36feecee0ee5845f2e0656f50f9942dd0bed3 ]
pm_runtime_get_sync() will increment pm usage counter even it failed. Forgetting to call pm_runtime_put_noidle will result in reference leak in stm32_crc_remove, so we should fix it.
Signed-off-by: Zheng Yongjun zhengyongjun3@huawei.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/crypto/stm32/stm32-crc32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/crypto/stm32/stm32-crc32.c b/drivers/crypto/stm32/stm32-crc32.c index fb640e0ea614..2ecc970f5cae 100644 --- a/drivers/crypto/stm32/stm32-crc32.c +++ b/drivers/crypto/stm32/stm32-crc32.c @@ -332,8 +332,10 @@ static int stm32_crc_remove(struct platform_device *pdev) struct stm32_crc *crc = platform_get_drvdata(pdev); int ret = pm_runtime_get_sync(crc->dev);
- if (ret < 0) + if (ret < 0) { + pm_runtime_put_noidle(crc->dev); return ret; + }
spin_lock(&crc_list.lock); list_del(&crc->list);
From: Peter Zijlstra peterz@infradead.org
[ Upstream commit 4327d168515fd8b5b92fa1efdf1d219fb6514460 ]
The chacha_Nblock_xor_avx512vl() functions all have their own, identical, .LdoneN label, however in one particular spot {2,4} jump to the 8 version instead of their own. Resulting in:
arch/x86/crypto/chacha-x86_64.o: warning: objtool: chacha_2block_xor_avx512vl() falls through to next function chacha_8block_xor_avx512vl() arch/x86/crypto/chacha-x86_64.o: warning: objtool: chacha_4block_xor_avx512vl() falls through to next function chacha_8block_xor_avx512vl()
Make each function consistently use its own done label.
Reported-by: Stephen Rothwell sfr@canb.auug.org.au Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Martin Willi martin@strongswan.org Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/crypto/chacha-avx512vl-x86_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S index 848f9c75fd4f..596f91e3a774 100644 --- a/arch/x86/crypto/chacha-avx512vl-x86_64.S +++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S @@ -172,7 +172,7 @@ ENTRY(chacha_2block_xor_avx512vl) # xor remaining bytes from partial register into output mov %rcx,%rax and $0xf,%rcx - jz .Ldone8 + jz .Ldone2 mov %rax,%r9 and $~0xf,%r9
@@ -438,7 +438,7 @@ ENTRY(chacha_4block_xor_avx512vl) # xor remaining bytes from partial register into output mov %rcx,%rax and $0xf,%rcx - jz .Ldone8 + jz .Ldone4 mov %rax,%r9 and $~0xf,%r9
From: Kai-Heng Feng kai.heng.feng@canonical.com
[ Upstream commit 5a8738571747c1e275a40b69a608657603867b7e ]
Lenovo P360 is another platform equipped with ALC897, and it needs ALC897_FIXUP_HEADSET_MIC_PIN quirk to make its headset mic work.
Signed-off-by: Kai-Heng Feng kai.heng.feng@canonical.com Link: https://lore.kernel.org/r/20220325160501.705221-1-kai.heng.feng@canonical.co... Signed-off-by: Takashi Iwai tiwai@suse.de Signed-off-by: Sasha Levin sashal@kernel.org --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index d201043d661c..4bcf833781bd 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10221,6 +10221,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x144d, 0xc051, "Samsung R720", ALC662_FIXUP_IDEAPAD), SND_PCI_QUIRK(0x14cd, 0x5003, "USI", ALC662_FIXUP_USI_HEADSET_MODE), SND_PCI_QUIRK(0x17aa, 0x1036, "Lenovo P520", ALC662_FIXUP_LENOVO_MULTI_CODECS), + SND_PCI_QUIRK(0x17aa, 0x1057, "Lenovo P360", ALC897_FIXUP_HEADSET_MIC_PIN), SND_PCI_QUIRK(0x17aa, 0x32ca, "Lenovo ThinkCentre M80", ALC897_FIXUP_HEADSET_MIC_PIN), SND_PCI_QUIRK(0x17aa, 0x32cb, "Lenovo ThinkCentre M70", ALC897_FIXUP_HEADSET_MIC_PIN), SND_PCI_QUIRK(0x17aa, 0x32cf, "Lenovo ThinkCentre M950", ALC897_FIXUP_HEADSET_MIC_PIN),
From: Anton Eidelman anton.eidelman@gmail.com
[ Upstream commit a4a6f3c8f61c3cfbda4998ad94596059ad7e4332 ]
nvme_mpath_init_identify() invoked from nvme_init_identify() fetches a fresh ANA log from the ctrl. This is essential to have an up to date path states for both existing namespaces and for those scan_work may discover once the ctrl is up.
This happens in the following cases: 1) A new ctrl is being connected. 2) An existing ctrl is successfully reconnected. 3) An existing ctrl is being reset.
While in (1) ctrl->namespaces is empty, (2 & 3) may have namespaces, and nvme_read_ana_log() may call nvme_update_ns_ana_state().
This result in a hang when the ANA state of an existing namespace changes and makes the disk live: nvme_mpath_set_live() issues IO to the namespace through the ctrl, which does NOT have IO queues yet.
See sample hang below.
Solution: - nvme_update_ns_ana_state() to call set_live only if ctrl is live - nvme_read_ana_log() call from nvme_mpath_init_identify() therefore only fetches and parses the ANA log; any erros in this process will fail the ctrl setup as appropriate; - a separate function nvme_mpath_update() is called in nvme_start_ctrl(); this parses the ANA log without fetching it. At this point the ctrl is live, therefore, disks can be set live normally.
Sample failure: nvme nvme0: starting error recovery nvme nvme0: Reconnecting in 10 seconds... block nvme0n6: no usable path - requeuing I/O INFO: task kworker/u8:3:312 blocked for more than 122 seconds. Tainted: G E 5.14.5-1.el7.elrepo.x86_64 #1 Workqueue: nvme-wq nvme_tcp_reconnect_ctrl_work [nvme_tcp] Call Trace: __schedule+0x2a2/0x7e0 schedule+0x4e/0xb0 io_schedule+0x16/0x40 wait_on_page_bit_common+0x15c/0x3e0 do_read_cache_page+0x1e0/0x410 read_cache_page+0x12/0x20 read_part_sector+0x46/0x100 read_lba+0x121/0x240 efi_partition+0x1d2/0x6a0 bdev_disk_changed.part.0+0x1df/0x430 bdev_disk_changed+0x18/0x20 blkdev_get_whole+0x77/0xe0 blkdev_get_by_dev+0xd2/0x3a0 __device_add_disk+0x1ed/0x310 device_add_disk+0x13/0x20 nvme_mpath_set_live+0x138/0x1b0 [nvme_core] nvme_update_ns_ana_state+0x2b/0x30 [nvme_core] nvme_update_ana_state+0xca/0xe0 [nvme_core] nvme_parse_ana_log+0xac/0x170 [nvme_core] nvme_read_ana_log+0x7d/0xe0 [nvme_core] nvme_mpath_init_identify+0x105/0x150 [nvme_core] nvme_init_identify+0x2df/0x4d0 [nvme_core] nvme_init_ctrl_finish+0x8d/0x3b0 [nvme_core] nvme_tcp_setup_ctrl+0x337/0x390 [nvme_tcp] nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp] process_one_work+0x1bd/0x360 worker_thread+0x50/0x3d0
Signed-off-by: Anton Eidelman anton@lightbitslabs.com Reviewed-by: Sagi Grimberg sagi@grimberg.me Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/multipath.c | 25 +++++++++++++++++++++++-- drivers/nvme/host/nvme.h | 4 ++++ 3 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6a9a42809f97..79e22618817d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4047,6 +4047,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); nvme_start_queues(ctrl); + nvme_mpath_update(ctrl); } ctrl->created = true; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 4d615337e6e2..811f7b96b551 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -501,8 +501,17 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc, ns->ana_grpid = le32_to_cpu(desc->grpid); ns->ana_state = desc->state; clear_bit(NVME_NS_ANA_PENDING, &ns->flags); - - if (nvme_state_is_live(ns->ana_state)) + /* + * nvme_mpath_set_live() will trigger I/O to the multipath path device + * and in turn to this path device. However we cannot accept this I/O + * if the controller is not live. This may deadlock if called from + * nvme_mpath_init_identify() and the ctrl will never complete + * initialization, preventing I/O from completing. For this case we + * will reprocess the ANA log page in nvme_mpath_update() once the + * controller is ready. + */ + if (nvme_state_is_live(ns->ana_state) && + ns->ctrl->state == NVME_CTRL_LIVE) nvme_mpath_set_live(ns); }
@@ -586,6 +595,18 @@ static void nvme_ana_work(struct work_struct *work) nvme_read_ana_log(ctrl); }
+void nvme_mpath_update(struct nvme_ctrl *ctrl) +{ + u32 nr_change_groups = 0; + + if (!ctrl->ana_log_buf) + return; + + mutex_lock(&ctrl->ana_lock); + nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state); + mutex_unlock(&ctrl->ana_lock); +} + static void nvme_anatt_timeout(struct timer_list *t) { struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2df90d4355b9..1d1431dd4f9e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -551,6 +551,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); void nvme_mpath_remove_disk(struct nvme_ns_head *head); int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); +void nvme_mpath_update(struct nvme_ctrl *ctrl); void nvme_mpath_uninit(struct nvme_ctrl *ctrl); void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); @@ -648,6 +649,9 @@ static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n"); return 0; } +static inline void nvme_mpath_update(struct nvme_ctrl *ctrl) +{ +} static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl) { }
From: Mario Limonciello mario.limonciello@amd.com
[ Upstream commit 3ae8fd41573af4fb3a490c9ed947fc936ba87190 ]
Setting the century forward has been failing on AMD platforms. There was a previous attempt at fixing this for family 0x17 as part of commit 7ad295d5196a ("rtc: Fix the AltCentury value on AMD/Hygon platform") but this was later reverted due to some problems reported that appeared to stem from an FW bug on a family 0x17 desktop system.
The same comments mentioned in the previous commit continue to apply to the newer platforms as well.
``` MC146818 driver use function mc146818_set_time() to set register RTC_FREQ_SELECT(RTC_REG_A)'s bit4-bit6 field which means divider stage reset value on Intel platform to 0x7.
While AMD/Hygon RTC_REG_A(0Ah)'s bit4 is defined as DV0 [Reference]: DV0 = 0 selects Bank 0, DV0 = 1 selects Bank 1. Bit5-bit6 is defined as reserved.
DV0 is set to 1, it will select Bank 1, which will disable AltCentury register(0x32) access. As UEFI pass acpi_gbl_FADT.century 0x32 (AltCentury), the CMOS write will be failed on code: CMOS_WRITE(century, acpi_gbl_FADT.century).
Correct RTC_REG_A bank select bit(DV0) to 0 on AMD/Hygon CPUs, it will enable AltCentury(0x32) register writing and finally setup century as expected. ```
However in closer examination the change previously submitted was also modifying bits 5 & 6 which are declared reserved in the AMD documentation. So instead modify just the DV0 bank selection bit.
Being cognizant that there was a failure reported before, split the code change out to a static function that can also be used for exclusions if any regressions such as Mikhail's pop up again.
Cc: Jinke Fan fanjinke@hygon.cn Cc: Mikhail Gavrilov mikhail.v.gavrilov@gmail.com Link: https://lore.kernel.org/all/CABXGCsMLob0DC25JS8wwAYydnDoHBSoMh2_YLPfqm3TTvDE... Link: https://www.amd.com/system/files/TechDocs/51192_Bolton_FCH_RRG.pdf Signed-off-by: Raul E Rangel rrangel@chromium.org Signed-off-by: Mario Limonciello mario.limonciello@amd.com Signed-off-by: Alexandre Belloni alexandre.belloni@bootlin.com Link: https://lore.kernel.org/r/20220111225750.1699-1-mario.limonciello@amd.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/rtc/rtc-mc146818-lib.c | 16 +++++++++++++++- include/linux/mc146818rtc.h | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index 5add637c9ad2..b036ff33fbe6 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -99,6 +99,17 @@ unsigned int mc146818_get_time(struct rtc_time *time) } EXPORT_SYMBOL_GPL(mc146818_get_time);
+/* AMD systems don't allow access to AltCentury with DV1 */ +static bool apply_amd_register_a_behavior(void) +{ +#ifdef CONFIG_X86 + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + return true; +#endif + return false; +} + /* Set the current date and time in the real time clock. */ int mc146818_set_time(struct rtc_time *time) { @@ -172,7 +183,10 @@ int mc146818_set_time(struct rtc_time *time) save_control = CMOS_READ(RTC_CONTROL); CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + if (apply_amd_register_a_behavior()) + CMOS_WRITE((save_freq_select & ~RTC_AMD_BANK_SELECT), RTC_FREQ_SELECT); + else + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
#ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 0661af17a758..1e0205811394 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -86,6 +86,8 @@ struct cmos_rtc_board_info { /* 2 values for divider stage reset, others for "testing purposes only" */ # define RTC_DIV_RESET1 0x60 # define RTC_DIV_RESET2 0x70 + /* In AMD BKDG bit 5 and 6 are reserved, bit 4 is for select dv0 bank */ +# define RTC_AMD_BANK_SELECT 0x10 /* Periodic intr. / Square wave rate select. 0=none, 1=32.8kHz,... 15=2Hz */ # define RTC_RATE_SELECT 0x0F
From: Xiaoke Wang xkernel.wang@foxmail.com
[ Upstream commit 34123208bbcc8c884a0489f543a23fe9eebb5514 ]
kzalloc() is a memory allocation function which can return NULL when some internal memory errors happen. So it is better to check the return value of it to prevent potential wrong memory access or memory leak.
Signed-off-by: Xiaoke Wang xkernel.wang@foxmail.com Signed-off-by: Thomas Bogendoerfer tsbogend@alpha.franken.de Signed-off-by: Sasha Levin sashal@kernel.org --- arch/mips/lantiq/falcon/sysctrl.c | 2 ++ arch/mips/lantiq/xway/gptu.c | 2 ++ arch/mips/lantiq/xway/sysctrl.c | 46 ++++++++++++++++++++----------- 3 files changed, 34 insertions(+), 16 deletions(-)
diff --git a/arch/mips/lantiq/falcon/sysctrl.c b/arch/mips/lantiq/falcon/sysctrl.c index 037b08f3257e..a2837a54d972 100644 --- a/arch/mips/lantiq/falcon/sysctrl.c +++ b/arch/mips/lantiq/falcon/sysctrl.c @@ -167,6 +167,8 @@ static inline void clkdev_add_sys(const char *dev, unsigned int module, { struct clk *clk = kzalloc(sizeof(struct clk), GFP_KERNEL);
+ if (!clk) + return; clk->cl.dev_id = dev; clk->cl.con_id = NULL; clk->cl.clk = clk; diff --git a/arch/mips/lantiq/xway/gptu.c b/arch/mips/lantiq/xway/gptu.c index 3d5683e75cf1..200fe9ff641d 100644 --- a/arch/mips/lantiq/xway/gptu.c +++ b/arch/mips/lantiq/xway/gptu.c @@ -122,6 +122,8 @@ static inline void clkdev_add_gptu(struct device *dev, const char *con, { struct clk *clk = kzalloc(sizeof(struct clk), GFP_KERNEL);
+ if (!clk) + return; clk->cl.dev_id = dev_name(dev); clk->cl.con_id = con; clk->cl.clk = clk; diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c index 2ee68d6e8bb9..6c2d9779ac72 100644 --- a/arch/mips/lantiq/xway/sysctrl.c +++ b/arch/mips/lantiq/xway/sysctrl.c @@ -311,6 +311,8 @@ static void clkdev_add_pmu(const char *dev, const char *con, bool deactivate, { struct clk *clk = kzalloc(sizeof(struct clk), GFP_KERNEL);
+ if (!clk) + return; clk->cl.dev_id = dev; clk->cl.con_id = con; clk->cl.clk = clk; @@ -334,6 +336,8 @@ static void clkdev_add_cgu(const char *dev, const char *con, { struct clk *clk = kzalloc(sizeof(struct clk), GFP_KERNEL);
+ if (!clk) + return; clk->cl.dev_id = dev; clk->cl.con_id = con; clk->cl.clk = clk; @@ -352,24 +356,28 @@ static void clkdev_add_pci(void) struct clk *clk_ext = kzalloc(sizeof(struct clk), GFP_KERNEL);
/* main pci clock */ - clk->cl.dev_id = "17000000.pci"; - clk->cl.con_id = NULL; - clk->cl.clk = clk; - clk->rate = CLOCK_33M; - clk->rates = valid_pci_rates; - clk->enable = pci_enable; - clk->disable = pmu_disable; - clk->module = 0; - clk->bits = PMU_PCI; - clkdev_add(&clk->cl); + if (clk) { + clk->cl.dev_id = "17000000.pci"; + clk->cl.con_id = NULL; + clk->cl.clk = clk; + clk->rate = CLOCK_33M; + clk->rates = valid_pci_rates; + clk->enable = pci_enable; + clk->disable = pmu_disable; + clk->module = 0; + clk->bits = PMU_PCI; + clkdev_add(&clk->cl); + }
/* use internal/external bus clock */ - clk_ext->cl.dev_id = "17000000.pci"; - clk_ext->cl.con_id = "external"; - clk_ext->cl.clk = clk_ext; - clk_ext->enable = pci_ext_enable; - clk_ext->disable = pci_ext_disable; - clkdev_add(&clk_ext->cl); + if (clk_ext) { + clk_ext->cl.dev_id = "17000000.pci"; + clk_ext->cl.con_id = "external"; + clk_ext->cl.clk = clk_ext; + clk_ext->enable = pci_ext_enable; + clk_ext->disable = pci_ext_disable; + clkdev_add(&clk_ext->cl); + } }
/* xway socs can generate clocks on gpio pins */ @@ -389,9 +397,15 @@ static void clkdev_add_clkout(void) char *name;
name = kzalloc(sizeof("clkout0"), GFP_KERNEL); + if (!name) + continue; sprintf(name, "clkout%d", i);
clk = kzalloc(sizeof(struct clk), GFP_KERNEL); + if (!clk) { + kfree(name); + continue; + } clk->cl.dev_id = "1f103000.cgu"; clk->cl.con_id = name; clk->cl.clk = clk;
From: Jakob Koschel jakobkoschel@gmail.com
[ Upstream commit 901aeda62efa21f2eae937bccb71b49ae531be06 ]
In preparation to limit the scope of a list iterator to the list traversal loop, use a dedicated pointer to iterate through the list [1].
Since that variable should not be used past the loop iteration, a separate variable is used to 'remember the current location within the loop'.
To either continue iterating from that position or skip the iteration (if the previous iteration was complete) list_prepare_entry() is used.
Link: https://lore.kernel.org/all/CAHk-=wgRr_D8CB-D9Kg-c=EHreAsk5SqXPwr9Y7k9sA6cWX... [1] Signed-off-by: Jakob Koschel jakobkoschel@gmail.com Link: https://lore.kernel.org/r/20220331220349.885126-1-jakobkoschel@gmail.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/block/drbd/drbd_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index a18155cdce41..ba10fa24fa1f 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -183,7 +183,7 @@ void tl_release(struct drbd_connection *connection, unsigned int barrier_nr, unsigned int set_size) { struct drbd_request *r; - struct drbd_request *req = NULL; + struct drbd_request *req = NULL, *tmp = NULL; int expect_epoch = 0; int expect_size = 0;
@@ -237,8 +237,11 @@ void tl_release(struct drbd_connection *connection, unsigned int barrier_nr, * to catch requests being barrier-acked "unexpectedly". * It usually should find the same req again, or some READ preceding it. */ list_for_each_entry(req, &connection->transfer_log, tl_requests) - if (req->epoch == expect_epoch) + if (req->epoch == expect_epoch) { + tmp = req; break; + } + req = list_prepare_entry(tmp, &connection->transfer_log, tl_requests); list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) { if (req->epoch != expect_epoch) break;
From: Tzung-Bi Shih tzungbi@google.com
[ Upstream commit 0e8eb5e8acbad19ac2e1856b2fb2320184299b33 ]
Debugfs console_log uses devm memory (e.g. debug_info in cros_ec_console_log_poll()). However, lifecycles of device and debugfs are independent. An use-after-free issue is observed if userland program operates the debugfs after the memory has been freed.
The call trace: do_raw_spin_lock _raw_spin_lock_irqsave remove_wait_queue ep_unregister_pollwait ep_remove do_epoll_ctl
A Python example to reproduce the issue: ... import select ... p = select.epoll() ... f = open('/sys/kernel/debug/cros_scp/console_log') ... p.register(f, select.POLLIN) ... p.poll(1) [(4, 1)] # 4=fd, 1=select.POLLIN
[ shutdown cros_scp at the point ]
... p.poll(1) [(4, 16)] # 4=fd, 16=select.POLLHUP ... p.unregister(f)
An use-after-free issue raises here. It called epoll_ctl with EPOLL_CTL_DEL which in turn to use the workqueue in the devm (i.e. log_wq).
Detaches log reader's workqueue from devm to make sure it is persistent even if the device has been removed.
Signed-off-by: Tzung-Bi Shih tzungbi@google.com Reviewed-by: Guenter Roeck groeck@google.com Link: https://lore.kernel.org/r/20220209051130.386175-1-tzungbi@google.com Signed-off-by: Benson Leung bleung@chromium.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/platform/chrome/cros_ec_debugfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/platform/chrome/cros_ec_debugfs.c b/drivers/platform/chrome/cros_ec_debugfs.c index 6ae484989d1f..c4b57e1df192 100644 --- a/drivers/platform/chrome/cros_ec_debugfs.c +++ b/drivers/platform/chrome/cros_ec_debugfs.c @@ -26,6 +26,9 @@
#define CIRC_ADD(idx, size, value) (((idx) + (value)) & ((size) - 1))
+/* waitqueue for log readers */ +static DECLARE_WAIT_QUEUE_HEAD(cros_ec_debugfs_log_wq); + /** * struct cros_ec_debugfs - EC debugging information. * @@ -34,7 +37,6 @@ * @log_buffer: circular buffer for console log information * @read_msg: preallocated EC command and buffer to read console log * @log_mutex: mutex to protect circular buffer - * @log_wq: waitqueue for log readers * @log_poll_work: recurring task to poll EC for new console log data * @panicinfo_blob: panicinfo debugfs blob */ @@ -45,7 +47,6 @@ struct cros_ec_debugfs { struct circ_buf log_buffer; struct cros_ec_command *read_msg; struct mutex log_mutex; - wait_queue_head_t log_wq; struct delayed_work log_poll_work; /* EC panicinfo */ struct debugfs_blob_wrapper panicinfo_blob; @@ -108,7 +109,7 @@ static void cros_ec_console_log_work(struct work_struct *__work) buf_space--; }
- wake_up(&debug_info->log_wq); + wake_up(&cros_ec_debugfs_log_wq); }
mutex_unlock(&debug_info->log_mutex); @@ -142,7 +143,7 @@ static ssize_t cros_ec_console_log_read(struct file *file, char __user *buf,
mutex_unlock(&debug_info->log_mutex);
- ret = wait_event_interruptible(debug_info->log_wq, + ret = wait_event_interruptible(cros_ec_debugfs_log_wq, CIRC_CNT(cb->head, cb->tail, LOG_SIZE)); if (ret < 0) return ret; @@ -174,7 +175,7 @@ static __poll_t cros_ec_console_log_poll(struct file *file, struct cros_ec_debugfs *debug_info = file->private_data; __poll_t mask = 0;
- poll_wait(file, &debug_info->log_wq, wait); + poll_wait(file, &cros_ec_debugfs_log_wq, wait);
mutex_lock(&debug_info->log_mutex); if (CIRC_CNT(debug_info->log_buffer.head, @@ -359,7 +360,6 @@ static int cros_ec_create_console_log(struct cros_ec_debugfs *debug_info) debug_info->log_buffer.tail = 0;
mutex_init(&debug_info->log_mutex); - init_waitqueue_head(&debug_info->log_wq);
debugfs_create_file("console_log", S_IFREG | 0444, debug_info->dir, debug_info, &cros_ec_console_log_fops);
From: linyujun linyujun809@huawei.com
[ Upstream commit 9be4c88bb7924f68f88cfd47d925c2d046f51a73 ]
The following KASAN warning is detected by QEMU.
================================================================== BUG: KASAN: stack-out-of-bounds in unwind_frame+0x508/0x870 Read of size 4 at addr c36bba90 by task cat/163
CPU: 1 PID: 163 Comm: cat Not tainted 5.10.0-rc1 #40 Hardware name: ARM-Versatile Express [<c0113fac>] (unwind_backtrace) from [<c010e71c>] (show_stack+0x10/0x14) [<c010e71c>] (show_stack) from [<c0b805b4>] (dump_stack+0x98/0xb0) [<c0b805b4>] (dump_stack) from [<c0b7d658>] (print_address_description.constprop.0+0x58/0x4bc) [<c0b7d658>] (print_address_description.constprop.0) from [<c031435c>] (kasan_report+0x154/0x170) [<c031435c>] (kasan_report) from [<c0113c44>] (unwind_frame+0x508/0x870) [<c0113c44>] (unwind_frame) from [<c010e298>] (__save_stack_trace+0x110/0x134) [<c010e298>] (__save_stack_trace) from [<c01ce0d8>] (stack_trace_save+0x8c/0xb4) [<c01ce0d8>] (stack_trace_save) from [<c0313520>] (kasan_set_track+0x38/0x60) [<c0313520>] (kasan_set_track) from [<c0314cb8>] (kasan_set_free_info+0x20/0x2c) [<c0314cb8>] (kasan_set_free_info) from [<c0313474>] (__kasan_slab_free+0xec/0x120) [<c0313474>] (__kasan_slab_free) from [<c0311e20>] (kmem_cache_free+0x7c/0x334) [<c0311e20>] (kmem_cache_free) from [<c01c35dc>] (rcu_core+0x390/0xccc) [<c01c35dc>] (rcu_core) from [<c01013a8>] (__do_softirq+0x180/0x518) [<c01013a8>] (__do_softirq) from [<c0135214>] (irq_exit+0x9c/0xe0) [<c0135214>] (irq_exit) from [<c01a40e4>] (__handle_domain_irq+0xb0/0x110) [<c01a40e4>] (__handle_domain_irq) from [<c0691248>] (gic_handle_irq+0xa0/0xb8) [<c0691248>] (gic_handle_irq) from [<c0100b0c>] (__irq_svc+0x6c/0x94) Exception stack(0xc36bb928 to 0xc36bb970) b920: c36bb9c0 00000000 c0126919 c0101228 c36bb9c0 b76d7730 b940: c36b8000 c36bb9a0 c3335b00 c01ce0d8 00000003 c36bba3c c36bb940 c36bb978 b960: c010e298 c011373c 60000013 ffffffff [<c0100b0c>] (__irq_svc) from [<c011373c>] (unwind_frame+0x0/0x870) [<c011373c>] (unwind_frame) from [<00000000>] (0x0)
The buggy address belongs to the page: page:(ptrval) refcount:0 mapcount:0 mapping:00000000 index:0x0 pfn:0x636bb flags: 0x0() raw: 00000000 00000000 ef867764 00000000 00000000 00000000 ffffffff 00000000 page dumped because: kasan: bad access detected
addr c36bba90 is located in stack of task cat/163 at offset 48 in frame: stack_trace_save+0x0/0xb4
this frame has 1 object: [32, 48) 'trace'
Memory state around the buggy address: c36bb980: f1 f1 f1 f1 00 04 f2 f2 00 00 f3 f3 00 00 00 00 c36bba00: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1
c36bba80: 00 00 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00
^ c36bbb00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 c36bbb80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ==================================================================
There is a same issue on x86 and has been resolved by the commit f7d27c35ddff ("x86/mm, kasan: Silence KASAN warnings in get_wchan()"). The solution could be applied to arm architecture too.
Signed-off-by: Lin Yujun linyujun809@huawei.com Reported-by: He Ying heying24@huawei.com Signed-off-by: Russell King (Oracle) rmk+kernel@armlinux.org.uk Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm/kernel/stacktrace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index db798eac7431..824774999825 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -53,17 +53,17 @@ int notrace unwind_frame(struct stackframe *frame) return -EINVAL;
frame->sp = frame->fp; - frame->fp = *(unsigned long *)(fp); - frame->pc = *(unsigned long *)(fp + 4); + frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); + frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 4)); #else /* check current frame pointer is within bounds */ if (fp < low + 12 || fp > high - 4) return -EINVAL;
/* restore the registers from the stack frame */ - frame->fp = *(unsigned long *)(fp - 12); - frame->sp = *(unsigned long *)(fp - 8); - frame->pc = *(unsigned long *)(fp - 4); + frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp - 12)); + frame->sp = READ_ONCE_NOCHECK(*(unsigned long *)(fp - 8)); + frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp - 4)); #endif
return 0;
From: Ryusuke Konishi konishi.ryusuke@gmail.com
[ Upstream commit e897be17a441fa637cd166fc3de1445131e57692 ]
Patch series "nilfs2 lockdep warning fixes".
The first two are to resolve the lockdep warning issue, and the last one is the accompanying cleanup and low priority.
Based on your comment, this series solves the issue by separating inode object as needed. Since I was worried about the impact of the object composition changes, I tested the series carefully not to cause regressions especially for delicate functions such like disk space reclamation and snapshots.
This patch (of 3):
If CONFIG_LOCKDEP is enabled, nilfs2 hits lockdep warnings at inode_to_wb() during page/folio operations for btree nodes:
WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 inode_to_wb include/linux/backing-dev.h:269 [inline] WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 folio_account_dirtied mm/page-writeback.c:2460 [inline] WARNING: CPU: 0 PID: 6575 at include/linux/backing-dev.h:269 __folio_mark_dirty+0xa7c/0xe30 mm/page-writeback.c:2509 Modules linked in: ... RIP: 0010:inode_to_wb include/linux/backing-dev.h:269 [inline] RIP: 0010:folio_account_dirtied mm/page-writeback.c:2460 [inline] RIP: 0010:__folio_mark_dirty+0xa7c/0xe30 mm/page-writeback.c:2509 ... Call Trace: __set_page_dirty include/linux/pagemap.h:834 [inline] mark_buffer_dirty+0x4e6/0x650 fs/buffer.c:1145 nilfs_btree_propagate_p fs/nilfs2/btree.c:1889 [inline] nilfs_btree_propagate+0x4ae/0xea0 fs/nilfs2/btree.c:2085 nilfs_bmap_propagate+0x73/0x170 fs/nilfs2/bmap.c:337 nilfs_collect_dat_data+0x45/0xd0 fs/nilfs2/segment.c:625 nilfs_segctor_apply_buffers+0x14a/0x470 fs/nilfs2/segment.c:1009 nilfs_segctor_scan_file+0x47a/0x700 fs/nilfs2/segment.c:1048 nilfs_segctor_collect_blocks fs/nilfs2/segment.c:1224 [inline] nilfs_segctor_collect fs/nilfs2/segment.c:1494 [inline] nilfs_segctor_do_construct+0x14f3/0x6c60 fs/nilfs2/segment.c:2036 nilfs_segctor_construct+0x7a7/0xb30 fs/nilfs2/segment.c:2372 nilfs_segctor_thread_construct fs/nilfs2/segment.c:2480 [inline] nilfs_segctor_thread+0x3c3/0xf90 fs/nilfs2/segment.c:2563 kthread+0x405/0x4f0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295
This is because nilfs2 uses two page caches for each inode and inode->i_mapping never points to one of them, the btree node cache.
This causes inode_to_wb(inode) to refer to a different page cache than the caller page/folio operations such like __folio_start_writeback(), __folio_end_writeback(), or __folio_mark_dirty() acquired the lock.
This patch resolves the issue by allocating and using an additional inode to hold the page cache of btree nodes. The inode is attached one-to-one to the traditional nilfs2 inode if it requires a block mapping with b-tree. This setup change is in memory only and does not affect the disk format.
Link: https://lkml.kernel.org/r/1647867427-30498-1-git-send-email-konishi.ryusuke@... Link: https://lkml.kernel.org/r/1647867427-30498-2-git-send-email-konishi.ryusuke@... Link: https://lore.kernel.org/r/YXrYvIo8YRnAOJCj@casper.infradead.org Link: https://lore.kernel.org/r/9a20b33d-b38f-b4a2-4742-c1eb5b8e4d6c@redhat.com Signed-off-by: Ryusuke Konishi konishi.ryusuke@gmail.com Reported-by: syzbot+0d5b462a6f07447991b3@syzkaller.appspotmail.com Reported-by: syzbot+34ef28bb2aeb28724aa0@syzkaller.appspotmail.com Reported-by: Hao Sun sunhao.th@gmail.com Reported-by: David Hildenbrand david@redhat.com Tested-by: Ryusuke Konishi konishi.ryusuke@gmail.com Cc: Matthew Wilcox willy@infradead.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/nilfs2/btnode.c | 23 ++++++++-- fs/nilfs2/btnode.h | 1 + fs/nilfs2/btree.c | 27 ++++++++---- fs/nilfs2/gcinode.c | 7 +-- fs/nilfs2/inode.c | 104 ++++++++++++++++++++++++++++++++++++++------ fs/nilfs2/mdt.c | 7 +-- fs/nilfs2/nilfs.h | 14 +++--- fs/nilfs2/page.c | 7 ++- fs/nilfs2/segment.c | 9 ++-- fs/nilfs2/super.c | 5 +-- 10 files changed, 154 insertions(+), 50 deletions(-)
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 4391fd3abd8f..e00e184b1261 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -20,6 +20,23 @@ #include "page.h" #include "btnode.h"
+ +/** + * nilfs_init_btnc_inode - initialize B-tree node cache inode + * @btnc_inode: inode to be initialized + * + * nilfs_init_btnc_inode() sets up an inode for B-tree node cache. + */ +void nilfs_init_btnc_inode(struct inode *btnc_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(btnc_inode); + + btnc_inode->i_mode = S_IFREG; + ii->i_flags = 0; + memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap)); + mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS); +} + void nilfs_btnode_cache_clear(struct address_space *btnc) { invalidate_mapping_pages(btnc, 0, -1); @@ -29,7 +46,7 @@ void nilfs_btnode_cache_clear(struct address_space *btnc) struct buffer_head * nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) { - struct inode *inode = NILFS_BTNC_I(btnc); + struct inode *inode = btnc->host; struct buffer_head *bh;
bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node)); @@ -57,7 +74,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; - struct inode *inode = NILFS_BTNC_I(btnc); + struct inode *inode = btnc->host; struct page *page; int err;
@@ -157,7 +174,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, struct nilfs_btnode_chkey_ctxt *ctxt) { struct buffer_head *obh, *nbh; - struct inode *inode = NILFS_BTNC_I(btnc); + struct inode *inode = btnc->host; __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; int err;
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 0f88dbc9bcb3..05ab64d354dc 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -30,6 +30,7 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; };
+void nilfs_init_btnc_inode(struct inode *btnc_inode); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 23e043eca237..919d1238ce45 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -58,7 +58,8 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path) static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp) { - struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; + struct address_space *btnc = btnc_inode->i_mapping; struct buffer_head *bh;
bh = nilfs_btnode_create_block(btnc, ptr); @@ -470,7 +471,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp, const struct nilfs_btree_readahead_info *ra) { - struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; + struct address_space *btnc = btnc_inode->i_mapping; struct buffer_head *bh, *ra_bh; sector_t submit_ptr = 0; int ret; @@ -1742,6 +1744,10 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key, dat = nilfs_bmap_get_dat(btree); }
+ ret = nilfs_attach_btree_node_cache(&NILFS_BMAP_I(btree)->vfs_inode); + if (ret < 0) + return ret; + ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat); if (ret < 0) return ret; @@ -1914,7 +1920,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree, path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; path[level].bp_ctxt.bh = path[level].bp_bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); if (ret < 0) { nilfs_dat_abort_update(dat, @@ -1940,7 +1946,7 @@ static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); path[level].bp_bh = path[level].bp_ctxt.bh; } @@ -1959,7 +1965,7 @@ static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree, &path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); }
@@ -2135,7 +2141,8 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct list_head *listp) { - struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode; + struct address_space *btcache = btnc_inode->i_mapping; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; struct pagevec pvec; struct buffer_head *bh, *head; @@ -2189,12 +2196,12 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree, path[level].bp_ctxt.newkey = blocknr; path[level].bp_ctxt.bh = *bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); if (ret < 0) return ret; nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(btree)->i_btnode_cache, + NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping, &path[level].bp_ctxt); *bh = path[level].bp_ctxt.bh; } @@ -2399,6 +2406,10 @@ int nilfs_btree_init(struct nilfs_bmap *bmap)
if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode)) ret = -EIO; + else + ret = nilfs_attach_btree_node_cache( + &NILFS_BMAP_I(bmap)->vfs_inode); + return ret; }
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index aa3c328ee189..114774ac2185 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -126,9 +126,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) { + struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode; int ret;
- ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + ret = nilfs_btnode_submit_block(btnc_inode->i_mapping, vbn ? : pbn, pbn, REQ_OP_READ, 0, out_bh, &pbn); if (ret == -EEXIST) /* internal code (cache hit) */ @@ -170,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode) ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap);
- return 0; + return nilfs_attach_btree_node_cache(inode); }
/** @@ -185,7 +186,7 @@ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) ii = list_first_entry(head, struct nilfs_inode_info, i_dirty); list_del_init(&ii->i_dirty); truncate_inode_pages(&ii->vfs_inode.i_data, 0); - nilfs_btnode_cache_clear(&ii->i_btnode_cache); + nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping); iput(&ii->vfs_inode); } } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..b0a0822e371c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -28,12 +28,14 @@ * @cno: checkpoint number * @root: pointer on NILFS root object (mounted checkpoint) * @for_gc: inode for GC flag + * @for_btnc: inode for B-tree node cache flag */ struct nilfs_iget_args { u64 ino; __u64 cno; struct nilfs_root *root; - int for_gc; + bool for_gc; + bool for_btnc; };
static int nilfs_iget_test(struct inode *inode, void *opaque); @@ -322,7 +324,8 @@ static int nilfs_insert_inode_locked(struct inode *inode, unsigned long ino) { struct nilfs_iget_args args = { - .ino = ino, .root = root, .cno = 0, .for_gc = 0 + .ino = ino, .root = root, .cno = 0, .for_gc = false, + .for_btnc = false };
return insert_inode_locked4(inode, ino, nilfs_iget_test, &args); @@ -534,6 +537,13 @@ static int nilfs_iget_test(struct inode *inode, void *opaque) return 0;
ii = NILFS_I(inode); + if (test_bit(NILFS_I_BTNC, &ii->i_state)) { + if (!args->for_btnc) + return 0; + } else if (args->for_btnc) { + return 0; + } + if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) return !args->for_gc;
@@ -545,15 +555,15 @@ static int nilfs_iget_set(struct inode *inode, void *opaque) struct nilfs_iget_args *args = opaque;
inode->i_ino = args->ino; - if (args->for_gc) { + NILFS_I(inode)->i_cno = args->cno; + NILFS_I(inode)->i_root = args->root; + if (args->root && args->ino == NILFS_ROOT_INO) + nilfs_get_root(args->root); + + if (args->for_gc) NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE); - NILFS_I(inode)->i_cno = args->cno; - NILFS_I(inode)->i_root = NULL; - } else { - if (args->root && args->ino == NILFS_ROOT_INO) - nilfs_get_root(args->root); - NILFS_I(inode)->i_root = args->root; - } + if (args->for_btnc) + NILFS_I(inode)->i_state |= BIT(NILFS_I_BTNC); return 0; }
@@ -561,7 +571,8 @@ struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, unsigned long ino) { struct nilfs_iget_args args = { - .ino = ino, .root = root, .cno = 0, .for_gc = 0 + .ino = ino, .root = root, .cno = 0, .for_gc = false, + .for_btnc = false };
return ilookup5(sb, ino, nilfs_iget_test, &args); @@ -571,7 +582,8 @@ struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, unsigned long ino) { struct nilfs_iget_args args = { - .ino = ino, .root = root, .cno = 0, .for_gc = 0 + .ino = ino, .root = root, .cno = 0, .for_gc = false, + .for_btnc = false };
return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); @@ -602,7 +614,8 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno) { struct nilfs_iget_args args = { - .ino = ino, .root = NULL, .cno = cno, .for_gc = 1 + .ino = ino, .root = NULL, .cno = cno, .for_gc = true, + .for_btnc = false }; struct inode *inode; int err; @@ -622,6 +635,68 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, return inode; }
+/** + * nilfs_attach_btree_node_cache - attach a B-tree node cache to the inode + * @inode: inode object + * + * nilfs_attach_btree_node_cache() attaches a B-tree node cache to @inode, + * or does nothing if the inode already has it. This function allocates + * an additional inode to maintain page cache of B-tree nodes one-on-one. + * + * Return Value: On success, 0 is returned. On errors, one of the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_btree_node_cache(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct inode *btnc_inode; + struct nilfs_iget_args args; + + if (ii->i_assoc_inode) + return 0; + + args.ino = inode->i_ino; + args.root = ii->i_root; + args.cno = ii->i_cno; + args.for_gc = test_bit(NILFS_I_GCINODE, &ii->i_state) != 0; + args.for_btnc = true; + + btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test, + nilfs_iget_set, &args); + if (unlikely(!btnc_inode)) + return -ENOMEM; + if (btnc_inode->i_state & I_NEW) { + nilfs_init_btnc_inode(btnc_inode); + unlock_new_inode(btnc_inode); + } + NILFS_I(btnc_inode)->i_assoc_inode = inode; + NILFS_I(btnc_inode)->i_bmap = ii->i_bmap; + ii->i_assoc_inode = btnc_inode; + + return 0; +} + +/** + * nilfs_detach_btree_node_cache - detach the B-tree node cache from the inode + * @inode: inode object + * + * nilfs_detach_btree_node_cache() detaches the B-tree node cache and its + * holder inode bound to @inode, or does nothing if @inode doesn't have it. + */ +void nilfs_detach_btree_node_cache(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct inode *btnc_inode = ii->i_assoc_inode; + + if (btnc_inode) { + NILFS_I(btnc_inode)->i_assoc_inode = NULL; + ii->i_assoc_inode = NULL; + iput(btnc_inode); + } +} + void nilfs_write_inode_common(struct inode *inode, struct nilfs_inode *raw_inode, int has_bmap) { @@ -770,7 +845,8 @@ static void nilfs_clear_inode(struct inode *inode) if (test_bit(NILFS_I_BMAP, &ii->i_state)) nilfs_bmap_clear(ii->i_bmap);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache); + if (!test_bit(NILFS_I_BTNC, &ii->i_state)) + nilfs_detach_btree_node_cache(inode);
if (ii->i_root && inode->i_ino == NILFS_ROOT_INO) nilfs_put_root(ii->i_root); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 700870a92bc4..3a1200220b97 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -531,7 +531,7 @@ int nilfs_mdt_save_to_shadow_map(struct inode *inode) goto out;
ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes, - &ii->i_btnode_cache); + ii->i_assoc_inode->i_mapping); if (ret) goto out;
@@ -622,8 +622,9 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode) nilfs_clear_dirty_pages(inode->i_mapping, true); nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
- nilfs_clear_dirty_pages(&ii->i_btnode_cache, true); - nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes); + nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true); + nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping, + &shadow->frozen_btnodes);
nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 42395ba52da6..3b646d377237 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -28,7 +28,7 @@ * @i_xattr: <TODO> * @i_dir_start_lookup: page index of last successful search * @i_cno: checkpoint number for GC inode - * @i_btnode_cache: cached pages of b-tree nodes + * @i_assoc_inode: associated inode (B-tree node cache holder or back pointer) * @i_dirty: list for connecting dirty files * @xattr_sem: semaphore for extended attributes processing * @i_bh: buffer contains disk inode @@ -43,7 +43,7 @@ struct nilfs_inode_info { __u64 i_xattr; /* sector_t ??? */ __u32 i_dir_start_lookup; __u64 i_cno; /* check point number for GC inode */ - struct address_space i_btnode_cache; + struct inode *i_assoc_inode; struct list_head i_dirty; /* List for connecting dirty files */
#ifdef CONFIG_NILFS_XATTR @@ -75,13 +75,6 @@ NILFS_BMAP_I(const struct nilfs_bmap *bmap) return container_of(bmap, struct nilfs_inode_info, i_bmap_data); }
-static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) -{ - struct nilfs_inode_info *ii = - container_of(btnc, struct nilfs_inode_info, i_btnode_cache); - return &ii->vfs_inode; -} - /* * Dynamic state flags of NILFS on-memory inode (i_state) */ @@ -98,6 +91,7 @@ enum { NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ NILFS_I_BMAP, /* has bmap and btnode_cache */ NILFS_I_GCINODE, /* inode for GC, on memory only */ + NILFS_I_BTNC, /* inode for btree node cache */ };
/* @@ -264,6 +258,8 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, unsigned long ino); extern struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno); +int nilfs_attach_btree_node_cache(struct inode *inode); +void nilfs_detach_btree_node_cache(struct inode *inode); extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index d7fc8d369d89..98d21ad9a073 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -450,10 +450,9 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) /* * NILFS2 needs clear_page_dirty() in the following two cases: * - * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears - * page dirty flags when it copies back pages from the shadow cache - * (gcdat->{i_mapping,i_btnode_cache}) to its original cache - * (dat->{i_mapping,i_btnode_cache}). + * 1) For B-tree node pages and data pages of DAT file, NILFS2 clears dirty + * flag of pages when it copies back pages from shadow cache to the + * original cache. * * 2) Some B-tree operations like insertion or deletion may dispose buffers * in dirty state, and this needs to cancel the dirty state of their pages. diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 91b58c897f92..eb3ac7619088 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -738,15 +738,18 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, struct list_head *listp) { struct nilfs_inode_info *ii = NILFS_I(inode); - struct address_space *mapping = &ii->i_btnode_cache; + struct inode *btnc_inode = ii->i_assoc_inode; struct pagevec pvec; struct buffer_head *bh, *head; unsigned int i; pgoff_t index = 0;
+ if (!btnc_inode) + return; + pagevec_init(&pvec);
- while (pagevec_lookup_tag(&pvec, mapping, &index, + while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index, PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); @@ -2410,7 +2413,7 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) continue; list_del_init(&ii->i_dirty); truncate_inode_pages(&ii->vfs_inode.i_data, 0); - nilfs_btnode_cache_clear(&ii->i_btnode_cache); + nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping); iput(&ii->vfs_inode); } } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 5729ee86da9a..b5bc9f0c6a40 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -151,7 +151,8 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; - nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); + ii->i_assoc_inode = NULL; + ii->i_bmap = &ii->i_bmap_data; return &ii->vfs_inode; }
@@ -1375,8 +1376,6 @@ static void nilfs_inode_init_once(void *obj) #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif - address_space_init_once(&ii->i_btnode_cache); - ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); }
From: Ryusuke Konishi konishi.ryusuke@gmail.com
[ Upstream commit 6e211930f79aa45d422009a5f2e5467d2369ffe5 ]
During disk space reclamation, nilfs2 still emits the following lockdep warning due to page/folio operations on shadowed page caches that nilfs2 uses to get a snapshot of DAT file in memory:
WARNING: CPU: 0 PID: 2643 at include/linux/backing-dev.h:272 __folio_mark_dirty+0x645/0x670 ... RIP: 0010:__folio_mark_dirty+0x645/0x670 ... Call Trace: filemap_dirty_folio+0x74/0xd0 __set_page_dirty_nobuffers+0x85/0xb0 nilfs_copy_dirty_pages+0x288/0x510 [nilfs2] nilfs_mdt_save_to_shadow_map+0x50/0xe0 [nilfs2] nilfs_clean_segments+0xee/0x5d0 [nilfs2] nilfs_ioctl_clean_segments.isra.19+0xb08/0xf40 [nilfs2] nilfs_ioctl+0xc52/0xfb0 [nilfs2] __x64_sys_ioctl+0x11d/0x170
This fixes the remaining warning by using inode objects to hold those page caches.
Link: https://lkml.kernel.org/r/1647867427-30498-3-git-send-email-konishi.ryusuke@... Signed-off-by: Ryusuke Konishi konishi.ryusuke@gmail.com Tested-by: Ryusuke Konishi konishi.ryusuke@gmail.com Cc: Matthew Wilcox willy@infradead.org Cc: David Hildenbrand david@redhat.com Cc: Hao Sun sunhao.th@gmail.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/nilfs2/dat.c | 4 ++- fs/nilfs2/inode.c | 63 ++++++++++++++++++++++++++++++++++++++++++++--- fs/nilfs2/mdt.c | 38 +++++++++++++++++++--------- fs/nilfs2/mdt.h | 6 ++--- fs/nilfs2/nilfs.h | 2 ++ 5 files changed, 92 insertions(+), 21 deletions(-)
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 6f4066636be9..a3523a243e11 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -497,7 +497,9 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size, di = NILFS_DAT_I(dat); lockdep_set_class(&di->mi.mi_sem, &dat_lock_key); nilfs_palloc_setup_cache(dat, &di->palloc_cache); - nilfs_mdt_setup_shadow_map(dat, &di->shadow); + err = nilfs_mdt_setup_shadow_map(dat, &di->shadow); + if (err) + goto failed;
err = nilfs_read_inode_common(dat, raw_inode); if (err) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b0a0822e371c..35b0bfe9324f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -29,6 +29,7 @@ * @root: pointer on NILFS root object (mounted checkpoint) * @for_gc: inode for GC flag * @for_btnc: inode for B-tree node cache flag + * @for_shadow: inode for shadowed page cache flag */ struct nilfs_iget_args { u64 ino; @@ -36,6 +37,7 @@ struct nilfs_iget_args { struct nilfs_root *root; bool for_gc; bool for_btnc; + bool for_shadow; };
static int nilfs_iget_test(struct inode *inode, void *opaque); @@ -325,7 +327,7 @@ static int nilfs_insert_inode_locked(struct inode *inode, { struct nilfs_iget_args args = { .ino = ino, .root = root, .cno = 0, .for_gc = false, - .for_btnc = false + .for_btnc = false, .for_shadow = false };
return insert_inode_locked4(inode, ino, nilfs_iget_test, &args); @@ -543,6 +545,12 @@ static int nilfs_iget_test(struct inode *inode, void *opaque) } else if (args->for_btnc) { return 0; } + if (test_bit(NILFS_I_SHADOW, &ii->i_state)) { + if (!args->for_shadow) + return 0; + } else if (args->for_shadow) { + return 0; + }
if (!test_bit(NILFS_I_GCINODE, &ii->i_state)) return !args->for_gc; @@ -564,6 +572,8 @@ static int nilfs_iget_set(struct inode *inode, void *opaque) NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE); if (args->for_btnc) NILFS_I(inode)->i_state |= BIT(NILFS_I_BTNC); + if (args->for_shadow) + NILFS_I(inode)->i_state |= BIT(NILFS_I_SHADOW); return 0; }
@@ -572,7 +582,7 @@ struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root, { struct nilfs_iget_args args = { .ino = ino, .root = root, .cno = 0, .for_gc = false, - .for_btnc = false + .for_btnc = false, .for_shadow = false };
return ilookup5(sb, ino, nilfs_iget_test, &args); @@ -583,7 +593,7 @@ struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root, { struct nilfs_iget_args args = { .ino = ino, .root = root, .cno = 0, .for_gc = false, - .for_btnc = false + .for_btnc = false, .for_shadow = false };
return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args); @@ -615,7 +625,7 @@ struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, { struct nilfs_iget_args args = { .ino = ino, .root = NULL, .cno = cno, .for_gc = true, - .for_btnc = false + .for_btnc = false, .for_shadow = false }; struct inode *inode; int err; @@ -662,6 +672,7 @@ int nilfs_attach_btree_node_cache(struct inode *inode) args.cno = ii->i_cno; args.for_gc = test_bit(NILFS_I_GCINODE, &ii->i_state) != 0; args.for_btnc = true; + args.for_shadow = test_bit(NILFS_I_SHADOW, &ii->i_state) != 0;
btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test, nilfs_iget_set, &args); @@ -697,6 +708,50 @@ void nilfs_detach_btree_node_cache(struct inode *inode) } }
+/** + * nilfs_iget_for_shadow - obtain inode for shadow mapping + * @inode: inode object that uses shadow mapping + * + * nilfs_iget_for_shadow() allocates a pair of inodes that holds page + * caches for shadow mapping. The page cache for data pages is set up + * in one inode and the one for b-tree node pages is set up in the + * other inode, which is attached to the former inode. + * + * Return Value: On success, a pointer to the inode for data pages is + * returned. On errors, one of the following negative error code is returned + * in a pointer type. + * + * %-ENOMEM - Insufficient memory available. + */ +struct inode *nilfs_iget_for_shadow(struct inode *inode) +{ + struct nilfs_iget_args args = { + .ino = inode->i_ino, .root = NULL, .cno = 0, .for_gc = false, + .for_btnc = false, .for_shadow = true + }; + struct inode *s_inode; + int err; + + s_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test, + nilfs_iget_set, &args); + if (unlikely(!s_inode)) + return ERR_PTR(-ENOMEM); + if (!(s_inode->i_state & I_NEW)) + return inode; + + NILFS_I(s_inode)->i_flags = 0; + memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap)); + mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS); + + err = nilfs_attach_btree_node_cache(s_inode); + if (unlikely(err)) { + iget_failed(s_inode); + return ERR_PTR(err); + } + unlock_new_inode(s_inode); + return s_inode; +} + void nilfs_write_inode_common(struct inode *inode, struct nilfs_inode *raw_inode, int has_bmap) { diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 3a1200220b97..7c9055d767d1 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -469,9 +469,18 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) void nilfs_mdt_clear(struct inode *inode) { struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + struct nilfs_shadow_map *shadow = mdi->mi_shadow;
if (mdi->mi_palloc_cache) nilfs_palloc_destroy_cache(inode); + + if (shadow) { + struct inode *s_inode = shadow->inode; + + shadow->inode = NULL; + iput(s_inode); + mdi->mi_shadow = NULL; + } }
/** @@ -505,12 +514,15 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); + struct inode *s_inode;
INIT_LIST_HEAD(&shadow->frozen_buffers); - address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, inode); - address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, inode); + + s_inode = nilfs_iget_for_shadow(inode); + if (IS_ERR(s_inode)) + return PTR_ERR(s_inode); + + shadow->inode = s_inode; mi->mi_shadow = shadow; return 0; } @@ -524,13 +536,14 @@ int nilfs_mdt_save_to_shadow_map(struct inode *inode) struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_inode_info *ii = NILFS_I(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; + struct inode *s_inode = shadow->inode; int ret;
- ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping); + ret = nilfs_copy_dirty_pages(s_inode->i_mapping, inode->i_mapping); if (ret) goto out;
- ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes, + ret = nilfs_copy_dirty_pages(NILFS_I(s_inode)->i_assoc_inode->i_mapping, ii->i_assoc_inode->i_mapping); if (ret) goto out; @@ -547,7 +560,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int blkbits = inode->i_blkbits;
- page = grab_cache_page(&shadow->frozen_data, bh->b_page->index); + page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index); if (!page) return -ENOMEM;
@@ -579,7 +592,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) struct page *page; int n;
- page = find_lock_page(&shadow->frozen_data, bh->b_page->index); + page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index); if (page) { if (page_has_buffers(page)) { n = bh_offset(bh) >> inode->i_blkbits; @@ -620,11 +633,11 @@ void nilfs_mdt_restore_from_shadow_map(struct inode *inode) nilfs_palloc_clear_cache(inode);
nilfs_clear_dirty_pages(inode->i_mapping, true); - nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data); + nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping);
nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true); nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping, - &shadow->frozen_btnodes); + NILFS_I(shadow->inode)->i_assoc_inode->i_mapping);
nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
@@ -639,10 +652,11 @@ void nilfs_mdt_clear_shadow_map(struct inode *inode) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); struct nilfs_shadow_map *shadow = mi->mi_shadow; + struct inode *shadow_btnc_inode = NILFS_I(shadow->inode)->i_assoc_inode;
down_write(&mi->mi_sem); nilfs_release_frozen_buffers(shadow); - truncate_inode_pages(&shadow->frozen_data, 0); - truncate_inode_pages(&shadow->frozen_btnodes, 0); + truncate_inode_pages(shadow->inode->i_mapping, 0); + truncate_inode_pages(shadow_btnc_inode->i_mapping, 0); up_write(&mi->mi_sem); } diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index e77aea4bb921..9d8ac0d27c16 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -18,14 +18,12 @@ /** * struct nilfs_shadow_map - shadow mapping of meta data file * @bmap_store: shadow copy of bmap state - * @frozen_data: shadowed dirty data pages - * @frozen_btnodes: shadowed dirty b-tree nodes' pages + * @inode: holder of page caches used in shadow mapping * @frozen_buffers: list of frozen buffers */ struct nilfs_shadow_map { struct nilfs_bmap_store bmap_store; - struct address_space frozen_data; - struct address_space frozen_btnodes; + struct inode *inode; struct list_head frozen_buffers; };
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 3b646d377237..4895d978369a 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -92,6 +92,7 @@ enum { NILFS_I_BMAP, /* has bmap and btnode_cache */ NILFS_I_GCINODE, /* inode for GC, on memory only */ NILFS_I_BTNC, /* inode for btree node cache */ + NILFS_I_SHADOW, /* inode for shadowed page cache */ };
/* @@ -260,6 +261,7 @@ extern struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno); int nilfs_attach_btree_node_cache(struct inode *inode); void nilfs_detach_btree_node_cache(struct inode *inode); +struct inode *nilfs_iget_for_shadow(struct inode *inode); extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *);
linux-stable-mirror@lists.linaro.org