From: Jonas Karlman jonas@kwiboo.se
[ Upstream commit fc8670d1f72b746ff3a5fe441f1fca4c4dba0e6f ]
media_device_cleanup() and v4l2_m2m_unregister_media_controller() were missing in the probe error path. While at it, re-order calls in the remove path to unregister/cleanup things in the reverse order they were initialized/registered.
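For illustration only, here is a minimal userspace sketch of the pattern this patch restores (the names are hypothetical, not the driver's): each error label in probe undoes only the steps that already succeeded, in reverse order, and the remove path mirrors that same reverse order.

#include <stdio.h>

/* Illustrative stand-ins for the real init/cleanup pairs (hypothetical names). */
static int init_a(void)     { puts("init a");    return 0; }
static int init_b(void)     { puts("init b");    return -1; /* simulate failure */ }
static void cleanup_a(void) { puts("cleanup a"); }
static void cleanup_b(void) { puts("cleanup b"); }

static int driver_probe(void)
{
	int ret;

	ret = init_a();
	if (ret)
		goto err_out;
	ret = init_b();
	if (ret)
		goto err_cleanup_a;	/* undo only what already succeeded */
	return 0;

err_cleanup_a:
	cleanup_a();
err_out:
	return ret;
}

static void driver_remove(void)
{
	/* Mirror of probe: tear down in the reverse order of setup. */
	cleanup_b();
	cleanup_a();
}

int main(void)
{
	if (driver_probe()) {
		puts("probe failed, partial state undone");
		return 1;
	}
	driver_remove();
	return 0;
}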
Signed-off-by: Jonas Karlman jonas@kwiboo.se Signed-off-by: Boris Brezillon boris.brezillon@collabora.com Signed-off-by: Hans Verkuil hverkuil-cisco@xs4all.nl Signed-off-by: Mauro Carvalho Chehab mchehab+samsung@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c b/drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c index 962412c79b917..33b556b3f0df8 100644 --- a/drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c +++ b/drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c @@ -481,10 +481,12 @@ static int rockchip_vpu_probe(struct platform_device *pdev) return 0; err_video_dev_unreg: if (vpu->vfd_enc) { + v4l2_m2m_unregister_media_controller(vpu->m2m_dev); video_unregister_device(vpu->vfd_enc); video_device_release(vpu->vfd_enc); } err_m2m_rel: + media_device_cleanup(&vpu->mdev); v4l2_m2m_release(vpu->m2m_dev); err_v4l2_unreg: v4l2_device_unregister(&vpu->v4l2_dev); @@ -501,13 +503,13 @@ static int rockchip_vpu_remove(struct platform_device *pdev) v4l2_info(&vpu->v4l2_dev, "Removing %s\n", pdev->name);
media_device_unregister(&vpu->mdev); - v4l2_m2m_unregister_media_controller(vpu->m2m_dev); - v4l2_m2m_release(vpu->m2m_dev); - media_device_cleanup(&vpu->mdev); if (vpu->vfd_enc) { + v4l2_m2m_unregister_media_controller(vpu->m2m_dev); video_unregister_device(vpu->vfd_enc); video_device_release(vpu->vfd_enc); } + media_device_cleanup(&vpu->mdev); + v4l2_m2m_release(vpu->m2m_dev); v4l2_device_unregister(&vpu->v4l2_dev); clk_bulk_unprepare(vpu->variant->num_clocks, vpu->clocks); pm_runtime_disable(vpu->dev);
From: Jonas Karlman jonas@kwiboo.se
[ Upstream commit 5c5b90f5cbad77dc15d8b5582efdb2e362bcd710 ]
Those calls are needed to restore a clean PM state when the probe fails or when the driver is unloaded such that future ->probe() calls can initialize runtime PM again.
Signed-off-by: Jonas Karlman jonas@kwiboo.se Signed-off-by: Boris Brezillon boris.brezillon@collabora.com Signed-off-by: Hans Verkuil hverkuil-cisco@xs4all.nl Signed-off-by: Mauro Carvalho Chehab mchehab+samsung@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c b/drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c index 33b556b3f0df8..d489b5dd54d7a 100644 --- a/drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c +++ b/drivers/staging/media/rockchip/vpu/rockchip_vpu_drv.c @@ -492,6 +492,7 @@ static int rockchip_vpu_probe(struct platform_device *pdev) v4l2_device_unregister(&vpu->v4l2_dev); err_clk_unprepare: clk_bulk_unprepare(vpu->variant->num_clocks, vpu->clocks); + pm_runtime_dont_use_autosuspend(vpu->dev); pm_runtime_disable(vpu->dev); return ret; } @@ -512,6 +513,7 @@ static int rockchip_vpu_remove(struct platform_device *pdev) v4l2_m2m_release(vpu->m2m_dev); v4l2_device_unregister(&vpu->v4l2_dev); clk_bulk_unprepare(vpu->variant->num_clocks, vpu->clocks); + pm_runtime_dont_use_autosuspend(vpu->dev); pm_runtime_disable(vpu->dev); return 0; }
From: Kangjie Lu kjlu@umn.edu
[ Upstream commit 23015b22e47c5409620b1726a677d69e5cd032ba ]
In case create_workqueue() fails, the fix releases resources and returns -ENOMEM to avoid a NULL pointer dereference.
Signed-off-by: Kangjie Lu kjlu@umn.edu Acked-by: Alexandre Bounine alex.bou9@gmail.com Cc: Matt Porter mporter@kernel.crashing.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/rapidio/rio_cm.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c index bad0e0ea4f305..ef989a15aefc4 100644 --- a/drivers/rapidio/rio_cm.c +++ b/drivers/rapidio/rio_cm.c @@ -2145,6 +2145,14 @@ static int riocm_add_mport(struct device *dev, mutex_init(&cm->rx_lock); riocm_rx_fill(cm, RIOCM_RX_RING_SIZE); cm->rx_wq = create_workqueue(DRV_NAME "/rxq"); + if (!cm->rx_wq) { + riocm_error("failed to allocate IBMBOX_%d on %s", + cmbox, mport->name); + rio_release_outb_mbox(mport, cmbox); + kfree(cm); + return -ENOMEM; + } + INIT_WORK(&cm->rx_work, rio_ibmsg_handler);
cm->tx_slot = 0;
From: Hou Tao houtao1@huawei.com
[ Upstream commit bd8309de0d60838eef6fb575b0c4c7e95841cf73 ]
fsync() needs to make sure the data & metadata of a file are persistent after fsync() returns, even when a power failure occurs later. In the case of fat-fs, the FAT belongs to the metadata of the file, so we need to issue a flush after the writeback of the FAT, not before it.
Also bail out early when any stage of fsync fails.
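As a rough userspace analogy of that ordering (an illustrative sketch, not the fat_file_fsync() code itself): persist the primary object first, then the dependent metadata, and propagate the first error instead of continuing.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Userspace analogy of "flush in order, bail out on the first error":
 * persist the file's data/metadata first, then the directory entry.
 * The kernel patch does the analogous thing inside fat_file_fsync().
 */
static int persist_file(const char *dir, const char *file)
{
	int dfd, fd, err = 0;

	fd = open(file, O_WRONLY);
	if (fd < 0)
		return -errno;
	if (fsync(fd) < 0)		/* data + inode metadata */
		err = -errno;
	close(fd);
	if (err)
		return err;		/* bail out early, like the patch */

	dfd = open(dir, O_RDONLY | O_DIRECTORY);
	if (dfd < 0)
		return -errno;
	if (fsync(dfd) < 0)		/* dependent metadata, akin to the FAT */
		err = -errno;
	close(dfd);
	return err;
}

int main(void)
{
	int err = persist_file(".", "test.txt");

	if (err)
		fprintf(stderr, "persist failed: %s\n", strerror(-err));
	return err ? 1 : 0;
}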
Link: http://lkml.kernel.org/r/20190409030158.136316-1-houtao1@huawei.com Signed-off-by: Hou Tao houtao1@huawei.com Acked-by: OGAWA Hirofumi hirofumi@mail.parknet.co.jp Cc: Al Viro viro@zeniv.linux.org.uk Cc: Jan Kara jack@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/fat/file.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/fs/fat/file.c b/fs/fat/file.c index 13935ee99e1e5..87baa023f8316 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -193,12 +193,17 @@ static int fat_file_release(struct inode *inode, struct file *filp) int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; - int res, err; + int err; + + err = __generic_file_fsync(filp, start, end, datasync); + if (err) + return err;
- res = generic_file_fsync(filp, start, end, datasync); err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); + if (err) + return err;
- return res ? res : err; + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); }
From: Christian Brauner christian@brauner.io
[ Upstream commit e260ad01f0aa9e96b5386d5cd7184afd949dc457 ]
Currently, when userspace gives us a value that overflows e.g. file-max or other callers of __do_proc_doulongvec_minmax(), we simply ignore the new value and leave the current value untouched.
This can be problematic as it gives the illusion that the limit has indeed been bumped when in fact the write failed. This commit makes sure to return EINVAL when an overflow is detected. Please note that this is a userspace-facing change.
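A minimal sketch of the behavioural change (illustrative userspace code, not the kernel parser; the helper name is made up): an out-of-range value used to be silently skipped, leaving the old setting in place, and now the write fails with -EINVAL.

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for the min/max check in __do_proc_doulongvec_minmax(). */
static int store_ulong(unsigned long *target, unsigned long val,
		       const unsigned long *min, const unsigned long *max)
{
	if ((min && val < *min) || (max && val > *max))
		return -EINVAL;	/* previously: value silently ignored */
	*target = val;
	return 0;
}

int main(void)
{
	unsigned long file_max = 8192, max = 1UL << 20;
	int err = store_ulong(&file_max, ~0UL, NULL, &max);

	/* err=-22, file_max unchanged: the caller now sees the failure. */
	printf("err=%d file_max=%lu\n", err, file_max);
	return 0;
}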
Link: http://lkml.kernel.org/r/20190210203943.8227-4-christian@brauner.io Signed-off-by: Christian Brauner christian@brauner.io Acked-by: Luis Chamberlain mcgrof@kernel.org Cc: Kees Cook keescook@chromium.org Cc: Alexey Dobriyan adobriyan@gmail.com Cc: Al Viro viro@zeniv.linux.org.uk Cc: Dominik Brodowski linux@dominikbrodowski.net Cc: "Eric W. Biederman" ebiederm@xmission.com Cc: Joe Lawrence joe.lawrence@redhat.com Cc: Waiman Long longman@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/sysctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f50f1471c1199..9664d5ae2de10 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2818,8 +2818,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul;
From: Li Rongqing lirongqing@baidu.com
[ Upstream commit d6a2946a88f524a47cc9b79279667137899db807 ]
The msgctl10 test of LTP triggers the following lockup. When CONFIG_KASAN is enabled on large-memory SMP systems, page initialization can take a long time if msgctl10 requests a huge block of memory, and it will stall the RCU scheduler, so release the CPU actively.
After adding a reschedule point (cond_resched()) in free_msg(), free_msg() can no longer be called while holding a spinlock, so add the messages to a temporary list and free them outside the spinlock.
rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: rcu: Tasks blocked on level-1 rcu_node (CPUs 16-31): P32505 rcu: Tasks blocked on level-1 rcu_node (CPUs 48-63): P34978 rcu: (detected by 11, t=35024 jiffies, g=44237529, q=16542267) msgctl10 R running task 21608 32505 2794 0x00000082 Call Trace: preempt_schedule_irq+0x4c/0xb0 retint_kernel+0x1b/0x2d RIP: 0010:__is_insn_slot_addr+0xfb/0x250 Code: 82 1d 00 48 8b 9b 90 00 00 00 4c 89 f7 49 c1 ee 03 e8 59 83 1d 00 48 b8 00 00 00 00 00 fc ff df 4c 39 eb 48 89 9d 58 ff ff ff <41> c6 04 06 f8 74 66 4c 8d 75 98 4c 89 f1 48 c1 e9 03 48 01 c8 48 RSP: 0018:ffff88bce041f758 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: dffffc0000000000 RBX: ffffffff8471bc50 RCX: ffffffff828a2a57 RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff88bce041f780 RBP: ffff88bce041f828 R08: ffffed15f3f4c5b3 R09: ffffed15f3f4c5b3 R10: 0000000000000001 R11: ffffed15f3f4c5b2 R12: 000000318aee9b73 R13: ffffffff8471bc50 R14: 1ffff1179c083ef0 R15: 1ffff1179c083eec kernel_text_address+0xc1/0x100 __kernel_text_address+0xe/0x30 unwind_get_return_address+0x2f/0x50 __save_stack_trace+0x92/0x100 create_object+0x380/0x650 __kmalloc+0x14c/0x2b0 load_msg+0x38/0x1a0 do_msgsnd+0x19e/0xcf0 do_syscall_64+0x117/0x400 entry_SYSCALL_64_after_hwframe+0x49/0xbe
rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: rcu: Tasks blocked on level-1 rcu_node (CPUs 0-15): P32170 rcu: (detected by 14, t=35016 jiffies, g=44237525, q=12423063) msgctl10 R running task 21608 32170 32155 0x00000082 Call Trace: preempt_schedule_irq+0x4c/0xb0 retint_kernel+0x1b/0x2d RIP: 0010:lock_acquire+0x4d/0x340 Code: 48 81 ec c0 00 00 00 45 89 c6 4d 89 cf 48 8d 6c 24 20 48 89 3c 24 48 8d bb e4 0c 00 00 89 74 24 0c 48 c7 44 24 20 b3 8a b5 41 <48> c1 ed 03 48 c7 44 24 28 b4 25 18 84 48 c7 44 24 30 d0 54 7a 82 RSP: 0018:ffff88af83417738 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13 RAX: dffffc0000000000 RBX: ffff88bd335f3080 RCX: 0000000000000002 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88bd335f3d64 RBP: ffff88af83417758 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: ffffed13f3f745b2 R12: 0000000000000000 R13: 0000000000000002 R14: 0000000000000000 R15: 0000000000000000 is_bpf_text_address+0x32/0xe0 kernel_text_address+0xec/0x100 __kernel_text_address+0xe/0x30 unwind_get_return_address+0x2f/0x50 __save_stack_trace+0x92/0x100 save_stack+0x32/0xb0 __kasan_slab_free+0x130/0x180 kfree+0xfa/0x2d0 free_msg+0x24/0x50 do_msgrcv+0x508/0xe60 do_syscall_64+0x117/0x400 entry_SYSCALL_64_after_hwframe+0x49/0xbe
Davidlohr said: "So after releasing the lock, the msg rbtree/list is empty and new calls will not see those in the newly populated tmp_msg list, and therefore they cannot access the delayed msg freeing pointers, which is good. Also the fact that the node_cache is now freed before the actual messages seems to be harmless as this is wanted for msg_insert() avoiding GFP_ATOMIC allocations, and after releasing the info->lock the thing is freed anyway so it should not change things"
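For illustration, a minimal userspace sketch of the pattern described above (hypothetical names, not the mqueue code): detach everything while holding the lock, then do the potentially slow frees outside it.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct msg {
	struct msg *next;
	char payload[64];
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct msg *queue;

static void evict_all(void)
{
	struct msg *tmp_list = NULL, *m;

	pthread_mutex_lock(&lock);
	while ((m = queue) != NULL) {		/* detach under the lock */
		queue = m->next;
		m->next = tmp_list;
		tmp_list = m;
	}
	pthread_mutex_unlock(&lock);

	while ((m = tmp_list) != NULL) {	/* free outside the lock */
		tmp_list = m->next;
		free(m);			/* slow freeing may now reschedule safely */
	}
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		struct msg *m = calloc(1, sizeof(*m));
		m->next = queue;
		queue = m;
	}
	evict_all();
	puts("queue drained");
	return 0;
}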
Link: http://lkml.kernel.org/r/1552029161-4957-1-git-send-email-lirongqing@baidu.c... Signed-off-by: Li RongQing lirongqing@baidu.com Signed-off-by: Zhang Yu zhangyu31@baidu.com Reviewed-by: Davidlohr Bueso dbueso@suse.de Cc: Manfred Spraul manfred@colorfullife.com Cc: Arnd Bergmann arnd@arndb.de Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- ipc/mqueue.c | 10 ++++++++-- ipc/msgutil.c | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c595bed7bfcbc..43f8ddd637159 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -391,7 +391,8 @@ static void mqueue_evict_inode(struct inode *inode) struct user_struct *user; unsigned long mq_bytes, mq_treesize; struct ipc_namespace *ipc_ns; - struct msg_msg *msg; + struct msg_msg *msg, *nmsg; + LIST_HEAD(tmp_msg);
clear_inode(inode);
@@ -402,10 +403,15 @@ static void mqueue_evict_inode(struct inode *inode) info = MQUEUE_I(inode); spin_lock(&info->lock); while ((msg = msg_get(info)) != NULL) - free_msg(msg); + list_add_tail(&msg->m_list, &tmp_msg); kfree(info->node_cache); spin_unlock(&info->lock);
+ list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) { + list_del(&msg->m_list); + free_msg(msg); + } + /* Total amount of bytes accounted for the mqueue */ mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 84598025a6ade..e65593742e2be 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -18,6 +18,7 @@ #include <linux/utsname.h> #include <linux/proc_ns.h> #include <linux/uaccess.h> +#include <linux/sched.h>
#include "util.h"
@@ -64,6 +65,9 @@ static struct msg_msg *alloc_msg(size_t len) pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; + + cond_resched(); + alen = min(len, DATALEN_SEG); seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT); if (seg == NULL) @@ -176,6 +180,8 @@ void free_msg(struct msg_msg *msg) kfree(msg); while (seg != NULL) { struct msg_msgseg *tmp = seg->next; + + cond_resched(); kfree(seg); seg = tmp; }
From: Brian Masney masneyb@onstation.org
[ Upstream commit 90f94660e53189755676543954101de78c26253b ]
msm_gem_describe() would attempt to dereference a NULL pointer via the address space pointer when no IOMMU is present. Correct this by adding the appropriate check.
Signed-off-by: Brian Masney masneyb@onstation.org Fixes: 575f0485508b ("drm/msm: Clean up and enhance the output of the 'gem' debugfs node") Signed-off-by: Sean Paul seanpaul@chromium.org Link: https://patchwork.freedesktop.org/patch/msgid/20190513234105.7531-2-masneyb@... Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/msm/msm_gem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index c8886d3071fa3..bdf0089771bde 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -805,7 +805,8 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m) seq_puts(m, " vmas:");
list_for_each_entry(vma, &msm_obj->vmas, list) - seq_printf(m, " [%s: %08llx,%s,inuse=%d]", vma->aspace->name, + seq_printf(m, " [%s: %08llx,%s,inuse=%d]", + vma->aspace != NULL ? vma->aspace->name : NULL, vma->iova, vma->mapped ? "mapped" : "unmapped", vma->inuse);
From: Guenter Roeck linux@roeck-us.net
[ Upstream commit 3e01ae2612bdd7975c74ec7123d7f8f5e6eed795 ]
The following warning is seen on systems with a broken clock divider.
INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 0 PID: 1 Comm: swapper Not tainted 5.1.0-09698-g1fb3b52 #1 Hardware name: ARM Integrator/CP (Device Tree) [<c0011be8>] (unwind_backtrace) from [<c000ebb8>] (show_stack+0x10/0x18) [<c000ebb8>] (show_stack) from [<c07d3fd0>] (dump_stack+0x18/0x24) [<c07d3fd0>] (dump_stack) from [<c0060d48>] (register_lock_class+0x674/0x6f8) [<c0060d48>] (register_lock_class) from [<c005de2c>] (__lock_acquire+0x68/0x2128) [<c005de2c>] (__lock_acquire) from [<c0060408>] (lock_acquire+0x110/0x21c) [<c0060408>] (lock_acquire) from [<c07f755c>] (_raw_spin_lock+0x34/0x48) [<c07f755c>] (_raw_spin_lock) from [<c0536c8c>] (pl111_display_enable+0xf8/0x5fc) [<c0536c8c>] (pl111_display_enable) from [<c0502f54>] (drm_atomic_helper_commit_modeset_enables+0x1ec/0x244)
Since commit eedd6033b4c8 ("drm/pl111: Support variants with broken clock divider"), the spinlock is not initialized if the clock divider is broken. Initialize it earlier to fix the problem.
Fixes: eedd6033b4c8 ("drm/pl111: Support variants with broken clock divider") Cc: Linus Walleij linus.walleij@linaro.org Signed-off-by: Guenter Roeck linux@roeck-us.net Signed-off-by: Linus Walleij linus.walleij@linaro.org Link: https://patchwork.freedesktop.org/patch/msgid/1557758781-23586-1-git-send-em... Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/pl111/pl111_display.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/pl111/pl111_display.c b/drivers/gpu/drm/pl111/pl111_display.c index 754f6b25f2652..6d9f78612deeb 100644 --- a/drivers/gpu/drm/pl111/pl111_display.c +++ b/drivers/gpu/drm/pl111/pl111_display.c @@ -531,14 +531,15 @@ pl111_init_clock_divider(struct drm_device *drm) dev_err(drm->dev, "CLCD: unable to get clcdclk.\n"); return PTR_ERR(parent); } + + spin_lock_init(&priv->tim2_lock); + /* If the clock divider is broken, use the parent directly */ if (priv->variant->broken_clockdivider) { priv->clk = parent; return 0; } parent_name = __clk_get_name(parent); - - spin_lock_init(&priv->tim2_lock); div->init = &init;
ret = devm_clk_hw_register(drm->dev, div);
From: Arnd Bergmann arnd@arndb.de
[ Upstream commit be167862ae7dd85c56d385209a4890678e1b0488 ]
Patch series "compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING", v3.
This patch (of 11):
When function tracing for IPIs is enabled, we get a warning for an overflow of the ipi_types array with the IPI_CPU_BACKTRACE type as triggered by raise_nmi():
arch/arm/kernel/smp.c: In function 'raise_nmi':
arch/arm/kernel/smp.c:489:2: error: array subscript is above array bounds [-Werror=array-bounds]
  trace_ipi_raise(target, ipi_types[ipinr]);
This is a correct warning as we actually overflow the array here.
This patch changes raise_nmi() to call __smp_cross_call() instead of smp_cross_call(), to avoid calling into ftrace. For clarification, I'm also adding two new code comments describing how this one is special.
The warning appears to have shown up after commit e7273ff49acf ("ARM: 8488/1: Make IPI_CPU_BACKTRACE a "non-secure" SGI"), which changed the number assignment from '15' to '8', but as far as I can tell has existed since the IPI tracepoints were first introduced. If we decide to backport this patch to stable kernels, we probably need to backport e7273ff49acf as well.
[yamada.masahiro@socionext.com: rebase on v5.1-rc1] Link: http://lkml.kernel.org/r/20190423034959.13525-2-yamada.masahiro@socionext.co... Fixes: e7273ff49acf ("ARM: 8488/1: Make IPI_CPU_BACKTRACE a "non-secure" SGI") Fixes: 365ec7b17327 ("ARM: add IPI tracepoints") # v3.17 Signed-off-by: Arnd Bergmann arnd@arndb.de Signed-off-by: Masahiro Yamada yamada.masahiro@socionext.com Cc: Heiko Carstens heiko.carstens@de.ibm.com Cc: Arnd Bergmann arnd@arndb.de Cc: Ingo Molnar mingo@redhat.com Cc: Christophe Leroy christophe.leroy@c-s.fr Cc: Mathieu Malaterre malat@debian.org Cc: "H. Peter Anvin" hpa@zytor.com Cc: Thomas Gleixner tglx@linutronix.de Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Paul Mackerras paulus@samba.org Cc: Ralf Baechle ralf@linux-mips.org Cc: Stefan Agner stefan@agner.ch Cc: Boris Brezillon bbrezillon@kernel.org Cc: Miquel Raynal miquel.raynal@bootlin.com Cc: Richard Weinberger richard@nod.at Cc: David Woodhouse dwmw2@infradead.org Cc: Brian Norris computersforpeace@gmail.com Cc: Marek Vasut marek.vasut@gmail.com Cc: Russell King rmk+kernel@arm.linux.org.uk Cc: Borislav Petkov bp@suse.de Cc: Mark Rutland mark.rutland@arm.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- arch/arm/include/asm/hardirq.h | 1 + arch/arm/kernel/smp.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h index cba23eaa60721..7a88f160b1fbe 100644 --- a/arch/arm/include/asm/hardirq.h +++ b/arch/arm/include/asm/hardirq.h @@ -6,6 +6,7 @@ #include <linux/threads.h> #include <asm/irq.h>
+/* number of IPIS _not_ including IPI_CPU_BACKTRACE */ #define NR_IPI 7
typedef struct { diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index a3ce7c5365fa8..bada66ef44193 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -76,6 +76,10 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_IRQ_WORK, IPI_COMPLETION, + /* + * CPU_BACKTRACE is special and not included in NR_IPI + * or tracable with trace_ipi_* + */ IPI_CPU_BACKTRACE, /* * SGI8-15 can be reserved by secure firmware, and thus may @@ -803,7 +807,7 @@ core_initcall(register_cpufreq_notifier);
static void raise_nmi(cpumask_t *mask) { - smp_cross_call(mask, IPI_CPU_BACKTRACE); + __smp_cross_call(mask, IPI_CPU_BACKTRACE); }
void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
From: Jérôme Glisse jglisse@redhat.com
[ Upstream commit 734fb89968900b5c5f8edd5038bd4cdeab8c61d2 ]
To avoid random config build issues, select the MMU notifier when HMM is selected. In any case, when HMM gets selected it will be by users that also want the MMU notifier.
Link: http://lkml.kernel.org/r/20190403193318.16478-2-jglisse@redhat.com Signed-off-by: Jérôme Glisse jglisse@redhat.com Acked-by: Balbir Singh bsingharora@gmail.com Cc: Ralph Campbell rcampbell@nvidia.com Cc: John Hubbard jhubbard@nvidia.com Cc: Dan Williams dan.j.williams@intel.com Cc: Arnd Bergmann arnd@arndb.de Cc: Dan Carpenter dan.carpenter@oracle.com Cc: Ira Weiny ira.weiny@intel.com Cc: Matthew Wilcox willy@infradead.org Cc: Souptick Joarder jrdr.linux@gmail.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7dbd..2e6d24d783f78 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -694,12 +694,12 @@ config DEV_PAGEMAP_OPS
config HMM bool + select MMU_NOTIFIER select MIGRATE_VMA_HELPER
config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" depends on ARCH_HAS_HMM - select MMU_NOTIFIER select HMM help Select HMM_MIRROR if you want to mirror range of the CPU page table of a
From: Mike Kravetz mike.kravetz@oracle.com
[ Upstream commit 0919e1b69ab459e06df45d3ba6658d281962db80 ]
When a huge page is allocated, PagePrivate() is set if the allocation consumed a reservation. When freeing a huge page, PagePrivate is checked. If set, it indicates the reservation should be restored. PagePrivate being set at free huge page time mostly happens on error paths.
When huge page reservations are created, a check is made to determine if the mapping is associated with an explicitly mounted filesystem. If so, pages are also reserved within the filesystem. The default action when freeing a huge page is to decrement the usage count in any associated explicitly mounted filesystem. However, if the reservation is to be restored, the reservation/use count within the filesystem should not be decremented. Otherwise, a subsequent page allocation and free for the same mapping location will cause the filesystem usage to go 'negative'.
Filesystem      Size   Used  Avail Use% Mounted on
nodev           4.0G  -4.0M   4.1G    - /opt/hugepool
To fix, when freeing a huge page do not adjust filesystem usage if PagePrivate() is set to indicate the reservation should be restored.
I did not cc stable as the problem has been around since reserves were added to hugetlbfs and nobody has noticed.
Link: http://lkml.kernel.org/r/20190328234704.27083-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz mike.kravetz@oracle.com Reviewed-by: Naoya Horiguchi n-horiguchi@ah.jp.nec.com Cc: Davidlohr Bueso dave@stgolabs.net Cc: Joonsoo Kim iamjoonsoo.kim@lge.com Cc: Michal Hocko mhocko@kernel.org Cc: "Kirill A . Shutemov" kirill.shutemov@linux.intel.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/hugetlb.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c161069bfdbc9..3880deb5f8a4c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1257,12 +1257,23 @@ void free_huge_page(struct page *page) ClearPagePrivate(page);
/* - * A return code of zero implies that the subpool will be under its - * minimum size if the reservation is not restored after page is free. - * Therefore, force restore_reserve operation. + * If PagePrivate() was set on page, page allocation consumed a + * reservation. If the page was associated with a subpool, there + * would have been a page reserved in the subpool before allocation + * via hugepage_subpool_get_pages(). Since we are 'restoring' the + * reservtion, do not call hugepage_subpool_put_pages() as this will + * remove the reserved page from the subpool. */ - if (hugepage_subpool_put_pages(spool, 1) == 0) - restore_reserve = true; + if (!restore_reserve) { + /* + * A return code of zero implies that the subpool will be + * under its minimum size if the reservation is not restored + * after page is free. Therefore, force restore_reserve + * operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + }
spin_lock(&hugetlb_lock); clear_page_huge_active(page);
From: David Hildenbrand david@redhat.com
[ Upstream commit d9eb1417c77df7ce19abd2e41619e9dceccbdf2a ]
Patch series "mm/memory_hotplug: Better error handling when removing memory", v1.
Error handling when removing memory is somewhat messed up right now. Some errors result in warnings, others are completely ignored. Memory unplug code can essentially not deal with errors properly as of now. remove_memory() will never fail.
We have basically two choices:
1. Allow arch_remove_memory() and friends to fail, propagating errors via remove_memory(). Might be problematic (e.g. DIMMs consisting of multiple pieces added/removed separately).
2. Don't allow the functions to fail, handling errors in a nicer way.
It seems like most errors that can theoretically happen are really corner cases and mostly theoretical (e.g. "section not valid"). However e.g. aborting removal of sections while all callers simply continue in case of errors is not nice.
If we can guarantee that removal of memory always works (and WARN/skip in case of theoretical errors so we can figure out what is going on), we can go ahead and implement better error handling when adding memory.
E.g. via add_memory():
	arch_add_memory()
	ret = do_stuff()
	if (ret) {
		arch_remove_memory();
		goto error;
	}
Handling here that arch_remove_memory() might fail is basically impossible. So I suggest, let's avoid reporting errors while removing memory, warning on theoretical errors instead and continuing instead of aborting.
This patch (of 4):
__add_pages() doesn't add the memory resource, so __remove_pages() shouldn't remove it. Let's factor it out. Especially as it is a special case for memory used as system memory, added via add_memory() and friends.
We now remove the resource after removing the sections instead of doing it the other way around. I don't think this change is problematic.
add_memory()
	register memory resource
	arch_add_memory()

remove_memory
	arch_remove_memory()
	release memory resource
While at it, explain why we ignore errors and that this can only happen if we remove memory in a different granularity than it was added in.
[david@redhat.com: fix printk warning] Link: http://lkml.kernel.org/r/20190417120204.6997-1-david@redhat.com Link: http://lkml.kernel.org/r/20190409100148.24703-2-david@redhat.com Signed-off-by: David Hildenbrand david@redhat.com Reviewed-by: Oscar Salvador osalvador@suse.de Cc: Michal Hocko mhocko@suse.com Cc: David Hildenbrand david@redhat.com Cc: Pavel Tatashin pasha.tatashin@soleen.com Cc: Wei Yang richard.weiyang@gmail.com Cc: Qian Cai cai@lca.pw Cc: Arun KS arunks@codeaurora.org Cc: Mathieu Malaterre malat@debian.org Cc: Andrew Banman andrew.banman@hpe.com Cc: Andy Lutomirski luto@kernel.org Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Borislav Petkov bp@alien8.de Cc: Christophe Leroy christophe.leroy@c-s.fr Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Fenghua Yu fenghua.yu@intel.com Cc: Geert Uytterhoeven geert@linux-m68k.org Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Heiko Carstens heiko.carstens@de.ibm.com Cc: "H. Peter Anvin" hpa@zytor.com Cc: Ingo Molnar mingo@kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Joonsoo Kim iamjoonsoo.kim@lge.com Cc: "Kirill A. Shutemov" kirill.shutemov@linux.intel.com Cc: Martin Schwidefsky schwidefsky@de.ibm.com Cc: Masahiro Yamada yamada.masahiro@socionext.com Cc: Michael Ellerman mpe@ellerman.id.au Cc: Mike Rapoport rppt@linux.ibm.com Cc: Mike Travis mike.travis@hpe.com Cc: Nicholas Piggin npiggin@gmail.com Cc: Oscar Salvador osalvador@suse.com Cc: Paul Mackerras paulus@samba.org Cc: Peter Zijlstra peterz@infradead.org Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Rich Felker dalias@libc.org Cc: Rob Herring robh@kernel.org Cc: Stefan Agner stefan@agner.ch Cc: Thomas Gleixner tglx@linutronix.de Cc: Tony Luck tony.luck@intel.com Cc: Vasily Gorbik gor@linux.ibm.com Cc: Yoshinori Sato ysato@users.sourceforge.jp Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/memory_hotplug.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7493f50ee8800..e06e7a89d0e5b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -559,20 +559,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, if (is_dev_zone(zone)) { if (altmap) map_offset = vmem_altmap_offset(altmap); - } else { - resource_size_t start, size; - - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; - - ret = release_mem_region_adjustable(&iomem_resource, start, - size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } }
clear_zone_contiguous(zone); @@ -1828,6 +1814,26 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node);
+static void __release_memory_resource(resource_size_t start, + resource_size_t size) +{ + int ret; + + /* + * When removing memory in the same granularity as it was added, + * this function never fails. It might only fail if resources + * have to be adjusted or split. We'll ignore the error, as + * removing of memory cannot fail. + */ + ret = release_mem_region_adjustable(&iomem_resource, start, size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } +} + /** * remove_memory * @nid: the node ID @@ -1862,6 +1868,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size) memblock_remove(start, size);
arch_remove_memory(nid, start, size, NULL); + __release_memory_resource(start, size);
try_offline_node(nid);
From: Linxu Fang fanglinxu@huawei.com
[ Upstream commit 299c83dce9ea3a79bb4b5511d2cb996b6b8e5111 ]
342332e6a925 ("mm/page_alloc.c: introduce kernelcore=mirror option") and later patches rewrote the calculation of node spanned pages.
e506b99696a2 ("mem-hotplug: fix node spanned pages when we have a movable node") followed, but the current code still has problems:
When we have a node with only ZONE_MOVABLE and the node id is not zero, the node's spanned pages are counted twice.
That's because we have an empty normal zone, and zone_start_pfn or zone_end_pfn is not between arch_zone_lowest_possible_pfn and arch_zone_highest_possible_pfn, so we need to use clamp() to constrain the range, just like commit 96e907d13602 ("bootmem: Reimplement __absent_pages_in_range() using for_each_mem_pfn_range()").
e.g.
Zone ranges:
  DMA      [mem 0x0000000000001000-0x0000000000ffffff]
  DMA32    [mem 0x0000000001000000-0x00000000ffffffff]
  Normal   [mem 0x0000000100000000-0x000000023fffffff]
Movable zone start for each node
  Node 0: 0x0000000100000000
  Node 1: 0x0000000140000000
Early memory node ranges
  node 0: [mem 0x0000000000001000-0x000000000009efff]
  node 0: [mem 0x0000000000100000-0x00000000bffdffff]
  node 0: [mem 0x0000000100000000-0x000000013fffffff]
  node 1: [mem 0x0000000140000000-0x000000023fffffff]
node 0 DMA     spanned:0xfff    present:0xf9e    absent:0x61
node 0 DMA32   spanned:0xff000  present:0xbefe0  absent:0x40020
node 0 Normal  spanned:0        present:0        absent:0
node 0 Movable spanned:0x40000  present:0x40000  absent:0
On node 0 totalpages(node_present_pages): 1048446 node_spanned_pages: 1310719
node 1 DMA     spanned:0        present:0        absent:0
node 1 DMA32   spanned:0        present:0        absent:0
node 1 Normal  spanned:0x100000 present:0x100000 absent:0
node 1 Movable spanned:0x100000 present:0x100000 absent:0
On node 1 totalpages(node_present_pages): 2097152 node_spanned_pages: 2097152
Memory: 6967796K/12582392K available (16388K kernel code, 3686K rwdata, 4468K rodata, 2160K init, 10444K bss, 5614596K reserved, 0K cma-reserved)
It shows that the current memory of node 1 is double added. After this patch, the problem is fixed.
node 0 DMA     spanned:0xfff    present:0xf9e    absent:0x61
node 0 DMA32   spanned:0xff000  present:0xbefe0  absent:0x40020
node 0 Normal  spanned:0        present:0        absent:0
node 0 Movable spanned:0x40000  present:0x40000  absent:0
On node 0 totalpages(node_present_pages): 1048446 node_spanned_pages: 1310719
node 1 DMA     spanned:0        present:0        absent:0
node 1 DMA32   spanned:0        present:0        absent:0
node 1 Normal  spanned:0        present:0        absent:0
node 1 Movable spanned:0x100000 present:0x100000 absent:0
On node 1 totalpages(node_present_pages): 1048576 node_spanned_pages: 1048576
memory: 6967796K/8388088K available (16388K kernel code, 3686K rwdata, 4468K rodata, 2160K init, 10444K bss, 1420292K reserved, 0K cma-reserved)
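A small sketch of the clamp logic the patch adds (illustrative values only, not the kernel code): the zone boundaries are constrained to the node's own PFN range, so a zone that does not overlap the node ends up with a zero span instead of inheriting the full arch zone range.

#include <stdio.h>

#define clamp(v, lo, hi) ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))

int main(void)
{
	/* Arch range of some zone, and a node that lies entirely above it. */
	unsigned long zone_low = 0x1000, zone_high = 0x100000;
	unsigned long node_start_pfn = 0x140000, node_end_pfn = 0x240000;

	unsigned long zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
	unsigned long zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);

	/* Before the fix the empty zone could keep a non-zero span here,
	 * and the node's pages ended up counted twice. */
	printf("clamped span: %lu pages\n", zone_end_pfn - zone_start_pfn); /* 0 */
	return 0;
}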
Link: http://lkml.kernel.org/r/1554178276-10372-1-git-send-email-fanglinxu@huawei.... Signed-off-by: Linxu Fang fanglinxu@huawei.com Cc: Taku Izumi izumi.taku@jp.fujitsu.com Cc: Xishi Qiu qiuxishi@huawei.com Cc: Michal Hocko mhocko@suse.com Cc: Vlastimil Babka vbabka@suse.cz Cc: Pavel Tatashin pavel.tatashin@microsoft.com Cc: Oscar Salvador osalvador@suse.de Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d59be95ba45cf..140acb4b69aae 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6182,13 +6182,15 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long *zone_end_pfn, unsigned long *ignored) { + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0;
/* Get the start and end of the zone */ - *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, zone_start_pfn, zone_end_pfn);
From: Yue Hu huyue2@yulong.com
[ Upstream commit 1df3a339074e31db95c4790ea9236874b13ccd87 ]
f022d8cb7ec7 ("mm: cma: Don't crash on allocation if CMA area can't be activated") fixes the crash on allocation when activation fails by setting cma->count to 0; the same logic is needed if the bitmap allocation fails.
Link: http://lkml.kernel.org/r/20190325081309.6004-1-zbestahu@gmail.com Signed-off-by: Yue Hu huyue2@yulong.com Reviewed-by: Anshuman Khandual anshuman.khandual@arm.com Cc: Joonsoo Kim iamjoonsoo.kim@lge.com Cc: Laura Abbott labbott@redhat.com Cc: Mike Rapoport rppt@linux.vnet.ibm.com Cc: Randy Dunlap rdunlap@infradead.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/cma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mm/cma.c b/mm/cma.c index f4f3a8a57d862..f160ce31ef469 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -106,8 +106,10 @@ static int __init cma_activate_area(struct cma *cma)
cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!cma->bitmap) + if (!cma->bitmap) { + cma->count = 0; return -ENOMEM; + }
WARN_ON_ONCE(!pfn_valid(pfn)); zone = page_zone(pfn_to_page(pfn));
From: Christoph Hellwig hch@lst.de
[ Upstream commit 54c7a8916a887f357088f99e9c3a7720cd57d2c8 ]
Patch series "initramfs tidyups".
I've spent some time chasing down behavior in initramfs and found plenty of opportunity to improve the code. A first stab at that is contained in this series.
This patch (of 7):
We free the initrd memory for all successful or error cases except for the case where opening /initrd.image fails, which looks like an oversight.
Steven said:
: This also changes the behaviour when CONFIG_INITRAMFS_FORCE is enabled : - specifically it means that the initrd is freed (previously it was : ignored and never freed). But that seems like reasonable behaviour and : the previous behaviour looks like another oversight.
Link: http://lkml.kernel.org/r/20190213174621.29297-3-hch@lst.de Signed-off-by: Christoph Hellwig hch@lst.de Reviewed-by: Steven Price steven.price@arm.com Acked-by: Mike Rapoport rppt@linux.ibm.com Cc: Catalin Marinas catalin.marinas@arm.com [arm64] Cc: Geert Uytterhoeven geert@linux-m68k.org [m68k] Cc: Alexander Viro viro@zeniv.linux.org.uk Cc: Russell King linux@armlinux.org.uk Cc: Will Deacon will.deacon@arm.com Cc: Guan Xuetao gxt@pku.edu.cn Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- init/initramfs.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-)
diff --git a/init/initramfs.c b/init/initramfs.c index fca899622937f..ccc88aacc8649 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -612,13 +612,12 @@ static int __init populate_rootfs(void) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); - if (!err) { - free_initrd(); + if (!err) goto done; - } else { - clean_rootfs(); - unpack_to_rootfs(__initramfs_start, __initramfs_size); - } + + clean_rootfs(); + unpack_to_rootfs(__initramfs_start, __initramfs_size); + printk(KERN_INFO "rootfs image is not initramfs (%s)" "; looks like an initrd\n", err); fd = ksys_open("/initrd.image", @@ -632,7 +631,6 @@ static int __init populate_rootfs(void) written, initrd_end - initrd_start);
ksys_close(fd); - free_initrd(); } done: /* empty statement */; @@ -642,9 +640,9 @@ static int __init populate_rootfs(void) initrd_end - initrd_start); if (err) printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); - free_initrd(); #endif } + free_initrd(); flush_delayed_fput(); return 0; }
From: Baoquan He bhe@redhat.com
[ Upstream commit d3ba3ae19751e476b0840a0c9a673a5766fa3219 ]
In node_states_check_changes_online(), N_HIGH_MEMORY is used in place of ZONE_HIGHMEM directly. This is not right: N_HIGH_MEMORY marks the memory state of a node, whereas a zone index is being checked here, so it should be compared with ZONE_HIGHMEM instead.
Replace it with ZONE_HIGHMEM.
This is a code cleanup - no known runtime effects.
Link: http://lkml.kernel.org/r/20190320080732.14933-1-bhe@redhat.com Fixes: 8efe33f40f3e ("mm/memory_hotplug.c: simplify node_states_check_changes_online") Signed-off-by: Baoquan He bhe@redhat.com Reviewed-by: David Hildenbrand david@redhat.com Acked-by: Michal Hocko mhocko@suse.com Reviewed-by: Oscar Salvador osalvador@suse.de Cc: Wei Yang richard.weiyang@gmail.com Cc: Mike Rapoport rppt@linux.ibm.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e06e7a89d0e5b..33f1b8d307e64 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -684,7 +684,7 @@ static void node_states_check_changes_online(unsigned long nr_pages, if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; #ifdef CONFIG_HIGHMEM - if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) + if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY)) arg->status_change_nid_high = nid; #endif }
From: Yue Hu huyue2@yulong.com
[ Upstream commit 2b59e01a3aa665f751d1410b99fae9336bd424e1 ]
Currently, one bit in the CMA bitmap represents a number of pages rather than one page, and cma->count is the CMA size in pages. So to find available pages via find_next_zero_bit()/find_next_bit() we should use the CMA size in bits, not in pages. The reported number of free pages is only correct at the moment because order_per_bit is zero; once order_per_bit is changed, the bitmap status will be incorrect.
The size passed to cma_debug_show_areas() is therefore not correct, and it misreports the available pages at some positions when debugging allocation failures.
This is an example with order_per_bit = 1
Before this change: [ 4.120060] cma: number of available pages: 1@93+4@108+7@121+7@137+7@153+7@169+7@185+7@201+3@213+3@221+3@229+3@237+3@245+3@253+3@261+3@269+3@277+3@285+3@293+3@301+3@309+3@317+3@325+19@333+15@369+512@512=> 638 free of 1024 total pages
After this change: [ 4.143234] cma: number of available pages: 2@93+8@108+14@121+14@137+14@153+14@169+14@185+14@201+6@213+6@221+6@229+6@237+6@245+6@253+6@261+6@269+6@277+6@285+6@293+6@301+6@309+6@317+6@325+38@333+30@369=> 252 free of 1024 total pages
Obviously the bitmap status before is incorrect.
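A small sketch of the bit/page relationship involved (illustrative only, not the kernel helpers): one bitmap bit covers 2^order_per_bit pages, so scans must run over the number of bits and bit counts must be shifted to get pages.

#include <stdio.h>

int main(void)
{
	unsigned long cma_count = 1024;		/* CMA size in pages */
	unsigned int order_per_bit = 1;		/* one bit covers 2 pages */

	unsigned long nbits = cma_count >> order_per_bit;	/* bitmap length in bits */
	unsigned long free_bits = 126;				/* e.g. result of a bitmap scan */
	unsigned long free_pages = free_bits << order_per_bit;

	printf("scan %lu bits, %lu free bits -> %lu free pages\n",
	       nbits, free_bits, free_pages);
	return 0;
}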
Link: http://lkml.kernel.org/r/20190320060829.9144-1-zbestahu@gmail.com Signed-off-by: Yue Hu huyue2@yulong.com Reviewed-by: Andrew Morton akpm@linux-foundation.org Cc: Joonsoo Kim iamjoonsoo.kim@lge.com Cc: Ingo Molnar mingo@kernel.org Cc: Vlastimil Babka vbabka@suse.cz Cc: Mike Rapoport rppt@linux.vnet.ibm.com Cc: Randy Dunlap rdunlap@infradead.org Cc: Laura Abbott labbott@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/cma.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/mm/cma.c b/mm/cma.c index f160ce31ef469..0b6d6c63bcef5 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -371,23 +371,26 @@ int __init cma_declare_contiguous(phys_addr_t base, #ifdef CONFIG_CMA_DEBUG static void cma_debug_show_areas(struct cma *cma) { - unsigned long next_zero_bit, next_set_bit; + unsigned long next_zero_bit, next_set_bit, nr_zero; unsigned long start = 0; - unsigned int nr_zero, nr_total = 0; + unsigned long nr_part, nr_total = 0; + unsigned long nbits = cma_bitmap_maxno(cma);
mutex_lock(&cma->lock); pr_info("number of available pages: "); for (;;) { - next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start); - if (next_zero_bit >= cma->count) + next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); + if (next_zero_bit >= nbits) break; - next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit); + next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit); nr_zero = next_set_bit - next_zero_bit; - pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit); - nr_total += nr_zero; + nr_part = nr_zero << cma->order_per_bit; + pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part, + next_zero_bit); + nr_total += nr_part; start = next_zero_bit + nr_zero; } - pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count); + pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); mutex_unlock(&cma->lock); } #else
From: "Aneesh Kumar K.V" aneesh.kumar@linux.ibm.com
[ Upstream commit 024eee0e83f0df52317be607ca521e0fc572aa07 ]
MADV_DONTNEED is handled with mmap_sem taken in read mode. We call page_mkclean without holding mmap_sem.
MADV_DONTNEED implies that pages in the region are unmapped and subsequent access to the pages in that range is handled as a new page fault. This implies that if we don't have parallel access to the region when MADV_DONTNEED is run, we expect those ranges to be unallocated.
W.r.t. page_mkclean(), we need to make sure that we don't break the MADV_DONTNEED semantics. MADV_DONTNEED checks for pmd_none without holding pmd_lock. This implies we would skip the pmd if we temporarily marked the pmd none. Avoid doing that while marking the page clean.
Keep the sequence the same for DAX too, even though we don't support MADV_DONTNEED for DAX mappings.
The bug was noticed by code review and I didn't observe any failures w.r.t test run. This is similar to
commit 58ceeb6bec86d9140f9d91d71a710e963523d063
Author: Kirill A. Shutemov kirill.shutemov@linux.intel.com
Date:   Thu Apr 13 14:56:26 2017 -0700
thp: fix MADV_DONTNEED vs. MADV_FREE race
commit ced108037c2aa542b3ed8b7afd1576064ad1362a
Author: Kirill A. Shutemov kirill.shutemov@linux.intel.com
Date:   Thu Apr 13 14:56:20 2017 -0700
thp: fix MADV_DONTNEED vs. numa balancing race
Link: http://lkml.kernel.org/r/20190321040610.14226-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V aneesh.kumar@linux.ibm.com Reviewed-by: Andrew Morton akpm@linux-foundation.org Cc: Dan Williams dan.j.williams@intel.com Cc:"Kirill A . Shutemov" kirill@shutemov.name Cc: Andrea Arcangeli aarcange@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/dax.c | 2 +- mm/rmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c index 8eb3e8c2b4bdc..163ebd6cc0d1c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -814,7 +814,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, goto unlock_pmd;
flush_cache_page(vma, address, pfn); - pmd = pmdp_huge_clear_flush(vma, address, pmdp); + pmd = pmdp_invalidate(vma, address, pmdp); pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); diff --git a/mm/rmap.c b/mm/rmap.c index 0454ecc29537a..e0710b258c417 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -928,7 +928,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, continue;
flush_cache_page(vma, address, page_to_pfn(page)); - entry = pmdp_huge_clear_flush(vma, address, pmd); + entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry);
From: Yue Hu huyue2@yulong.com
[ Upstream commit f0fd50504a54f5548eb666dc16ddf8394e44e4b7 ]
If find_next_zero_bit() finds no zero bit, it returns the size parameter passed in, so the start bit should be compared with bitmap_maxno rather than cma->count. Although getting maxchunk currently works fine because order_per_bit is zero, the loop will get stuck if order_per_bit is set to a non-zero value.
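For illustration, a minimal userspace sketch of the termination issue (the helper here is a stand-in, not the kernel's find_next_zero_bit()): a scan that returns its size argument when nothing is found must be checked against that same size.

#include <stdio.h>

/* Stand-in for find_next_zero_bit(): returns 'size' when nothing is found. */
static unsigned long find_next_zero(const unsigned char *map,
				    unsigned long size, unsigned long start)
{
	for (unsigned long i = start; i < size; i++)
		if (!map[i])
			return i;
	return size;
}

int main(void)
{
	unsigned char bitmap[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; /* fully used */
	unsigned long bitmap_maxno = 8;	/* bits */
	unsigned long cma_count = 16;	/* pages (order_per_bit = 1) */

	unsigned long pos = find_next_zero(bitmap, bitmap_maxno, 0);

	/* Comparing pos against cma_count (16) would never be true: the loop spins. */
	if (pos >= bitmap_maxno)
		puts("no free chunk, loop terminates correctly");
	(void)cma_count;
	return 0;
}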
Link: http://lkml.kernel.org/r/20190319092734.276-1-zbestahu@gmail.com Signed-off-by: Yue Hu huyue2@yulong.com Reviewed-by: Andrew Morton akpm@linux-foundation.org Cc: Michal Hocko mhocko@suse.com Cc: Joe Perches joe@perches.com Cc: David Rientjes rientjes@google.com Cc: Dmitry Safonov d.safonov@partner.samsung.com Cc: Joonsoo Kim iamjoonsoo.kim@lge.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/cma_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/cma_debug.c b/mm/cma_debug.c index ad6723e9d110a..3e0415076cc9e 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -58,7 +58,7 @@ static int cma_maxchunk_get(void *data, u64 *val) mutex_lock(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); - if (start >= cma->count) + if (start >= bitmap_maxno) break; end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk);
From: Qian Cai cai@lca.pw
[ Upstream commit 745e10146c31b1c6ed3326286704ae251b17f663 ]
"cat /proc/slab_allocators" could hang forever on SMP machines with kmemleak or object debugging enabled due to other CPUs running do_drain() will keep making kmemleak_object or debug_objects_cache dirty and unable to escape the first loop in leaks_show(),
do {
	set_store_user_clean(cachep);
	drain_cpu_caches(cachep);
	...
} while (!is_store_user_clean(cachep));
For example,
do_drain
  slabs_destroy
  slab_destroy
  kmem_cache_free
  __cache_free
  ___cache_free
  kmemleak_free_recursive
  delete_object_full
  __delete_object
  put_object
  free_object_rcu
  kmem_cache_free
  cache_free_debugcheck --> dirty kmemleak_object
One approach is to check cachep->name and skip both kmemleak_object and debug_objects_cache in leaks_show(). The other is to set store_user_clean after drain_cpu_caches(), which leaves a small window between drain_cpu_caches() and set_store_user_clean() where the per-CPU caches could become dirty again, leading to slightly wrong information being stored, but it also speeds things up significantly, which sounds like a good compromise. For example,
# cat /proc/slab_allocators
0m42.778s   # 1st approach
0m0.737s    # 2nd approach
[akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/20190411032635.10325-1-cai@lca.pw Fixes: d31676dfde25 ("mm/slab: alternative implementation for DEBUG_SLAB_LEAK") Signed-off-by: Qian Cai cai@lca.pw Reviewed-by: Andrew Morton akpm@linux-foundation.org Cc: Vlastimil Babka vbabka@suse.cz Cc: Christoph Lameter cl@linux.com Cc: Pekka Enberg penberg@kernel.org Cc: David Rientjes rientjes@google.com Cc: Joonsoo Kim iamjoonsoo.kim@lge.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/slab.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/mm/slab.c b/mm/slab.c index f4bbc53008f3b..932a439149cdb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4317,8 +4317,12 @@ static int leaks_show(struct seq_file *m, void *p) * whole processing. */ do { - set_store_user_clean(cachep); drain_cpu_caches(cachep); + /* + * drain_cpu_caches() could make kmemleak_object and + * debug_objects_cache dirty, so reset afterwards. + */ + set_store_user_clean(cachep);
x[1] = 0;
From: Cyrill Gorcunov gorcunov@gmail.com
[ Upstream commit a9e73998f9d705c94a8dca9687633adc0f24a19a ]
While validating a new map we require @start_data to be strictly less than @end_data, which is fine for regular applications (this is why this nit didn't trigger for that long). These members are set by executable loaders such as the ELF handler; still, it is perfectly valid to have a loadable data section with zero size in the file, in which case start_data is equal to end_data once the kernel loader finishes.
As a result when we're trying to restore such programs the procedure fails and the kernel returns -EINVAL. From the image dump of a program:
| "mm_start_code": "0x400000", | "mm_end_code": "0x8f5fb4", | "mm_start_data": "0xf1bfb0", | "mm_end_data": "0xf1bfb0",
Thus we need to change validate_prctl_map() to use a less-or-equal comparison here instead of a strictly-less one.
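A minimal sketch of the relaxed check (illustrative, using the values from the image dump above; not the kernel's __prctl_check_order macro): a zero-sized data section makes start_data equal to end_data, which must now pass validation.

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for the ordering check in validate_prctl_map(). */
static int check_order_le(unsigned long a, unsigned long b)
{
	return a <= b ? 0 : -EINVAL;
}

int main(void)
{
	unsigned long start_data = 0xf1bfb0, end_data = 0xf1bfb0;

	/* The old strictly-less check rejected this map with -EINVAL. */
	printf("validate: %d\n", check_order_le(start_data, end_data)); /* 0 */
	return 0;
}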
Link: http://lkml.kernel.org/r/20190408143554.GY1421@uranus.lan Fixes: f606b77f1a9e3 ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation") Signed-off-by: Cyrill Gorcunov gorcunov@gmail.com Cc: Andrey Vagin avagin@gmail.com Cc: Dmitry Safonov 0x7f454c46@gmail.com Cc: Pavel Emelyanov xemul@virtuozzo.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sys.c b/kernel/sys.c index f7eb62eceb243..82ba11e6d2e50 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1923,7 +1923,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end);
From: Jiada Wang jiada_wang@mentor.com
[ Upstream commit 63f55fcea50c25ae5ad45af92d08dae3b84534c2 ]
Currently the IRQ remains enabled after .remove. If the device is probed again later, the IRQ is requested before .thermal_init, which may cause the IRQ handler to be called before the device is initialized.

This patch disables the interrupt in .remove, to ensure the IRQ handler is only called after the device is fully initialized.
Signed-off-by: Jiada Wang jiada_wang@mentor.com Reviewed-by: Simon Horman horms+renesas@verge.net.au Reviewed-by: Daniel Lezcano daniel.lezcano@linaro.org Signed-off-by: Eduardo Valentin edubezval@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/thermal/rcar_gen3_thermal.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/thermal/rcar_gen3_thermal.c b/drivers/thermal/rcar_gen3_thermal.c index 75786cc8e2f91..0a0562551611b 100644 --- a/drivers/thermal/rcar_gen3_thermal.c +++ b/drivers/thermal/rcar_gen3_thermal.c @@ -330,6 +330,9 @@ MODULE_DEVICE_TABLE(of, rcar_gen3_thermal_dt_ids); static int rcar_gen3_thermal_remove(struct platform_device *pdev) { struct device *dev = &pdev->dev; + struct rcar_gen3_thermal_priv *priv = dev_get_drvdata(dev); + + rcar_thermal_irq_set(priv, false);
pm_runtime_put(dev); pm_runtime_disable(dev);
From: Amit Kucheria amit.kucheria@linaro.org
[ Upstream commit fc7d18cf6a923cde7f5e7ba2c1105bb106d3e29a ]
We print a calibration failure message on -EPROBE_DEFER from nvmem/qfprom as follows:

[ 3.003090] qcom-tsens 4a9000.thermal-sensor: version: 1.4
[ 3.005376] qcom-tsens 4a9000.thermal-sensor: tsens calibration failed
[ 3.113248] qcom-tsens 4a9000.thermal-sensor: version: 1.4
This confuses people when, in fact, calibration succeeds later once the nvmem/qfprom device is available. Don't print this message on -EPROBE_DEFER.
Signed-off-by: Amit Kucheria amit.kucheria@linaro.org Signed-off-by: Eduardo Valentin edubezval@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/thermal/qcom/tsens.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index f1ec9bbe4717e..54b2c0e3c4f49 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -160,7 +160,8 @@ static int tsens_probe(struct platform_device *pdev) if (tmdev->ops->calibrate) { ret = tmdev->ops->calibrate(tmdev); if (ret < 0) { - dev_err(dev, "tsens calibration failed\n"); + if (ret != -EPROBE_DEFER) + dev_err(dev, "tsens calibration failed\n"); return ret; } }
From: Daniel Gomez dagmcr@gmail.com
[ Upstream commit 9e364e87ad7f2c636276c773d718cda29d62b741 ]
MODULE_DEVICE_TABLE(of, <of_match_table>) should be called to complete the DT OF matching mechanism and register the table.
Before this patch:
modinfo drivers/mfd/tps65912-spi.ko | grep alias
alias: spi:tps65912

After this patch:
modinfo drivers/mfd/tps65912-spi.ko | grep alias
alias: of:N*T*Cti,tps65912C*
alias: of:N*T*Cti,tps65912
alias: spi:tps65912
Reported-by: Javier Martinez Canillas javier@dowhile0.org Signed-off-by: Daniel Gomez dagmcr@gmail.com Signed-off-by: Lee Jones lee.jones@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/mfd/tps65912-spi.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/mfd/tps65912-spi.c b/drivers/mfd/tps65912-spi.c index 3bd75061f7776..f78be039e4637 100644 --- a/drivers/mfd/tps65912-spi.c +++ b/drivers/mfd/tps65912-spi.c @@ -27,6 +27,7 @@ static const struct of_device_id tps65912_spi_of_match_table[] = { { .compatible = "ti,tps65912", }, { /* sentinel */ } }; +MODULE_DEVICE_TABLE(of, tps65912_spi_of_match_table);
static int tps65912_spi_probe(struct spi_device *spi) {
From: Binbin Wu binbin.wu@intel.com
[ Upstream commit dad06532292d77f37fbe831a02948a593500f682 ]
In a virtualized setup, an interrupt storm is seen when the system reboots due to a warm reset.
Call Trace: <IRQ> dump_stack+0x70/0xa5 __report_bad_irq+0x2e/0xc0 note_interrupt+0x248/0x290 ? add_interrupt_randomness+0x30/0x220 handle_irq_event_percpu+0x54/0x80 handle_irq_event+0x39/0x60 handle_fasteoi_irq+0x91/0x150 handle_irq+0x108/0x180 do_IRQ+0x52/0xf0 common_interrupt+0xf/0xf </IRQ> RIP: 0033:0x76fc2cfabc1d Code: 24 28 bf 03 00 00 00 31 c0 48 8d 35 63 77 0e 00 48 8d 15 2e 94 0e 00 4c 89 f9 49 89 d9 4c 89 d3 e8 b8 e2 01 00 48 8b 54 24 18 <48> 89 ef 48 89 de 4c 89 e1 e8 d5 97 01 00 84 c0 74 2d 48 8b 04 24 RSP: 002b:00007ffd247c1fc0 EFLAGS: 00000293 ORIG_RAX: ffffffffffffffda RAX: 0000000000000000 RBX: 00007ffd247c1ff0 RCX: 000000000003d3ce RDX: 0000000000000000 RSI: 00007ffd247c1ff0 RDI: 000076fc2cbb6010 RBP: 000076fc2cded010 R08: 00007ffd247c2210 R09: 00007ffd247c22a0 R10: 000076fc29465470 R11: 0000000000000000 R12: 00007ffd247c1fc0 R13: 000076fc2ce8e470 R14: 000076fc27ec9960 R15: 0000000000000414 handlers: [<000000000d3fa913>] idma64_irq Disabling IRQ #27
To avoid the interrupt storm, put the device in the reset state before bringing it out of reset.
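Reflowed from the diff below, intel_lpss_init_dev() now asserts reset before deasserting it:

    static void intel_lpss_init_dev(const struct intel_lpss *lpss)
    {
        u32 value = LPSS_PRIV_SSP_REG_DIS_DMA_FIN;

        /* Set the device in reset state */
        writel(0, lpss->priv + LPSS_PRIV_RESETS);

        intel_lpss_deassert_reset(lpss);
        /* ... remap-address and DMA setup continue as before ... */
    }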
Changelog v2: - correct the subject line by adding "mfd: "
Signed-off-by: Binbin Wu binbin.wu@intel.com Acked-by: Mika Westerberg mika.westerberg@linux.intel.com Reviewed-by: Andy Shevchenko andriy.shevchenko@linux.intel.com Signed-off-by: Lee Jones lee.jones@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/mfd/intel-lpss.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index 50bffc3382d77..ff3fba16e7359 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -273,6 +273,9 @@ static void intel_lpss_init_dev(const struct intel_lpss *lpss) { u32 value = LPSS_PRIV_SSP_REG_DIS_DMA_FIN;
+ /* Set the device in reset state */ + writel(0, lpss->priv + LPSS_PRIV_RESETS); + intel_lpss_deassert_reset(lpss);
intel_lpss_set_remap_addr(lpss);
From: Ben Skeggs bskeggs@redhat.com
[ Upstream commit 13d03e9daf70dab032c03dc172e75bb98ad899c4 ]
Where possible, we want the failsafe link configuration (one which won't hang the OR during modeset because of not enough bandwidth for the mode) to also be supported by the sink.
This prevents "link rate unsupported by sink" messages when link training fails.
Signed-off-by: Ben Skeggs bskeggs@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c index 5f301e632599b..818d21bd28d31 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c @@ -365,8 +365,15 @@ nvkm_dp_train(struct nvkm_dp *dp, u32 dataKBps) * and it's better to have a failed modeset than that. */ for (cfg = nvkm_dp_rates; cfg->rate; cfg++) { - if (cfg->nr <= outp_nr && cfg->nr <= outp_bw) - failsafe = cfg; + if (cfg->nr <= outp_nr && cfg->nr <= outp_bw) { + /* Try to respect sink limits too when selecting + * lowest link configuration. + */ + if (!failsafe || + (cfg->nr <= sink_nr && cfg->bw <= sink_bw)) + failsafe = cfg; + } + if (failsafe && cfg[1].rate < dataKBps) break; }
From: Tony Lindgren tony@atomide.com
[ Upstream commit 48171d0ea7caccf21c9ee3ae75eb370f2a756062 ]
I noticed that we can get -EREMOTEIO errors on at least omap4 duovero:
twl6040 0-004b: Failed to write 2d = 19: -121
And then any following register access will produce errors.
The 2d offset above is register ACCCTL, which gets written on twl6040 powerup. With error checking added to the related regcache_sync() call, the -EREMOTEIO error is reproducible on twl6040 powerup, at least on duovero.
To fix the error, we need to wait until twl6040 is accessible after the powerup. Based on tests on omap4 duovero, we need to wait over 8ms after powerup before register writes complete without failures. Let's also make sure we warn about possible errors.
Note that we have twl6040_patch[] reg_sequence with the ACCCTL register configuration and regcache_sync() will write the new value to ACCCTL.
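Reflowed from the diff below, the powerup path now waits before syncing the register cache and checks the result:

    /*
     * Register access can produce errors after power-up unless we
     * wait at least 8ms based on measurements on duovero.
     */
    usleep_range(10000, 12000);

    /* Sync with the HW */
    ret = regcache_sync(twl6040->regmap);
    if (ret) {
        dev_err(twl6040->dev, "Failed to sync with the HW: %i\n", ret);
        goto out;
    }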
Signed-off-by: Tony Lindgren tony@atomide.com Acked-by: Peter Ujfalusi peter.ujfalusi@ti.com Signed-off-by: Lee Jones lee.jones@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/mfd/twl6040.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/mfd/twl6040.c b/drivers/mfd/twl6040.c index 7c3c5fd5fcd04..86052c5c60696 100644 --- a/drivers/mfd/twl6040.c +++ b/drivers/mfd/twl6040.c @@ -322,8 +322,19 @@ int twl6040_power(struct twl6040 *twl6040, int on) } }
+ /* + * Register access can produce errors after power-up unless we + * wait at least 8ms based on measurements on duovero. + */ + usleep_range(10000, 12000); + /* Sync with the HW */ - regcache_sync(twl6040->regmap); + ret = regcache_sync(twl6040->regmap); + if (ret) { + dev_err(twl6040->dev, "Failed to sync with the HW: %i\n", + ret); + goto out; + }
/* Default PLL configuration after power up */ twl6040->pll = TWL6040_SYSCLK_SEL_LPPLL;
From: Stephane Eranian eranian@google.com
[ Upstream commit c7a286577d7592720c2f179aadfb325a1ff48c95 ]
This patch fixes a restriction/bug introduced by:
583feb08e7f7 ("perf/x86/intel: Fix handling of wakeup_events for multi-entry PEBS")
The original patch prevented using multi-entry PEBS when wakeup_events != 0. However, given that wakeup_events is part of a union with wakeup_watermark, it means that in watermark mode PEBS multi-entry is also disabled, which is not the intent. This patch fixes this by checking whether watermark mode is enabled.
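For context, the two fields overlay each other in struct perf_event_attr (layout roughly as declared in include/uapi/linux/perf_event.h, quoted here from memory), so a non-zero value alone cannot tell the two modes apart; the fixed check therefore also tests attr.watermark:

    union {
        __u32 wakeup_events;    /* wakeup every n events */
        __u32 wakeup_watermark; /* bytes before wakeup   */
    };

    /* intel_pmu_hw_config(), after the fix (reflowed from the diff below): */
    if (!(event->attr.freq ||
          (event->attr.wakeup_events && !event->attr.watermark)))
        event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;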
Signed-off-by: Stephane Eranian eranian@google.com Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: vincent.weaver@maine.edu Fixes: 583feb08e7f7 ("perf/x86/intel: Fix handling of wakeup_events for multi-entry PEBS") Link: http://lkml.kernel.org/r/20190514003400.224340-1-eranian@google.com Signed-off-by: Ingo Molnar mingo@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 71fb8b7b29545..e7bb4572a89fe 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3184,7 +3184,7 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret;
if (event->attr.precise_ip) { - if (!(event->attr.freq || event->attr.wakeup_events)) { + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event)))
From: Ben Skeggs bskeggs@redhat.com
[ Upstream commit a0b694d0af21c9993d1a39a75fd814bd48bf7eb4 ]
HW has error checks in place which check that pixel depth is explicitly provided on DP, while HDMI has a "default" setting that we use.
In multi-display configurations with identical modelines, but different protocols (HDMI + DP, in this case), it was possible for the DP head to get swapped to the head which previously drove the HDMI output, without updating HeadSetControlOutputResource(), triggering the error check and hanging the core update.
Reported-by: Lyude Paul lyude@redhat.com Signed-off-by: Ben Skeggs bskeggs@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/nouveau/dispnv50/head.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/nouveau/dispnv50/head.c b/drivers/gpu/drm/nouveau/dispnv50/head.c index ac97ebce5b351..4f0b254ebabf2 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/head.c +++ b/drivers/gpu/drm/nouveau/dispnv50/head.c @@ -306,7 +306,7 @@ nv50_head_atomic_check(struct drm_crtc *crtc, struct drm_crtc_state *state) asyh->set.or = head->func->or != NULL; }
- if (asyh->state.mode_changed) + if (asyh->state.mode_changed || asyh->state.connectors_changed) nv50_head_atomic_check_mode(head, asyh);
if (asyh->state.color_mgmt_changed ||
From: Matt Redfearn matt.redfearn@thinci.com
[ Upstream commit 67793bd3b3948dc8c8384b6430e036a30a0ecb43 ]
The driver currently sets register 0xfb (Low Refresh Rate) based on the value of mode->vrefresh. Firstly, this field is specified to be in Hz, but the magic numbers used by the code are Hz * 1000. This essentially leads to the low refresh rate always being set to 0x01, since the vrefresh value will always be less than 24000. Fix the magic numbers to be in Hz. Secondly, according to the comment in drm_modes.h, the field is not supposed to be used in a functional way anyway. Instead, use the helper function drm_mode_vrefresh().
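Reflowed from the diff below; drm_mode_vrefresh() returns the vertical refresh rate in whole Hz computed from the mode timings, so the thresholds become plain Hz values:

    if (drm_mode_vrefresh(mode) <= 24)
        low_refresh_rate = ADV7511_LOW_REFRESH_RATE_24HZ;
    else if (drm_mode_vrefresh(mode) <= 25)
        low_refresh_rate = ADV7511_LOW_REFRESH_RATE_25HZ;
    else if (drm_mode_vrefresh(mode) <= 30)
        low_refresh_rate = ADV7511_LOW_REFRESH_RATE_30HZ;
    else
        low_refresh_rate = ADV7511_LOW_REFRESH_RATE_NONE;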
Fixes: 9c8af882bf12 ("drm: Add adv7511 encoder driver") Reviewed-by: Laurent Pinchart laurent.pinchart@ideasonboard.com Signed-off-by: Matt Redfearn matt.redfearn@thinci.com Signed-off-by: Sean Paul seanpaul@chromium.org Link: https://patchwork.freedesktop.org/patch/msgid/20190424132210.26338-1-matt.re... Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/bridge/adv7511/adv7511_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c index 85c2d407a52e1..e7ddd3e3db920 100644 --- a/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c +++ b/drivers/gpu/drm/bridge/adv7511/adv7511_drv.c @@ -747,11 +747,11 @@ static void adv7511_mode_set(struct adv7511 *adv7511, vsync_polarity = 1; }
- if (mode->vrefresh <= 24000) + if (drm_mode_vrefresh(mode) <= 24) low_refresh_rate = ADV7511_LOW_REFRESH_RATE_24HZ; - else if (mode->vrefresh <= 25000) + else if (drm_mode_vrefresh(mode) <= 25) low_refresh_rate = ADV7511_LOW_REFRESH_RATE_25HZ; - else if (mode->vrefresh <= 30000) + else if (drm_mode_vrefresh(mode) <= 30) low_refresh_rate = ADV7511_LOW_REFRESH_RATE_30HZ; else low_refresh_rate = ADV7511_LOW_REFRESH_RATE_NONE;
From: Josh Poimboeuf jpoimboe@redhat.com
[ Upstream commit e6da9567959e164f82bc81967e0d5b10dee870b4 ]
The ignore flag is set on fake jumps in order to keep add_jump_destinations() from setting their jump_dest, since it already got set when the fake jump was created.
But using the ignore flag is a bit of a hack. It's normally used to skip validation of an instruction, which doesn't really make sense for fake jumps.
Also, after the next patch, using the ignore flag for fake jumps can trigger a false "why am I validating an ignored function?" warning.
Instead just add an explicit check in add_jump_destinations() to skip fake jumps.
Signed-off-by: Josh Poimboeuf jpoimboe@redhat.com Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Link: http://lkml.kernel.org/r/71abc072ff48b2feccc197723a9c52859476c068.1557766718... Signed-off-by: Ingo Molnar mingo@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- tools/objtool/check.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 2cd57730381b8..ecf5fc77f50b5 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -28,6 +28,8 @@ #include <linux/hashtable.h> #include <linux/kernel.h>
+#define FAKE_JUMP_OFFSET -1 + struct alternative { struct list_head list; struct instruction *insn; @@ -501,7 +503,7 @@ static int add_jump_destinations(struct objtool_file *file) insn->type != INSN_JUMP_UNCONDITIONAL) continue;
- if (insn->ignore) + if (insn->ignore || insn->offset == FAKE_JUMP_OFFSET) continue;
rela = find_rela_by_dest_range(insn->sec, insn->offset, @@ -670,10 +672,10 @@ static int handle_group_alt(struct objtool_file *file, clear_insn_state(&fake_jump->state);
fake_jump->sec = special_alt->new_sec; - fake_jump->offset = -1; + fake_jump->offset = FAKE_JUMP_OFFSET; fake_jump->type = INSN_JUMP_UNCONDITIONAL; fake_jump->jump_dest = list_next_entry(last_orig_insn, list); - fake_jump->ignore = true; + fake_jump->func = orig_insn->func; }
if (!special_alt->new_len) {
From: Ben Skeggs bskeggs@redhat.com
[ Upstream commit d2434e4d942c32cadcbdbcd32c58f35098f3b604 ]
Cursor position updates were accidentally causing us to attempt to interlock window with window immediate, and without a matching window immediate update, NVDisplay could hang forever in some circumstances.
Fixes suspend/resume on (at least) Quadro RTX4000 (TU104).
Reported-by: Lyude Paul lyude@redhat.com Signed-off-by: Ben Skeggs bskeggs@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/gpu/drm/nouveau/dispnv50/disp.h | 1 + drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c | 1 + drivers/gpu/drm/nouveau/dispnv50/wndw.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.h b/drivers/gpu/drm/nouveau/dispnv50/disp.h index 2216c58620c2d..7c41b0599d1ac 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.h +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.h @@ -41,6 +41,7 @@ struct nv50_disp_interlock { NV50_DISP_INTERLOCK__SIZE } type; u32 data; + u32 wimm; };
void corec37d_ntfy_init(struct nouveau_bo *, u32); diff --git a/drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c b/drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c index 9103b8494279c..f7dbd965e4e72 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c @@ -75,6 +75,7 @@ wimmc37b_init_(const struct nv50_wimm_func *func, struct nouveau_drm *drm, return ret; }
+ wndw->interlock.wimm = wndw->interlock.data; wndw->immd = func; return 0; } diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c index ba9eea2ff16bb..bbdb6d274c047 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c @@ -149,7 +149,7 @@ nv50_wndw_flush_set(struct nv50_wndw *wndw, u32 *interlock, if (asyw->set.point) { if (asyw->set.point = false, asyw->set.mask) interlock[wndw->interlock.type] |= wndw->interlock.data; - interlock[NV50_DISP_INTERLOCK_WIMM] |= wndw->interlock.data; + interlock[NV50_DISP_INTERLOCK_WIMM] |= wndw->interlock.wimm;
wndw->immd->point(wndw, asyw); wndw->immd->update(wndw, interlock);
From: Krzesimir Nowak krzesimir@kinvolk.io
[ Upstream commit e2f7fc0ac6957cabff4cecf6c721979b571af208 ]
Commit 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") made the verifier add AND instructions to clear the unwanted bits with a mask when doing a narrow load. The mask is computed with
(1 << size * 8) - 1
where "size" is the size of the narrow load. When doing a 4 byte load of a an 8 byte field the verifier shifts the literal 1 by 32 places to the left. This results in an overflow of a signed integer, which is an undefined behavior. Typically, the computed mask was zero, so the result of the narrow load ended up being zero too.
Cast the literal to long long to avoid overflows. Note that a narrow load of a 4 byte field does not have the undefined behavior, because the load size can only be 1 or 2 bytes, so shifting 1 by 8 or 16 places will not overflow. And reading 4 bytes would not be a narrow load of a 4 byte field.
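A small standalone illustration of the overflow (userspace C, not kernel code; for demonstration only):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int size = 4;   /* narrow 4-byte load of an 8-byte context field */

        /* The old form, (1 << size * 8) - 1, shifts a 32-bit int by 32 bits,
         * which is undefined behaviour and in practice produced a zero mask. */
        uint64_t mask = (1ULL << size * 8) - 1;  /* 64-bit literal keeps the shift defined */

        printf("mask = 0x%llx\n", (unsigned long long)mask);  /* prints mask = 0xffffffff */
        return 0;
    }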
Fixes: 31fd85816dbe ("bpf: permits narrower load from bpf program context fields") Reviewed-by: Alban Crequy alban@kinvolk.io Reviewed-by: Iago López Galeiras iago@kinvolk.io Signed-off-by: Krzesimir Nowak krzesimir@kinvolk.io Cc: Yonghong Song yhs@fb.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d53825b6fcd94..3b287b42212ea 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6612,7 +6612,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1ULL << size * 8) - 1); } }
From: Chris Packham chris.packham@alliedtelesis.co.nz
[ Upstream commit 259799ea5a9aa099a267f3b99e1f7078bbaf5c5e ]
Use gen_rtx_set instead of gen_rtx_SET. The former is a wrapper macro that handles the difference between GCC versions implementing the latter.
This fixes the following error on my system with g++ 5.4.0 as the host compiler
HOSTCXX -fPIC scripts/gcc-plugins/arm_ssp_per_task_plugin.o scripts/gcc-plugins/arm_ssp_per_task_plugin.c:42:14: error: macro "gen_rtx_SET" requires 3 arguments, but only 2 given mask)), ^ scripts/gcc-plugins/arm_ssp_per_task_plugin.c: In function ‘unsigned int arm_pertask_ssp_rtl_execute()’: scripts/gcc-plugins/arm_ssp_per_task_plugin.c:39:20: error: ‘gen_rtx_SET’ was not declared in this scope emit_insn_before(gen_rtx_SET
Signed-off-by: Chris Packham chris.packham@alliedtelesis.co.nz Fixes: 189af4657186 ("ARM: smp: add support for per-task stack canaries") Cc: stable@vger.kernel.org Tested-by: Douglas Anderson dianders@chromium.org Signed-off-by: Kees Cook keescook@chromium.org Signed-off-by: Sasha Levin sashal@kernel.org --- scripts/gcc-plugins/arm_ssp_per_task_plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/gcc-plugins/arm_ssp_per_task_plugin.c b/scripts/gcc-plugins/arm_ssp_per_task_plugin.c index 89c47f57d1ce1..8c1af9bdcb1bb 100644 --- a/scripts/gcc-plugins/arm_ssp_per_task_plugin.c +++ b/scripts/gcc-plugins/arm_ssp_per_task_plugin.c @@ -36,7 +36,7 @@ static unsigned int arm_pertask_ssp_rtl_execute(void) mask = GEN_INT(sext_hwi(sp_mask, GET_MODE_PRECISION(Pmode))); masked_sp = gen_reg_rtx(Pmode);
- emit_insn_before(gen_rtx_SET(masked_sp, + emit_insn_before(gen_rtx_set(masked_sp, gen_rtx_AND(Pmode, stack_pointer_rtx, mask)),
From: Michael Ellerman mpe@ellerman.id.au
[ Upstream commit 2b8358a951b1e2a534a54924cd8245e58a1c5fb8 ]
The mpc85xx EDAC driver can be configured as a module but then fails to build because it uses two unexported symbols:
ERROR: ".pci_find_hose_for_OF_device" [drivers/edac/mpc85xx_edac_mod.ko] undefined! ERROR: ".early_find_capability" [drivers/edac/mpc85xx_edac_mod.ko] undefined!
We don't want to export those symbols just for this driver, so make the driver only configurable as a built-in.
This seems to have been broken since at least
c92132f59806 ("edac/85xx: Add PCIe error interrupt edac support")
(Nov 2013).
[ bp: make it depend on EDAC=y so that the EDAC core doesn't get built as a module. ]
Signed-off-by: Michael Ellerman mpe@ellerman.id.au Signed-off-by: Borislav Petkov bp@suse.de Acked-by: Johannes Thumshirn jth@kernel.org Cc: James Morse james.morse@arm.com Cc: Mauro Carvalho Chehab mchehab@kernel.org Cc: linux-edac linux-edac@vger.kernel.org Cc: linuxppc-dev@ozlabs.org Cc: morbidrsa@gmail.com Link: https://lkml.kernel.org/r/20190502141941.12927-1-mpe@ellerman.id.au Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/edac/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index e286b5b990035..a3e6750393380 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -251,8 +251,8 @@ config EDAC_PND2 micro-server but may appear on others in the future.
config EDAC_MPC85XX - tristate "Freescale MPC83xx / MPC85xx" - depends on FSL_SOC + bool "Freescale MPC83xx / MPC85xx" + depends on FSL_SOC && EDAC=y help Support for error detection and correction on the Freescale MPC8349, MPC8560, MPC8540, MPC8548, T4240
From: ZhangXiaoxu zhangxiaoxu5@huawei.com
[ Upstream commit f02f3755dbd14fb935d24b14650fff9ba92243b8 ]
The stat command with a soft mount never returns after the server is stopped.
When allocating a new client, the state of the client will be set to NFS4CLNT_LEASE_EXPIRED.
When the server is stopped, the state manager will run and recover according to the state. But since the state is NFS4CLNT_LEASE_EXPIRED, it will drain the slot table and send other tasks to the wait queue until the client has recovered. The stat command then hangs.
When discovering server trunking, the client will renew the lease, but checking the client state leads to client state corruption.
So we need to call the state manager to recover it when server IP trunking is detected.
Signed-off-by: ZhangXiaoxu zhangxiaoxu5@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Anna Schumaker Anna.Schumaker@Netapp.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/nfs/nfs4state.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 02488b50534ac..6999e870baa97 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -159,6 +159,10 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); + + /* If the client state need to recover, do it. */ + if (clp->cl_state) + nfs4_schedule_state_manager(clp); } out: return status;
From: Martin Blumenstingl martin.blumenstingl@googlemail.com
[ Upstream commit f173747fffdf037c791405ab4f1ec0eb392fc48e ]
Holding the spin-lock for all of the code in meson_pwm_apply() can result in a "BUG: scheduling while atomic". This can happen because clk_get_rate() (which is called from meson_pwm_calc()) may sleep. Only hold the spin-lock when modifying registers to solve this.
The reason why we need a spin-lock in the driver is because the REG_MISC_AB register is shared between the two channels provided by one PWM controller. The only functions where REG_MISC_AB is modified are meson_pwm_enable() and meson_pwm_disable() so the register reads/writes in there need to be protected by the spin-lock.
The original code also used the spin-lock to protect the values in struct meson_pwm_channel. This could be necessary if two consumers could use the same PWM channel. However, the PWM core doesn't allow this, so we don't need to protect the values in struct meson_pwm_channel with a lock.
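After the change, only the read-modify-write of the shared REG_MISC_AB register runs under the lock; reflowed from the meson_pwm_enable() hunks in the diff below (middle of the function elided):

    spin_lock_irqsave(&meson->lock, flags);

    value = readl(meson->base + REG_MISC_AB);
    value &= ~(MISC_CLK_DIV_MASK << clk_shift);
    value |= channel->pre_div << clk_shift;
    /* ... clock-enable and duty/period programming elided ... */
    value = readl(meson->base + REG_MISC_AB);
    value |= enable;
    writel(value, meson->base + REG_MISC_AB);

    spin_unlock_irqrestore(&meson->lock, flags);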
Fixes: 211ed630753d2f ("pwm: Add support for Meson PWM Controller") Signed-off-by: Martin Blumenstingl martin.blumenstingl@googlemail.com Reviewed-by: Uwe Kleine-König u.kleine-koenig@pengutronix.de Reviewed-by: Neil Armstrong narmstrong@baylibre.com Signed-off-by: Thierry Reding thierry.reding@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/pwm/pwm-meson.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c index c1ed641b3e266..f6e738ad7bd92 100644 --- a/drivers/pwm/pwm-meson.c +++ b/drivers/pwm/pwm-meson.c @@ -111,6 +111,10 @@ struct meson_pwm { const struct meson_pwm_data *data; void __iomem *base; u8 inverter_mask; + /* + * Protects register (write) access to the REG_MISC_AB register + * that is shared between the two PWMs. + */ spinlock_t lock; };
@@ -235,6 +239,7 @@ static void meson_pwm_enable(struct meson_pwm *meson, { u32 value, clk_shift, clk_enable, enable; unsigned int offset; + unsigned long flags;
switch (id) { case 0: @@ -255,6 +260,8 @@ static void meson_pwm_enable(struct meson_pwm *meson, return; }
+ spin_lock_irqsave(&meson->lock, flags); + value = readl(meson->base + REG_MISC_AB); value &= ~(MISC_CLK_DIV_MASK << clk_shift); value |= channel->pre_div << clk_shift; @@ -267,11 +274,14 @@ static void meson_pwm_enable(struct meson_pwm *meson, value = readl(meson->base + REG_MISC_AB); value |= enable; writel(value, meson->base + REG_MISC_AB); + + spin_unlock_irqrestore(&meson->lock, flags); }
static void meson_pwm_disable(struct meson_pwm *meson, unsigned int id) { u32 value, enable; + unsigned long flags;
switch (id) { case 0: @@ -286,9 +296,13 @@ static void meson_pwm_disable(struct meson_pwm *meson, unsigned int id) return; }
+ spin_lock_irqsave(&meson->lock, flags); + value = readl(meson->base + REG_MISC_AB); value &= ~enable; writel(value, meson->base + REG_MISC_AB); + + spin_unlock_irqrestore(&meson->lock, flags); }
static int meson_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, @@ -296,19 +310,16 @@ static int meson_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, { struct meson_pwm_channel *channel = pwm_get_chip_data(pwm); struct meson_pwm *meson = to_meson_pwm(chip); - unsigned long flags; int err = 0;
if (!state) return -EINVAL;
- spin_lock_irqsave(&meson->lock, flags); - if (!state->enabled) { meson_pwm_disable(meson, pwm->hwpwm); channel->state.enabled = false;
- goto unlock; + return 0; }
if (state->period != channel->state.period || @@ -329,7 +340,7 @@ static int meson_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, err = meson_pwm_calc(meson, channel, pwm->hwpwm, state->duty_cycle, state->period); if (err < 0) - goto unlock; + return err;
channel->state.polarity = state->polarity; channel->state.period = state->period; @@ -341,9 +352,7 @@ static int meson_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, channel->state.enabled = true; }
-unlock: - spin_unlock_irqrestore(&meson->lock, flags); - return err; + return 0; }
static void meson_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
From: Fabien Dessenne fabien.dessenne@st.com
[ Upstream commit 68a1c8485cf83734d4da9d81cd3b5d2ae7c0339b ]
On failure of_irq_get() returns a negative value or zero, which is not handled as an error in the existing implementation. Instead of using this API, use platform_get_irq() that returns exclusively a negative value on failure. Also, do not output an error log in case of defer probe error.
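Reflowed from the diff below, the per-IRQ probe loop becomes:

    for (i = 0; i < IPCC_IRQ_NUM; i++) {
        ipcc->irqs[i] = platform_get_irq_byname(pdev, irq_name[i]);
        if (ipcc->irqs[i] < 0) {
            if (ipcc->irqs[i] != -EPROBE_DEFER)
                dev_err(dev, "no IRQ specified %s\n", irq_name[i]);
            ret = ipcc->irqs[i];
            goto err_clk;
        }
        /* ... request the threaded IRQ as before ... */
    }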
Signed-off-by: Fabien Dessenne fabien.dessenne@st.com Signed-off-by: Jassi Brar jaswinder.singh@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/mailbox/stm32-ipcc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/drivers/mailbox/stm32-ipcc.c b/drivers/mailbox/stm32-ipcc.c index a338bd4cd7db6..b4efc6069082a 100644 --- a/drivers/mailbox/stm32-ipcc.c +++ b/drivers/mailbox/stm32-ipcc.c @@ -8,9 +8,9 @@ #include <linux/bitfield.h> #include <linux/clk.h> #include <linux/interrupt.h> +#include <linux/io.h> #include <linux/mailbox_controller.h> #include <linux/module.h> -#include <linux/of_irq.h> #include <linux/platform_device.h> #include <linux/pm_wakeirq.h>
@@ -240,9 +240,11 @@ static int stm32_ipcc_probe(struct platform_device *pdev)
/* irq */ for (i = 0; i < IPCC_IRQ_NUM; i++) { - ipcc->irqs[i] = of_irq_get_byname(dev->of_node, irq_name[i]); + ipcc->irqs[i] = platform_get_irq_byname(pdev, irq_name[i]); if (ipcc->irqs[i] < 0) { - dev_err(dev, "no IRQ specified %s\n", irq_name[i]); + if (ipcc->irqs[i] != -EPROBE_DEFER) + dev_err(dev, "no IRQ specified %s\n", + irq_name[i]); ret = ipcc->irqs[i]; goto err_clk; } @@ -263,9 +265,10 @@ static int stm32_ipcc_probe(struct platform_device *pdev)
/* wakeup */ if (of_property_read_bool(np, "wakeup-source")) { - ipcc->wkp = of_irq_get_byname(dev->of_node, "wakeup"); + ipcc->wkp = platform_get_irq_byname(pdev, "wakeup"); if (ipcc->wkp < 0) { - dev_err(dev, "could not get wakeup IRQ\n"); + if (ipcc->wkp != -EPROBE_DEFER) + dev_err(dev, "could not get wakeup IRQ\n"); ret = ipcc->wkp; goto err_clk; }
From: Miroslav Lichvar mlichvar@redhat.com
[ Upstream commit fdc6bae940ee9eb869e493990540098b8c0fd6ab ]
The ADJ_TAI adjtimex mode sets the TAI-UTC offset of the system clock. It is typically set by NTP/PTP implementations and it is automatically updated by the kernel on leap seconds. The initial value is zero (which applications may interpret as unknown), but this value cannot be set by adjtimex. This limitation seems to go back to the original "nanokernel" implementation by David Mills.
Change the ADJ_TAI check to accept zero as a valid TAI-UTC offset in order to allow setting it back to the initial value.
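A minimal userspace illustration of the interface (hypothetical usage, not part of the patch; requires CAP_SYS_TIME). Before this change a constant of 0 was silently ignored, afterwards it is applied:

    #include <stdio.h>
    #include <sys/timex.h>

    int main(void)
    {
        struct timex tx = {
            .modes    = ADJ_TAI,
            .constant = 0,   /* reset the TAI-UTC offset to the initial "unknown" value */
        };

        if (adjtimex(&tx) == -1) {
            perror("adjtimex");
            return 1;
        }
        printf("TAI offset is now %d\n", tx.tai);
        return 0;
    }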
Fixes: 153b5d054ac2 ("ntp: support for TAI") Suggested-by: Ondrej Mosnacek omosnace@redhat.com Signed-off-by: Miroslav Lichvar mlichvar@redhat.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: John Stultz john.stultz@linaro.org Cc: Richard Cochran richardcochran@gmail.com Cc: Prarit Bhargava prarit@redhat.com Link: https://lkml.kernel.org/r/20190417084833.7401-1-mlichvar@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36a2bef001253..2f11dc592eb3a 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -689,7 +689,7 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai time_constant = max(time_constant, 0l); }
- if (txc->modes & ADJ_TAI && txc->constant > 0) + if (txc->modes & ADJ_TAI && txc->constant >= 0) *time_tai = txc->constant;
if (txc->modes & ADJ_OFFSET)
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 22d61e286e2d9097dae36f75ed48801056b77cac ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203227
- Overview When mounting the attached crafted image, the following errors are reported. Additionally, it hangs on sync after trying to mount it.
The image is intentionally fuzzed from a normal f2fs image for testing. Compile options for F2FS are as follows. CONFIG_F2FS_FS=y CONFIG_F2FS_STAT_FS=y CONFIG_F2FS_FS_XATTR=y CONFIG_F2FS_FS_POSIX_ACL=y CONFIG_F2FS_CHECK_FS=y
- Reproduces mkdir test mount -t f2fs tmp.img test sync
- Messages kernel BUG at fs/f2fs/recovery.c:549! RIP: 0010:recover_data+0x167a/0x1780 Call Trace: f2fs_recover_fsync_data+0x613/0x710 f2fs_fill_super+0x1043/0x1aa0 mount_bdev+0x16d/0x1a0 mount_fs+0x4a/0x170 vfs_kern_mount+0x5d/0x100 do_mount+0x200/0xcf0 ksys_mount+0x79/0xc0 __x64_sys_mount+0x1c/0x20 do_syscall_64+0x43/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9
During recovery, if ofs_of_node is inconsistent between the recovered node page and the original checkpointed node page, let's just fail recovery instead of making the kernel panic.
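Reflowed from the diff below, the f2fs_bug_on() becomes an explicit check that warns and bails out; the later f2fs fixes in this series follow the same warn-and-return (often plus SBI_NEED_FSCK) pattern:

    if (ofs_of_node(dn.node_page) != ofs_of_node(page)) {
        f2fs_msg(sbi->sb, KERN_WARNING,
            "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u",
            inode->i_ino, ofs_of_node(dn.node_page),
            ofs_of_node(page));
        err = -EFAULT;
        goto err;
    }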
Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/recovery.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e3883db868d81..73338c432e7e4 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -546,7 +546,15 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto err;
f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); - f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); + + if (ofs_of_node(dn.node_page) != ofs_of_node(page)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", + inode->i_ino, ofs_of_node(dn.node_page), + ofs_of_node(page)); + err = -EFAULT; + goto err; + }
for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest;
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 05573d6ccf702df549a7bdeabef31e4753df1a90 ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203239
- Overview When mounting the attached crafted image and running the program, the following errors are reported. Additionally, it hangs on sync after running the program.
The image is intentionally fuzzed from a normal f2fs image for testing. Compile options for F2FS are as follows. CONFIG_F2FS_FS=y CONFIG_F2FS_STAT_FS=y CONFIG_F2FS_FS_XATTR=y CONFIG_F2FS_FS_POSIX_ACL=y CONFIG_F2FS_CHECK_FS=y
- Reproduces cc poc_15.c ./run.sh f2fs sync
- Kernel messages ------------[ cut here ]------------ kernel BUG at fs/f2fs/segment.c:3162! RIP: 0010:f2fs_inplace_write_data+0x12d/0x160 Call Trace: f2fs_do_write_data_page+0x3c1/0x820 __write_data_page+0x156/0x720 f2fs_write_cache_pages+0x20d/0x460 f2fs_write_data_pages+0x1b4/0x300 do_writepages+0x15/0x60 __filemap_fdatawrite_range+0x7c/0xb0 file_write_and_wait_range+0x2c/0x80 f2fs_do_sync_file+0x102/0x810 do_fsync+0x33/0x60 __x64_sys_fsync+0xb/0x10 do_syscall_64+0x43/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9
The reason is that f2fs_inplace_write_data() will trigger a kernel panic because the data block is located in a node-type segment.
To avoid the panic, let's just return an error code and set SBI_NEED_FSCK to give fsck a hint for later repair.
Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/segment.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 2b809b54d81bb..b3f1f75af05cc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3170,13 +3170,18 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) { int err; struct f2fs_sb_info *sbi = fio->sbi; + unsigned int segno;
fio->new_blkaddr = fio->old_blkaddr; /* i/o temperature is needed for passing down write hints */ __get_segment_type(fio);
- f2fs_bug_on(sbi, !IS_DATASEG(get_seg_entry(sbi, - GET_SEGNO(sbi, fio->new_blkaddr))->type)); + segno = GET_SEGNO(sbi, fio->new_blkaddr); + + if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + return -EFAULT; + }
stat_inc_inplace_blocks(fio->sbi);
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 988385795c7f46b231982d54750587f204bd558b ]
There are some places where we missed unlocking a page or unlocked a page incorrectly; fix them.
Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/recovery.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 73338c432e7e4..b14c718139a96 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -325,8 +325,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, break; }
- if (!is_recoverable_dnode(page)) + if (!is_recoverable_dnode(page)) { + f2fs_put_page(page, 1); break; + }
if (!is_fsync_dnode(page)) goto next; @@ -338,8 +340,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, if (!check_only && IS_INODE(page) && is_dent_dnode(page)) { err = f2fs_recover_inode_page(sbi, page); - if (err) + if (err) { + f2fs_put_page(page, 1); break; + } quota_inode = true; }
@@ -355,6 +359,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, err = 0; goto next; } + f2fs_put_page(page, 1); break; } } @@ -370,6 +375,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, "%s: detect looped node chain, " "blkaddr:%u, next:%u", __func__, blkaddr, next_blkaddr_of_node(page)); + f2fs_put_page(page, 1); err = -EINVAL; break; } @@ -380,7 +386,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
f2fs_ra_meta_pages_cond(sbi, blkaddr); } - f2fs_put_page(page, 1); return err; }
@@ -674,8 +679,10 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, */ if (IS_INODE(page)) { err = recover_inode(entry->inode, page); - if (err) + if (err) { + f2fs_put_page(page, 1); break; + } } if (entry->last_dentry == blkaddr) { err = recover_dentry(entry->inode, page, dir_list);
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 8b6810f8acfe429fde7c7dad4714692cc5f75651 ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203219
- Overview When mounting the attached crafted image and running the program, I got this error. Additionally, it hangs on sync after running the program.
The image is intentionally fuzzed from a normal f2fs image for testing and I enabled the option CONFIG_F2FS_CHECK_FS.
- Reproduces cc poc_06.c mkdir test mount -t f2fs tmp.img test cp a.out test cd test sudo ./a.out sync
- Messages kernel BUG at fs/f2fs/node.c:1183! RIP: 0010:f2fs_remove_inode_page+0x294/0x2d0 Call Trace: f2fs_evict_inode+0x2a3/0x3a0 evict+0xba/0x180 __dentry_kill+0xbe/0x160 dentry_kill+0x46/0x180 dput+0xbb/0x100 do_renameat2+0x3c9/0x550 __x64_sys_rename+0x17/0x20 do_syscall_64+0x43/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9
The reason is that f2fs_remove_inode_page() will trigger a kernel panic due to an inconsistent i_blocks value of the inode.
To avoid the panic, let's just print a debug message and set SBI_NEED_FSCK to give fsck a hint for later repair of potential image corruption.
Signed-off-by: Chao Yu yuchao0@huawei.com [Jaegeuk Kim: fix build warning and add unlikely] Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/node.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3f99ab2886955..d45ecef751165 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1179,8 +1179,14 @@ int f2fs_remove_inode_page(struct inode *inode) f2fs_put_dnode(&dn); return -EIO; } - f2fs_bug_on(F2FS_I_SB(inode), - inode->i_blocks != 0 && inode->i_blocks != 8); + + if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) { + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + "Inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, + (unsigned long long)inode->i_blocks); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + }
/* will put inode & node pages */ err = truncate_node(&dn);
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 626bcf2b7ce87211dba565f2bfa7842ba5be5c1b ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203225
- Overview When mounting the attached crafted image and unmounting it, the following errors are reported. Additionally, it hangs on sync after unmounting.
The image is intentionally fuzzed from a normal f2fs image for testing. Compile options for F2FS are as follows. CONFIG_F2FS_FS=y CONFIG_F2FS_STAT_FS=y CONFIG_F2FS_FS_XATTR=y CONFIG_F2FS_FS_POSIX_ACL=y CONFIG_F2FS_CHECK_FS=y
- Reproduces mkdir test mount -t f2fs tmp.img test touch test/t umount test sync
- Messages kernel BUG at fs/f2fs/node.c:3073! RIP: 0010:f2fs_destroy_node_manager+0x2f0/0x300 Call Trace: f2fs_put_super+0xf4/0x270 generic_shutdown_super+0x62/0x110 kill_block_super+0x1c/0x50 kill_f2fs_super+0xad/0xd0 deactivate_locked_super+0x35/0x60 cleanup_mnt+0x36/0x70 task_work_run+0x75/0x90 exit_to_usermode_loop+0x93/0xa0 do_syscall_64+0xba/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0010:f2fs_destroy_node_manager+0x2f0/0x300
The NAT table is corrupted, so reserved meta/node inode ids were incorrectly added to the free list. During file creation, since the reserved id is already cached in the inode hash, the creation fails and the preallocated nid cannot be released later, resulting in a kernel panic.
To fix this issue, let's do a nid boundary check during free nid loading.
Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d45ecef751165..63bb6134d39ae 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2082,6 +2082,9 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, if (unlikely(nid == 0)) return false;
+ if (unlikely(f2fs_check_nid_range(sbi, nid))) + return false; + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = FREE_NID;
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 546d22f070d64a7b96f57c93333772085d3a5e6d ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203217
- Overview When mounting the attached crafted image and running the program, I got this error. Additionally, it hangs on sync after running the program.
The image is intentionally fuzzed from a normal f2fs image for testing and I enabled the option CONFIG_F2FS_CHECK_FS.
- Reproduces cc poc_test_05.c mkdir test mount -t f2fs tmp.img test sudo ./a.out sync
- Messages kernel BUG at fs/f2fs/inode.c:707! RIP: 0010:f2fs_evict_inode+0x33f/0x3a0 Call Trace: evict+0xba/0x180 f2fs_iget+0x598/0xdf0 f2fs_lookup+0x136/0x320 __lookup_slow+0x92/0x140 lookup_slow+0x30/0x50 walk_component+0x1c1/0x350 path_lookupat+0x62/0x200 filename_lookup+0xb3/0x1a0 do_readlinkat+0x56/0x110 __x64_sys_readlink+0x16/0x20 do_syscall_64+0x43/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9
During inode loading, __recover_inline_status() can recover the inode status and set the inode dirty; once we fail in the following process, it will fail the check in f2fs_evict_inode(), resulting in triggering the BUG_ON().
Let's clear the dirty inode in the error path of f2fs_iget() to avoid the panic.
Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/inode.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bec52961630b9..8c8d40e07ebaf 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -473,6 +473,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return inode;
bad_inode: + f2fs_inode_synced(inode); iget_failed(inode); trace_f2fs_iget_exit(inode, ret); return ERR_PTR(ret);
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 5e159cd349bf3a31fb7e35c23a93308eb30f4f71 ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203209
- Overview When mounting the attached crafted image and running the program, I got this error. Additionally, it hangs on sync after running this script.
The image is intentionally fuzzed from a normal f2fs image for testing and I enabled the option CONFIG_F2FS_CHECK_FS.
- Reproduces cc poc_01.c ./run.sh f2fs sync
kernel BUG at fs/f2fs/f2fs.h:1788! RIP: 0010:f2fs_truncate_data_blocks_range+0x342/0x350 Call Trace: f2fs_truncate_blocks+0x36d/0x3c0 f2fs_truncate+0x88/0x110 f2fs_setattr+0x3e1/0x460 notify_change+0x2da/0x400 do_truncate+0x6d/0xb0 do_sys_ftruncate+0xf1/0x160 do_syscall_64+0x43/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9
The reason is that dec_valid_block_count() will trigger a kernel panic due to an inconsistent count between inode.i_blocks and the actual block count.
To avoid the panic, let's just print a debug message and set SBI_NEED_FSCK to give fsck a hint for later repair.
Signed-off-by: Chao Yu yuchao0@huawei.com [Jaegeuk Kim: fix build warning and add unlikely] Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/f2fs.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 48f1bbf3e87eb..f8dfa64429f64 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1787,6 +1787,7 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, return -ENOSPC; }
+void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, block_t count) @@ -1795,13 +1796,21 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks = min(sbi->reserved_blocks, sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); + if (unlikely(inode->i_blocks < sectors)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Inconsistent i_blocks, ino:%lu, iblocks:%llu, sectors:%llu", + inode->i_ino, + (unsigned long long)inode->i_blocks, + (unsigned long long)sectors); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return; + } f2fs_i_blocks_write(inode, count, false, true); }
@@ -2808,7 +2817,6 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); -void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); static inline void verify_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) {
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 622927f3b8809206f6da54a6a7ed4df1a7770fce ]
With the below mkfs and mount options:
MKFS_OPTIONS -- -O extra_attr -O project_quota -O inode_checksum -O flexible_inline_xattr -O inode_crtime -f MOUNT_OPTIONS -- -o noinline_xattr
We may miss xattr data with the below testcase: - mkdir dir - setfattr -n "user.name" -v 0 dir - for ((i = 0; i < 190; i++)) do touch dir/$i; done - umount - mount - getfattr -n "user.name" dir
user.name: No such attribute
The root cause is that we persist xattr data into the reserved inline xattr space even if inline_xattr is not enabled in an inline directory inode; after inline dentry conversion, the reserved space no longer exists, so the xattr data is lost.
To fix this issue, let's use the inline xattr space only if the inline_xattr flag is set on the inode.
Fixes: 6afc662e68b5 ("f2fs: support flexible inline xattr size") Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/f2fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f8dfa64429f64..2149d1f190d62 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2570,7 +2570,9 @@ static inline void *inline_xattr_addr(struct inode *inode, struct page *page)
static inline int inline_xattr_size(struct inode *inode) { - return get_inline_xattr_addrs(inode) * sizeof(__le32); + if (f2fs_has_inline_xattr(inode)) + return get_inline_xattr_addrs(inode) * sizeof(__le32); + return 0; }
static inline int f2fs_has_inline_data(struct inode *inode)
From: Chao Yu yuchao0@huawei.com
[ Upstream commit ea6d7e72fea49402aa445345aade7a26b81732e3 ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203213
- Overview When mounting the attached crafted image and running the program, I got this error. Additionally, it hangs on sync after running this script.
The image is intentionally fuzzed from a normal f2fs image for testing and I enabled the option CONFIG_F2FS_CHECK_FS.
- Reproduces mkdir test mount -t f2fs tmp.img test cp a.out test cd test sudo ./a.out sync
kernel BUG at fs/f2fs/f2fs.h:2012! RIP: 0010:truncate_node+0x2c9/0x2e0 Call Trace: f2fs_truncate_xattr_node+0xa1/0x130 f2fs_remove_inode_page+0x82/0x2d0 f2fs_evict_inode+0x2a3/0x3a0 evict+0xba/0x180 __dentry_kill+0xbe/0x160 dentry_kill+0x46/0x180 dput+0xbb/0x100 do_renameat2+0x3c9/0x550 __x64_sys_rename+0x17/0x20 do_syscall_64+0x43/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9
The reason is that dec_valid_node_count() will trigger a kernel panic due to an inconsistent count between inode.i_blocks and the actual block count.
To avoid the panic, let's just print a debug message and set SBI_NEED_FSCK to give fsck a hint for later repair.
Signed-off-by: Chao Yu yuchao0@huawei.com [Jaegeuk Kim: fix build warning and add unlikely] Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/f2fs.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2149d1f190d62..1686d09db3f1a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2028,7 +2028,6 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, !sbi->total_valid_block_count); f2fs_bug_on(sbi, !sbi->total_valid_node_count); - f2fs_bug_on(sbi, !is_inode && !inode->i_blocks);
sbi->total_valid_node_count--; sbi->total_valid_block_count--; @@ -2038,10 +2037,19 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
spin_unlock(&sbi->stat_lock);
- if (is_inode) + if (is_inode) { dquot_free_inode(inode); - else + } else { + if (unlikely(inode->i_blocks == 0)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, + (unsigned long long)inode->i_blocks); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return; + } f2fs_i_blocks_write(inode, 1, false, true); + } }
static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
From: Chao Yu yuchao0@huawei.com
[ Upstream commit e95bcdb2fefa129f37bd9035af1d234ca92ee4ef ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203233
- Overview When mounting the attached crafted image and running the program, the following errors are reported. Additionally, it hangs on sync after running the program.
The image is intentionally fuzzed from a normal f2fs image for testing. Compile options for F2FS are as follows. CONFIG_F2FS_FS=y CONFIG_F2FS_STAT_FS=y CONFIG_F2FS_FS_XATTR=y CONFIG_F2FS_FS_POSIX_ACL=y CONFIG_F2FS_CHECK_FS=y
- Reproduces cc poc_13.c mkdir test mount -t f2fs tmp.img test cp a.out test cd test sudo ./a.out sync
- Kernel messages F2FS-fs (sdb): Bitmap was wrongly set, blk:4608 kernel BUG at fs/f2fs/segment.c:2102! RIP: 0010:update_sit_entry+0x394/0x410 Call Trace: f2fs_allocate_data_block+0x16f/0x660 do_write_page+0x62/0x170 f2fs_do_write_node_page+0x33/0xa0 __write_node_page+0x270/0x4e0 f2fs_sync_node_pages+0x5df/0x670 f2fs_write_checkpoint+0x372/0x1400 f2fs_sync_fs+0xa3/0x130 f2fs_do_sync_file+0x1a6/0x810 do_fsync+0x33/0x60 __x64_sys_fsync+0xb/0x10 do_syscall_64+0x43/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9
sit.vblocks and the valid block count summed from sit.valid_map may be inconsistent, so a segment with zero vblocks will be treated as a free segment. While allocating in such a free segment, we may allocate a block whose bitmap bit was previously valid, which can cause a kernel crash due to bitmap verification failure.
Anyway, to avoid further serious metadata inconsistency and corruption, it is necessary and worthwhile to detect SIT inconsistency. So let's enable check_block_count() to verify vblocks and valid_map all the time rather than only when CONFIG_F2FS_CHECK_FS is enabled.
Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/segment.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a77f76f528b61..5af21e53ecf58 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -672,7 +672,6 @@ static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) static inline int check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { -#ifdef CONFIG_F2FS_CHECK_FS bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; @@ -699,7 +698,7 @@ static inline int check_block_count(struct f2fs_sb_info *sbi, set_sbi_flag(sbi, SBI_NEED_FSCK); return -EINVAL; } -#endif + /* check segment usage, and check boundary of a given segment number */ if (unlikely(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg || segno > TOTAL_SEGS(sbi) - 1)) {
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 793ab1c8a792f8bccd7ae4c5be02bd275410b3af ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203211
- Overview When mounting the attached crafted image and making a new file, I got this error and the error messages keep repeating.
The image is intentionally fuzzed from a normal f2fs image for testing and I ran with the option CONFIG_F2FS_CHECK_FS enabled.
- Reproduces mkdir test mount -t f2fs tmp.img test cd test touch t
- Messages [ 58.820451] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.821485] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.822530] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.823571] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.824616] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.825640] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.826663] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.827698] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.828719] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.829759] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.830783] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.831828] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.832869] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.833888] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.834945] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.835996] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.837028] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.838051] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.839072] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.840100] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.841147] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.842186] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.843214] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.844267] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.845282] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.846305] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT [ 58.847341] F2FS-fs (sdb): Inconsistent segment (1) type [1, 0] in SSA and SIT ... (repeating)
During GC, if the segment type stored in SSA and SIT is inconsistent, we just skip migrating the current segment, since we need to know the exact type to decide which migration function to use.
So in foreground GC, we will easily run into an infinite loop, as we may select the same victim segment with the inconsistent type due to the greedy policy. In order to end this, we choose to shut down the filesystem. For background GC, we need to do that as well, so that we can avoid a later potential infinitely-looping foreground GC.
Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/gc.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ab764bd106de1..a66a8752e5f65 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1175,6 +1175,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, "type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_stop_checkpoint(sbi, false); goto skip; }
From: Chao Yu yuchao0@huawei.com
[ Upstream commit 45a746881576977f85504c21a75547f10c5c0a8e ]
With the below mkfs and mount options, generic/339 of fstests will report that the scratch image becomes corrupted.
MKFS_OPTIONS -- -O extra_attr -O project_quota -O inode_checksum -O flexible_inline_xattr -O inode_crtime -f /dev/zram1 MOUNT_OPTIONS -- -o acl,user_xattr -o discard,noinline_xattr /dev/zram1 /mnt/scratch_f2fs
[ASSERT] (f2fs_check_dirent_position:1315) --> Wrong position of dirent pino:1970, name: (...) level:8, dir_level:0, pgofs:951, correct range:[900, 901]
In the old kernel, inline data and directories always reserved 200 bytes in the inode layout, even if inline_xattr is disabled. The new kernel then tries to retrieve that space for non-inline xattr inodes, but for an inline dentry its layout size should be fixed, so we just keep that reserved space.
But the problem here is that, after inline dentry conversion, the inline dentry layout no longer exists; if we still reserve the inline xattr space, then after dentry updates there will be a hole in the inline xattr space, which can break the hierarchical hash directory structure.
This patch fixes this issue by retrieving inline xattr space after inline dentry conversion.
Fixes: 6afc662e68b5 ("f2fs: support flexible inline xattr size") Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/inline.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index aacbb864ec1ed..0c2251d595ec8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -420,6 +420,14 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY);
+ /* + * should retrieve reserved space which was used to keep + * inline_dentry's structure for backward compatibility. + */ + if (!f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(dir)) && + !f2fs_has_inline_xattr(dir)) + F2FS_I(dir)->i_inline_xattr_size = 0; + f2fs_i_depth_write(dir, 1); if (i_size_read(dir) < PAGE_SIZE) f2fs_i_size_write(dir, PAGE_SIZE); @@ -501,6 +509,15 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); + + /* + * should retrieve reserved space which was used to keep + * inline_dentry's structure for backward compatibility. + */ + if (!f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(dir)) && + !f2fs_has_inline_xattr(dir)) + F2FS_I(dir)->i_inline_xattr_size = 0; + kvfree(backup_dentry); return 0; recover:
From: Chao Yu yuchao0@huawei.com
[ Upstream commit b42b179bda9ff11075a6fc2bac4d9e400513679a ]
As Jungyeon reported in bugzilla:
https://bugzilla.kernel.org/show_bug.cgi?id=203221
- Overview When mounting the attached crafted image and running the program, this error is reported.
The image is intentionally fuzzed from a normal f2fs image for testing and I enabled the option CONFIG_F2FS_CHECK_FS.
- Reproduces cc poc_07.c mkdir test mount -t f2fs tmp.img test cp a.out test cd test sudo ./a.out
- Messages kernel BUG at fs/f2fs/node.c:1279! RIP: 0010:read_node_page+0xcf/0xf0 Call Trace: __get_node_page+0x6b/0x2f0 f2fs_iget+0x8f/0xdf0 f2fs_lookup+0x136/0x320 __lookup_slow+0x92/0x140 lookup_slow+0x30/0x50 walk_component+0x1c1/0x350 path_lookupat+0x62/0x200 filename_lookup+0xb3/0x1a0 do_fchmodat+0x3e/0xa0 __x64_sys_chmod+0x12/0x20 do_syscall_64+0x43/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9
On the below paths, we can have the opportunity to readahead an inode page: - gc_node_segment -> f2fs_ra_node_page - gc_data_segment -> f2fs_ra_node_page - f2fs_fill_dentries -> f2fs_ra_node_page
Unlike a synchronous read, on the readahead path we can set the page uptodate before verifying the page's checksum; read_node_page() will then trigger a kernel panic once it encounters an uptodate page with an incorrect checksum.
So considering readahead scenario, we have to do checksum each time when loading inode page even if it is uptodated.
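The underlying pattern, independent of the f2fs internals, is to re-verify integrity on every access instead of trusting a cached uptodate flag that may have been set before verification. A minimal userspace sketch of that idea with a toy additive checksum; cached_block and load_block are invented names, not f2fs code:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy stand-in for a cached page; all names are invented for illustration. */
struct cached_block {
	uint8_t data[16];
	uint8_t stored_sum;	/* checksum recorded when the block was written */
	int uptodate;		/* may have been set by a readahead-like path */
};

static uint8_t checksum(const uint8_t *buf, size_t len)
{
	uint8_t sum = 0;

	while (len--)
		sum += *buf++;
	return sum;
}

/* Verify even when the block is already marked uptodate. */
static int load_block(struct cached_block *b)
{
	if (b->uptodate) {
		if (checksum(b->data, sizeof(b->data)) != b->stored_sum) {
			b->uptodate = 0;	/* drop the stale cached state */
			return -EBADMSG;
		}
		return 0;		/* cached copy verified, use it */
	}
	/* A real implementation would read from storage and verify here. */
	return 0;
}

int main(void)
{
	struct cached_block b = { .uptodate = 1 };

	memset(b.data, 0xab, sizeof(b.data));
	b.stored_sum = checksum(b.data, sizeof(b.data));
	printf("clean block:   %d\n", load_block(&b));

	b.data[3] ^= 0xff;	/* corruption that the uptodate flag cannot see */
	printf("corrupt block: %d (-EBADMSG is %d)\n", load_block(&b), -EBADMSG);
	return 0;
}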
Signed-off-by: Chao Yu yuchao0@huawei.com Signed-off-by: Jaegeuk Kim jaegeuk@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- fs/f2fs/inode.c | 4 ++-- fs/f2fs/node.c | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 8c8d40e07ebaf..24c81703ec56f 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -176,8 +176,8 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page)
if (provided != calculated) f2fs_msg(sbi->sb, KERN_WARNING, - "checksum invalid, ino = %x, %x vs. %x", - ino_of_node(page), provided, calculated); + "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x", + page->index, ino_of_node(page), provided, calculated);
return provided == calculated; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 63bb6134d39ae..e29d5f6735ae9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1281,9 +1281,10 @@ static int read_node_page(struct page *page, int op_flags) int err;
if (PageUptodate(page)) { -#ifdef CONFIG_F2FS_CHECK_FS - f2fs_bug_on(sbi, !f2fs_inode_chksum_verify(sbi, page)); -#endif + if (!f2fs_inode_chksum_verify(sbi, page)) { + ClearPageUptodate(page); + return -EBADMSG; + } return LOCKED_PAGE; }
From: Eugen Hristev eugen.hristev@microchip.com
[ Upstream commit 1e4e25c4959c10728fbfcc6a286f9503d32dfe02 ]
The subsystem will free the asd memory on notifier cleanup, if the asd has been added to the notifier. However, that memory is freed with kfree, so the asd cannot be allocated with devm_*; doing so can lead to crashes and other problems. To reproduce the issue, simply return an error at probe, but clean up the notifier beforehand.
Fixes: 106267444f ("[media] atmel-isc: add the Image Sensor Controller code")
Signed-off-by: Eugen Hristev eugen.hristev@microchip.com Signed-off-by: Hans Verkuil hverkuil-cisco@xs4all.nl Signed-off-by: Mauro Carvalho Chehab mchehab+samsung@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/media/platform/atmel/atmel-isc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/media/platform/atmel/atmel-isc.c b/drivers/media/platform/atmel/atmel-isc.c index 50178968b8a63..5b0e96bd8c7e5 100644 --- a/drivers/media/platform/atmel/atmel-isc.c +++ b/drivers/media/platform/atmel/atmel-isc.c @@ -2065,8 +2065,11 @@ static int isc_parse_dt(struct device *dev, struct isc_device *isc) break; }
- subdev_entity->asd = devm_kzalloc(dev, - sizeof(*subdev_entity->asd), GFP_KERNEL); + /* asd will be freed by the subsystem once it's added to the + * notifier list + */ + subdev_entity->asd = kzalloc(sizeof(*subdev_entity->asd), + GFP_KERNEL); if (!subdev_entity->asd) { of_node_put(rem); ret = -ENOMEM; @@ -2210,6 +2213,7 @@ static int atmel_isc_probe(struct platform_device *pdev) subdev_entity->asd); if (ret) { fwnode_handle_put(subdev_entity->asd->match.fwnode); + kfree(subdev_entity->asd); goto cleanup_subdev; }
From: John Sperbeck jsperbeck@google.com
[ Upstream commit 198790d9a3aeaef5792d33a560020861126edc22 ]
In free_percpu() we sometimes call pcpu_schedule_balance_work() to queue a work item (which does a wakeup) while holding pcpu_lock. This creates an unnecessary lock dependency between pcpu_lock and the scheduler's pi_lock. There are other places where we call pcpu_schedule_balance_work() without holding pcpu_lock, and this case doesn't need to be different.
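The shape of the fix is generic: record the decision while holding the lock, perform the wakeup after dropping it. A minimal userspace sketch of that pattern with pthreads; schedule_balance_work() and the chunk counter are invented stand-ins, not the percpu allocator:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_empty_chunks;

/* Stand-in for pcpu_schedule_balance_work(); in the kernel this queues a
 * work item and may take scheduler/workqueue locks of its own. */
static void schedule_balance_work(void)
{
	printf("balance work scheduled\n");
}

static void free_obj(void)
{
	bool need_balance = false;

	pthread_mutex_lock(&lock);
	nr_empty_chunks++;
	if (nr_empty_chunks > 1)
		need_balance = true;	/* only record the decision under the lock */
	pthread_mutex_unlock(&lock);

	/* Acting on it here creates no ordering between 'lock' and whatever
	 * locks schedule_balance_work() needs internally. */
	if (need_balance)
		schedule_balance_work();
}

int main(void)
{
	free_obj();
	free_obj();		/* the second free triggers the deferred work */
	return 0;
}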
Moving the call outside the lock prevents the following lockdep splat when running tools/testing/selftests/bpf/{test_maps,test_progs} in sequence with lockdep enabled:
====================================================== WARNING: possible circular locking dependency detected 5.1.0-dbg-DEV #1 Not tainted ------------------------------------------------------ kworker/23:255/18872 is trying to acquire lock: 000000000bc79290 (&(&pool->lock)->rlock){-.-.}, at: __queue_work+0xb2/0x520
but task is already holding lock: 00000000e3e7a6aa (pcpu_lock){..-.}, at: free_percpu+0x36/0x260
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #4 (pcpu_lock){..-.}: lock_acquire+0x9e/0x180 _raw_spin_lock_irqsave+0x3a/0x50 pcpu_alloc+0xfa/0x780 __alloc_percpu_gfp+0x12/0x20 alloc_htab_elem+0x184/0x2b0 __htab_percpu_map_update_elem+0x252/0x290 bpf_percpu_hash_update+0x7c/0x130 __do_sys_bpf+0x1912/0x1be0 __x64_sys_bpf+0x1a/0x20 do_syscall_64+0x59/0x400 entry_SYSCALL_64_after_hwframe+0x49/0xbe
-> #3 (&htab->buckets[i].lock){....}: lock_acquire+0x9e/0x180 _raw_spin_lock_irqsave+0x3a/0x50 htab_map_update_elem+0x1af/0x3a0
-> #2 (&rq->lock){-.-.}: lock_acquire+0x9e/0x180 _raw_spin_lock+0x2f/0x40 task_fork_fair+0x37/0x160 sched_fork+0x211/0x310 copy_process.part.43+0x7b1/0x2160 _do_fork+0xda/0x6b0 kernel_thread+0x29/0x30 rest_init+0x22/0x260 arch_call_rest_init+0xe/0x10 start_kernel+0x4fd/0x520 x86_64_start_reservations+0x24/0x26 x86_64_start_kernel+0x6f/0x72 secondary_startup_64+0xa4/0xb0
-> #1 (&p->pi_lock){-.-.}: lock_acquire+0x9e/0x180 _raw_spin_lock_irqsave+0x3a/0x50 try_to_wake_up+0x41/0x600 wake_up_process+0x15/0x20 create_worker+0x16b/0x1e0 workqueue_init+0x279/0x2ee kernel_init_freeable+0xf7/0x288 kernel_init+0xf/0x180 ret_from_fork+0x24/0x30
-> #0 (&(&pool->lock)->rlock){-.-.}: __lock_acquire+0x101f/0x12a0 lock_acquire+0x9e/0x180 _raw_spin_lock+0x2f/0x40 __queue_work+0xb2/0x520 queue_work_on+0x38/0x80 free_percpu+0x221/0x260 pcpu_freelist_destroy+0x11/0x20 stack_map_free+0x2a/0x40 bpf_map_free_deferred+0x3c/0x50 process_one_work+0x1f7/0x580 worker_thread+0x54/0x410 kthread+0x10f/0x150 ret_from_fork+0x24/0x30
other info that might help us debug this:
Chain exists of: &(&pool->lock)->rlock --> &htab->buckets[i].lock --> pcpu_lock
Possible unsafe locking scenario:
CPU0 CPU1 ---- ---- lock(pcpu_lock); lock(&htab->buckets[i].lock); lock(pcpu_lock); lock(&(&pool->lock)->rlock);
*** DEADLOCK ***
3 locks held by kworker/23:255/18872: #0: 00000000b36a6e16 ((wq_completion)events){+.+.}, at: process_one_work+0x17a/0x580 #1: 00000000dfd966f0 ((work_completion)(&map->work)){+.+.}, at: process_one_work+0x17a/0x580 #2: 00000000e3e7a6aa (pcpu_lock){..-.}, at: free_percpu+0x36/0x260
stack backtrace: CPU: 23 PID: 18872 Comm: kworker/23:255 Not tainted 5.1.0-dbg-DEV #1 Hardware name: ... Workqueue: events bpf_map_free_deferred Call Trace: dump_stack+0x67/0x95 print_circular_bug.isra.38+0x1c6/0x220 check_prev_add.constprop.50+0x9f6/0xd20 __lock_acquire+0x101f/0x12a0 lock_acquire+0x9e/0x180 _raw_spin_lock+0x2f/0x40 __queue_work+0xb2/0x520 queue_work_on+0x38/0x80 free_percpu+0x221/0x260 pcpu_freelist_destroy+0x11/0x20 stack_map_free+0x2a/0x40 bpf_map_free_deferred+0x3c/0x50 process_one_work+0x1f7/0x580 worker_thread+0x54/0x410 kthread+0x10f/0x150 ret_from_fork+0x24/0x30
Signed-off-by: John Sperbeck jsperbeck@google.com Signed-off-by: Dennis Zhou dennis@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- mm/percpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/mm/percpu.c b/mm/percpu.c index 59bd6a51954c9..4ad3a4214249e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1721,6 +1721,7 @@ void free_percpu(void __percpu *ptr) struct pcpu_chunk *chunk; unsigned long flags; int off; + bool need_balance = false;
if (!ptr) return; @@ -1742,7 +1743,7 @@ void free_percpu(void __percpu *ptr)
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - pcpu_schedule_balance_work(); + need_balance = true; break; } } @@ -1750,6 +1751,9 @@ void free_percpu(void __percpu *ptr) trace_percpu_free_percpu(chunk->base_addr, off, ptr);
spin_unlock_irqrestore(&pcpu_lock, flags); + + if (need_balance) + pcpu_schedule_balance_work(); } EXPORT_SYMBOL_GPL(free_percpu);
From: Masami Hiramatsu mhiramat@kernel.org
[ Upstream commit 3dd1f7f24f8ceec00bbbc364c2ac3c893f0fdc4c ]
Fix the type of $comm to always be "string". If any other type is set on the $comm argument, it yields a meaningless value or wrong data. Currently probe events allow setting a string array type (e.g. ":string[2]"), or other numeric types like x8, on $comm. But since $comm is clearly just string data, it must not be fetched as any other type, including arrays.
Link: http://lkml.kernel.org/r/155723736241.9149.14582064184468574539.stgit@devnot...
Cc: Andreas Ziegler andreas.ziegler@fau.de Cc: Ingo Molnar mingo@redhat.com Cc: stable@vger.kernel.org Fixes: 533059281ee5 ("tracing: probeevent: Introduce new argument fetching code") Signed-off-by: Masami Hiramatsu mhiramat@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/trace/trace_probe.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9962cb5da8acd..44f078cda0ac8 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -405,13 +405,14 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, return -E2BIG; } } - /* - * The default type of $comm should be "string", and it can't be - * dereferenced. - */ - if (!t && strcmp(arg, "$comm") == 0) + + /* Since $comm can not be dereferred, we can find $comm by strcmp */ + if (strcmp(arg, "$comm") == 0) { + /* The type of $comm must be "string", and not an array. */ + if (parg->count || (t && strcmp(t, "string"))) + return -EINVAL; parg->type = find_fetch_type("string"); - else + } else parg->type = find_fetch_type(t); if (!parg->type) { pr_info("Unsupported type: %s\n", t);
From: Elazar Leibovich elazar@lightbitslabs.com
[ Upstream commit cbe08bcbbe787315c425dde284dcb715cfbf3f39 ]
When reading only part of the id file, ppos isn't tracked correctly. This is taken care of by simple_read_from_buffer().
Reading a single byte, and then the next byte, would result in EOF.
While this may not seem like a big deal, it breaks abstractions that read information from files unbuffered. See for example https://github.com/golang/go/issues/29399
This code was mentioned as problematic in commit cd458ba9d5a5 ("tracing: Do not (ab)use trace_seq in event_id_read()")
An example C program that shows this bug:
#include <stdio.h>
#include <stdint.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc < 2)
		return 1;
	int fd = open(argv[1], O_RDONLY);
	char c;

	read(fd, &c, 1);
	printf("First %c\n", c);
	read(fd, &c, 1);
	printf("Second %c\n", c);
}
Then run with, e.g.
sudo ./a.out /sys/kernel/debug/tracing/events/tcp/tcp_set_state/id
You'll notice you're getting the first character twice, instead of the first two characters in the id file.
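For reference, the helper the commit message points at, simple_read_from_buffer(), copies from an in-memory buffer starting at *ppos and advances *ppos by the number of bytes it returns. A rough userspace analogue of those semantics (not the kernel implementation; it omits the copy_to_user() handling and uses off_t instead of loff_t):

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

static ssize_t read_from_buffer(char *to, size_t count, off_t *ppos,
				const char *from, size_t available)
{
	off_t pos = *ppos;

	if (pos < 0)
		return -1;		/* the kernel returns -EINVAL here */
	if ((size_t)pos >= available || !count)
		return 0;		/* EOF once the offset passes the data */
	if (count > available - pos)
		count = available - pos;
	memcpy(to, from + pos, count);
	*ppos = pos + count;		/* the file position actually advances */
	return count;
}

int main(void)
{
	const char id[] = "314";
	off_t pos = 0;
	char c;

	/* Successive one-byte reads now yield '3', '1', '4' and then EOF,
	 * instead of returning the first byte over and over. */
	while (read_from_buffer(&c, 1, &pos, id, sizeof(id) - 1) == 1)
		printf("read: %c\n", c);
	return 0;
}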
Link: http://lkml.kernel.org/r/20181231115837.4932-1-elazar@lightbitslabs.com
Cc: Orit Wasserman orit.was@gmail.com Cc: Oleg Nesterov oleg@redhat.com Cc: Ingo Molnar mingo@redhat.com Cc: stable@vger.kernel.org Fixes: 23725aeeab10b ("ftrace: provide an id file for each event") Signed-off-by: Elazar Leibovich elazar@lightbitslabs.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Sasha Levin sashal@kernel.org --- kernel/trace/trace_events.c | 3 --- 1 file changed, 3 deletions(-)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5b3b0c3c8a47e..d910e36c34b5c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1318,9 +1318,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) char buf[32]; int len;
- if (*ppos) - return 0; - if (unlikely(!id)) return -ENODEV;
From: YueHaibing yuehaibing@huawei.com
[ Upstream commit 35399f87e271f7cf3048eab00a421a6519ac8441 ]
In configfs_register_group(), if create_default_group() fails, we forget to unlink the group. This leaves an invalid item in the parent list, which may trigger the use-after-free issue seen below:
BUG: KASAN: use-after-free in __list_add_valid+0xd4/0xe0 lib/list_debug.c:26 Read of size 8 at addr ffff8881ef61ae20 by task syz-executor.0/5996
CPU: 1 PID: 5996 Comm: syz-executor.0 Tainted: G C 5.0.0+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xa9/0x10e lib/dump_stack.c:113 print_address_description+0x65/0x270 mm/kasan/report.c:187 kasan_report+0x149/0x18d mm/kasan/report.c:317 __list_add_valid+0xd4/0xe0 lib/list_debug.c:26 __list_add include/linux/list.h:60 [inline] list_add_tail include/linux/list.h:93 [inline] link_obj+0xb0/0x190 fs/configfs/dir.c:759 link_group+0x1c/0x130 fs/configfs/dir.c:784 configfs_register_group+0x56/0x1e0 fs/configfs/dir.c:1751 configfs_register_default_group+0x72/0xc0 fs/configfs/dir.c:1834 ? 0xffffffffc1be0000 iio_sw_trigger_init+0x23/0x1000 [industrialio_sw_trigger] do_one_initcall+0xbc/0x47d init/main.c:887 do_init_module+0x1b5/0x547 kernel/module.c:3456 load_module+0x6405/0x8c10 kernel/module.c:3804 __do_sys_finit_module+0x162/0x190 kernel/module.c:3898 do_syscall_64+0x9f/0x450 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x462e99 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f494ecbcc58 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000462e99 RDX: 0000000000000000 RSI: 0000000020000180 RDI: 0000000000000003 RBP: 00007f494ecbcc70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f494ecbd6bc R13: 00000000004bcefa R14: 00000000006f6fb0 R15: 0000000000000004
Allocated by task 5987: set_track mm/kasan/common.c:87 [inline] __kasan_kmalloc.constprop.3+0xa0/0xd0 mm/kasan/common.c:497 kmalloc include/linux/slab.h:545 [inline] kzalloc include/linux/slab.h:740 [inline] configfs_register_default_group+0x4c/0xc0 fs/configfs/dir.c:1829 0xffffffffc1bd0023 do_one_initcall+0xbc/0x47d init/main.c:887 do_init_module+0x1b5/0x547 kernel/module.c:3456 load_module+0x6405/0x8c10 kernel/module.c:3804 __do_sys_finit_module+0x162/0x190 kernel/module.c:3898 do_syscall_64+0x9f/0x450 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe
Freed by task 5987: set_track mm/kasan/common.c:87 [inline] __kasan_slab_free+0x130/0x180 mm/kasan/common.c:459 slab_free_hook mm/slub.c:1429 [inline] slab_free_freelist_hook mm/slub.c:1456 [inline] slab_free mm/slub.c:3003 [inline] kfree+0xe1/0x270 mm/slub.c:3955 configfs_register_default_group+0x9a/0xc0 fs/configfs/dir.c:1836 0xffffffffc1bd0023 do_one_initcall+0xbc/0x47d init/main.c:887 do_init_module+0x1b5/0x547 kernel/module.c:3456 load_module+0x6405/0x8c10 kernel/module.c:3804 __do_sys_finit_module+0x162/0x190 kernel/module.c:3898 do_syscall_64+0x9f/0x450 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe
The buggy address belongs to the object at ffff8881ef61ae00 which belongs to the cache kmalloc-192 of size 192 The buggy address is located 32 bytes inside of 192-byte region [ffff8881ef61ae00, ffff8881ef61aec0) The buggy address belongs to the page: page:ffffea0007bd8680 count:1 mapcount:0 mapping:ffff8881f6c03000 index:0xffff8881ef61a700 flags: 0x2fffc0000000200(slab) raw: 02fffc0000000200 ffffea0007ca4740 0000000500000005 ffff8881f6c03000 raw: ffff8881ef61a700 000000008010000c 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff8881ef61ad00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff8881ef61ad80: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc
ffff8881ef61ae00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^ ffff8881ef61ae80: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff8881ef61af00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
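Stripped of the configfs details, the rule the fix applies is that a failed registration must undo its earlier linking step before returning, otherwise the parent keeps a pointer to memory that is about to be freed. A self-contained toy version of that link/unlink rollback; the list and the register_group()/create_defaults() names here are invented, not the configfs API:

#include <stdio.h>
#include <stdlib.h>

/* Minimal circular doubly linked list, not the kernel's list.h. */
struct node {
	struct node *prev, *next;
};

static struct node parent = { &parent, &parent };

static void link_node(struct node *n)
{
	n->prev = parent.prev;
	n->next = &parent;
	parent.prev->next = n;
	parent.prev = n;
}

static void unlink_node(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

static int create_defaults(int should_fail)
{
	return should_fail ? -1 : 0;	/* simulates create_default_group() failing */
}

static int register_group(struct node *n, int should_fail)
{
	int ret;

	link_node(n);			/* step 1: item becomes visible in the parent */
	ret = create_defaults(should_fail);
	if (ret)
		goto err_out;
	return 0;
err_out:
	unlink_node(n);			/* undo step 1, or a dangling entry remains */
	return ret;
}

int main(void)
{
	struct node *n = calloc(1, sizeof(*n));

	if (register_group(n, 1))
		free(n);		/* safe: the parent list no longer references it */
	printf("parent list empty again: %s\n",
	       parent.next == &parent ? "yes" : "no");
	return 0;
}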
Fixes: 5cf6a51e6062 ("configfs: allow dynamic group creation") Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: YueHaibing yuehaibing@huawei.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin sashal@kernel.org --- fs/configfs/dir.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 39843fa7e11b0..920d350df37b6 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1755,12 +1755,19 @@ int configfs_register_group(struct config_group *parent_group,
inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); ret = create_default_group(parent_group, group); - if (!ret) { - spin_lock(&configfs_dirent_lock); - configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); - spin_unlock(&configfs_dirent_lock); - } + if (ret) + goto err_out; + + spin_lock(&configfs_dirent_lock); + configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); + spin_unlock(&configfs_dirent_lock); + inode_unlock(d_inode(parent)); + return 0; +err_out: inode_unlock(d_inode(parent)); + mutex_lock(&subsys->su_mutex); + unlink_group(group); + mutex_unlock(&subsys->su_mutex); return ret; } EXPORT_SYMBOL(configfs_register_group);
From: Maciej Żenczykowski maze@google.com
[ Upstream commit 689a58605b63173acb0a8cf954af6a8f60440c93 ]
Memory: 509108K/542612K available (3835K kernel code, 919K rwdata, 1028K rodata, 129K init, 211K bss, 33504K reserved, 0K cma-reserved) NR_IRQS: 15 clocksource: timer: mask: 0xffffffffffffffff max_cycles: 0x1cd42e205, max_idle_ns: 881590404426 ns ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/time/clockevents.c:458 clockevents_register_device+0x72/0x140 posix-timer cpumask == cpu_all_mask, using cpu_possible_mask instead Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.1.0-rc4-00048-ged79cc87302b #4 Stack: 604ebda0 603c5370 604ebe20 6046fd17 00000000 6006fcbb 604ebdb0 603c53b5 604ebe10 6003bfc4 604ebdd0 9000001ca Call Trace: [<6006fcbb>] ? printk+0x0/0x94 [<60083160>] ? clockevents_register_device+0x72/0x140 [<6001f16e>] show_stack+0x13b/0x155 [<603c5370>] ? dump_stack_print_info+0xe2/0xeb [<6006fcbb>] ? printk+0x0/0x94 [<603c53b5>] dump_stack+0x2a/0x2c [<6003bfc4>] __warn+0x10e/0x13e [<60070320>] ? vprintk_func+0xc8/0xcf [<60030fd6>] ? block_signals+0x0/0x16 [<6006fcbb>] ? printk+0x0/0x94 [<6003c08b>] warn_slowpath_fmt+0x97/0x99 [<600311a1>] ? set_signals+0x0/0x3f [<6003bff4>] ? warn_slowpath_fmt+0x0/0x99 [<600842cb>] ? tick_oneshot_mode_active+0x44/0x4f [<60030fd6>] ? block_signals+0x0/0x16 [<6006fcbb>] ? printk+0x0/0x94 [<6007d2d5>] ? __clocksource_select+0x20/0x1b1 [<60030fd6>] ? block_signals+0x0/0x16 [<6006fcbb>] ? printk+0x0/0x94 [<60083160>] clockevents_register_device+0x72/0x140 [<60031192>] ? get_signals+0x0/0xf [<60030fd6>] ? block_signals+0x0/0x16 [<6006fcbb>] ? printk+0x0/0x94 [<60002eec>] um_timer_setup+0xc8/0xca [<60001b59>] start_kernel+0x47f/0x57e [<600035bc>] start_kernel_proc+0x49/0x4d [<6006c483>] ? kmsg_dump_register+0x82/0x8a [<6001de62>] new_thread_handler+0x81/0xb2 [<60003571>] ? kmsg_dumper_stdout_init+0x1a/0x1c [<60020c75>] uml_finishsetup+0x54/0x59
random: get_random_bytes called from init_oops_id+0x27/0x34 with crng_init=0 ---[ end trace 00173d0117a88acb ]--- Calibrating delay loop... 6941.90 BogoMIPS (lpj=34709504)
Signed-off-by: Maciej Żenczykowski maze@google.com Cc: Jeff Dike jdike@addtoit.com Cc: Richard Weinberger richard@nod.at Cc: Anton Ivanov anton.ivanov@cambridgegreys.com Cc: linux-um@lists.infradead.org Cc: linux-kernel@vger.kernel.org
Signed-off-by: Richard Weinberger richard@nod.at Signed-off-by: Sasha Levin sashal@kernel.org --- arch/um/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 052de4c8acb2e..0c572a48158e8 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -56,7 +56,7 @@ static int itimer_one_shot(struct clock_event_device *evt) static struct clock_event_device timer_clockevent = { .name = "posix-timer", .rating = 250, - .cpumask = cpu_all_mask, + .cpumask = cpu_possible_mask, .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_state_shutdown = itimer_shutdown,
From: Jisheng Zhang Jisheng.Zhang@synaptics.com
[ Upstream commit 9e2b5de5604a6ff2626c51e77014d92c9299722c ]
If we ever did MSI-related initializations, we need to call dw_pcie_free_msi() in the error code path.
Remove the IS_ENABLED(CONFIG_PCI_MSI) check for MSI init because pci_msi_enabled() already has a stub for !CONFIG_PCI_MSI.
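The reason the IS_ENABLED() test can go away is the usual header pattern: when the option is disabled, the function is provided as a trivial static inline stub, so callers can use it unconditionally. A tiny self-contained illustration of that pattern; MY_FEATURE and feature_enabled() are made-up names, not the PCI API:

#include <stdio.h>

/* Real implementation when the feature is built in, trivial stub otherwise,
 * so call sites need no #ifdef or IS_ENABLED()-style guard. */
#ifdef MY_FEATURE
static int feature_enabled(void) { return 1; }		/* the "real" implementation */
#else
static inline int feature_enabled(void) { return 0; }	/* compile-time stub */
#endif

int main(void)
{
	if (feature_enabled())
		printf("feature path\n");
	else
		printf("stub path, feature compiled out\n");
	return 0;
}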
Signed-off-by: Jisheng Zhang Jisheng.Zhang@synaptics.com Signed-off-by: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Signed-off-by: Bjorn Helgaas bhelgaas@google.com Acked-by: Gustavo Pimentel gustavo.pimentel@synopsys.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/pci/controller/dwc/pcie-designware-host.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index 9c5614f21b8eb..d14a6237bdf5a 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -439,7 +439,7 @@ int dw_pcie_host_init(struct pcie_port *pp) if (ret) pci->num_viewport = 2;
- if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_enabled()) { + if (pci_msi_enabled()) { /* * If a specific SoC driver needs to change the * default number of vectors, it needs to implement @@ -477,7 +477,7 @@ int dw_pcie_host_init(struct pcie_port *pp) if (pp->ops->host_init) { ret = pp->ops->host_init(pp); if (ret) - goto error; + goto err_free_msi; }
pp->root_bus_nr = pp->busn->start; @@ -491,7 +491,7 @@ int dw_pcie_host_init(struct pcie_port *pp)
ret = pci_scan_root_bus_bridge(bridge); if (ret) - goto error; + goto err_free_msi;
bus = bridge->bus;
@@ -507,6 +507,9 @@ int dw_pcie_host_init(struct pcie_port *pp) pci_bus_add_devices(bus); return 0;
+err_free_msi: + if (pci_msi_enabled() && !pp->ops->msi_host_init) + dw_pcie_free_msi(pp); error: pci_free_host_bridge(bridge); return ret;
From: Jisheng Zhang Jisheng.Zhang@synaptics.com
[ Upstream commit dc69a3d567941784c3d00e1d0834582b42b0b3e7 ]
To avoid a memory leak, free the page allocated for MSI IRQ in dw_pcie_free_msi().
Signed-off-by: Jisheng Zhang Jisheng.Zhang@synaptics.com Signed-off-by: Lorenzo Pieralisi lorenzo.pieralisi@arm.com Signed-off-by: Bjorn Helgaas bhelgaas@google.com Acked-by: Gustavo Pimentel gustavo.pimentel@synopsys.com Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/pci/controller/dwc/pcie-designware-host.c | 12 ++++++++---- drivers/pci/controller/dwc/pcie-designware.h | 1 + 2 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index d14a6237bdf5a..86ee6d1888d75 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -303,20 +303,24 @@ void dw_pcie_free_msi(struct pcie_port *pp)
irq_domain_remove(pp->msi_domain); irq_domain_remove(pp->irq_domain); + + if (pp->msi_page) + __free_page(pp->msi_page); }
void dw_pcie_msi_init(struct pcie_port *pp) { struct dw_pcie *pci = to_dw_pcie_from_pp(pp); struct device *dev = pci->dev; - struct page *page; u64 msi_target;
- page = alloc_page(GFP_KERNEL); - pp->msi_data = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); + pp->msi_page = alloc_page(GFP_KERNEL); + pp->msi_data = dma_map_page(dev, pp->msi_page, 0, PAGE_SIZE, + DMA_FROM_DEVICE); if (dma_mapping_error(dev, pp->msi_data)) { dev_err(dev, "Failed to map MSI data\n"); - __free_page(page); + __free_page(pp->msi_page); + pp->msi_page = NULL; return; } msi_target = (u64)pp->msi_data; diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 9943d8c68335d..0cede55a7bcc1 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -176,6 +176,7 @@ struct pcie_port { struct irq_domain *irq_domain; struct irq_domain *msi_domain; dma_addr_t msi_data; + struct page *msi_page; u32 num_vectors; u32 irq_status[MAX_MSI_CTRLS]; raw_spinlock_t lock;
From: Kristian Evensen kristian.evensen@gmail.com
[ Upstream commit f8e608982022fad035160870f5b06086d3cba54d ]
Commit 59c08c69c278 ("netfilter: ctnetlink: Support L3 protocol-filter on flush") introduced a user-space regression when flushing connection tracking entries. Before this commit, the nfgen_family field was not used by the kernel and all entries were removed. Since this commit, nfgen_family is used to filter out entries that should not be removed. One example of a broken tool is conntrack: conntrack always sets nfgen_family to AF_INET, so after 59c08c69c278 only IPv4 entries were removed with the -F parameter.
Pablo Neira Ayuso suggested using nfgenmsg->version to resolve the regression, and this commit implements his suggestion. nfgenmsg->version has so far always been set to zero, so it is well suited as a flag for selecting the old or new flush behavior. If version is 0, nfgen_family is ignored and all entries are flushed. If user space sets the version to one (or any other non-zero value), the new behavior is used. As version can only take two meaningful values here, I chose not to add a new NFNETLINK_VERSION constant.
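On the wire the flag is simply the version byte of the nfnetlink header. A hedged sketch of how a userspace tool could fill struct nfgenmsg to opt into the family-filtered flush; this is only the header, not a complete netlink client, and it assumes the uapi header is installed:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>			/* AF_INET */
#include <linux/netfilter/nfnetlink.h>	/* struct nfgenmsg, NFNETLINK_V0 */

int main(void)
{
	struct nfgenmsg nfh;

	memset(&nfh, 0, sizeof(nfh));
	nfh.nfgen_family = AF_INET;	/* ignored by the kernel if version is 0 */
	nfh.version = NFNETLINK_V0 + 1;	/* non-zero: ask for family filtering */
	nfh.res_id = 0;

	printf("family=%u version=%u, header is %zu bytes\n",
	       nfh.nfgen_family, nfh.version, sizeof(nfh));
	return 0;
}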
Fixes: 59c08c69c278 ("netfilter: ctnetlink: Support L3 protocol-filter on flush") Reported-by: Nicolas Dichtel nicolas.dichtel@6wind.com Suggested-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Kristian Evensen kristian.evensen@gmail.com Tested-by: Nicolas Dichtel nicolas.dichtel@6wind.com Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org --- net/netfilter/nf_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 36619ad8ab8c2..8233dfafb339d 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1254,7 +1254,7 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, struct nf_conntrack_tuple tuple; struct nf_conn *ct; struct nfgenmsg *nfmsg = nlmsg_data(nlh); - u_int8_t u3 = nfmsg->nfgen_family; + u_int8_t u3 = nfmsg->version ? nfmsg->nfgen_family : AF_UNSPEC; struct nf_conntrack_zone zone; int err;
From: Amir Goldstein amir73il@gmail.com
[ Upstream commit d989903058a83e8536cc7aadf9256a47d5c173fe ]
Overlayfs "fake" path is used for stacked file operations on underlying files. Operations on files with "fake" path must not generate fsnotify events with path data, because those events have already been generated at overlayfs layer and because the reported event->fd for fanotify marks on underlying inode/filesystem will have the wrong path (the overlayfs path).
Link: https://lore.kernel.org/linux-fsdevel/20190423065024.12695-1-jencce.kernel@g... Reported-by: Murphy Zhou jencce.kernel@gmail.com Fixes: d1d04ef8572b ("ovl: stack file ops") Signed-off-by: Amir Goldstein amir73il@gmail.com Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org --- fs/overlayfs/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 84dd957efa24a..6f6eb638a320f 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -29,10 +29,11 @@ static struct file *ovl_open_realfile(const struct file *file, struct inode *inode = file_inode(file); struct file *realfile; const struct cred *old_cred; + int flags = file->f_flags | O_NOATIME | FMODE_NONOTIFY;
old_cred = ovl_override_creds(inode->i_sb); - realfile = open_with_fake_path(&file->f_path, file->f_flags | O_NOATIME, - realinode, current_cred()); + realfile = open_with_fake_path(&file->f_path, flags, realinode, + current_cred()); revert_creds(old_cred);
pr_debug("open(%p[%pD2/%c], 0%o) -> (%p, 0%o)\n", @@ -50,7 +51,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags) int err;
/* No atime modificaton on underlying */ - flags |= O_NOATIME; + flags |= O_NOATIME | FMODE_NONOTIFY;
/* If some flag changed that cannot be changed then something's amiss */ if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK))
From: Ludovic Barre ludovic.barre@st.com
[ Upstream commit 8520ce1e17799b220ff421d4f39438c9c572ade3 ]
The IRQ handler, mmci_irq(), loops until all status bits have been cleared. However, the status bit signaling busy, identified by variant->busy_detect_flag, may be set even if busy detection isn't being monitored for the current request.
This may be the case for CMD11 when switching the I/O voltage, which leads to mmci_irq() busy-looping in IRQ context. Fix this problem by clearing the busy status bit before validating the loop condition again. This is safe because busy status detection has already been taken care of by mmci_cmd_irq().
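The loop shape involved is generic for this kind of IRQ handler: read the status, mask out bits that are handled elsewhere, and only keep looping on what remains. A toy userspace sketch with made-up status bits and a fake register, just to show why the unconditional mask terminates the loop:

#include <stdio.h>

#define STATUS_CMD_SENT	0x02
#define STATUS_BUSY	0x04	/* handled by a separate command-IRQ handler */

/* Fake status register: busy is a level signal, the rest clears on read. */
static unsigned int fake_status = STATUS_CMD_SENT | STATUS_BUSY;

static unsigned int read_status(void)
{
	unsigned int s = fake_status;

	fake_status &= STATUS_BUSY;
	return s;
}

int main(void)
{
	unsigned int status;
	int iterations = 0;

	do {
		status = read_status();

		/* Busy is dealt with elsewhere; without this mask the loop
		 * condition below would never become false. */
		status &= ~STATUS_BUSY;

		if (status)
			printf("handling status 0x%x\n", status);
		iterations++;
	} while (status && iterations < 10);

	printf("done after %d iteration(s)\n", iterations);
	return 0;
}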
Signed-off-by: Ludovic Barre ludovic.barre@st.com Signed-off-by: Ulf Hansson ulf.hansson@linaro.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/mmc/host/mmci.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index e352f5ad58018..cb010a83faf4b 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -1534,9 +1534,10 @@ static irqreturn_t mmci_irq(int irq, void *dev_id) }
/* - * Don't poll for busy completion in irq context. + * Busy detection has been handled by mmci_cmd_irq() above. + * Clear the status bit to prevent polling in IRQ context. */ - if (host->variant->busy_detect && host->busy_status) + if (host->variant->busy_detect_flag) status &= ~host->variant->busy_detect_flag;
ret = 1;
From: Taehee Yoo ap420073@gmail.com
[ Upstream commit 43c8f131184faf20c07221f3e09724611c6525d8 ]
rhashtable_insert_fast() may return an error value when memory allocation fails, but flow_offload_add() does not check for errors. This patch just adds the missing error checking.
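The fix makes the insertion all-or-nothing: if adding the reply direction fails, the original direction is removed again so the flow is never left half-registered. A self-contained toy version of that pattern with a fixed-size table standing in for the rhashtable; table_insert() and table_remove() are invented helpers:

#include <errno.h>
#include <stdio.h>

#define TABLE_SIZE 3
static int table[TABLE_SIZE];
static int used;

static int table_insert(int key)
{
	if (used >= TABLE_SIZE)
		return -ENOMEM;		/* stands in for an allocation failure */
	table[used++] = key;
	return 0;
}

static void table_remove(int key)
{
	int i;

	for (i = 0; i < used; i++) {
		if (table[i] == key) {
			table[i] = table[--used];
			return;
		}
	}
}

/* Insert both directions of a flow, or neither. */
static int flow_add(int orig_key, int reply_key)
{
	int err;

	err = table_insert(orig_key);
	if (err < 0)
		return err;

	err = table_insert(reply_key);
	if (err < 0) {
		table_remove(orig_key);	/* roll back the first insertion */
		return err;
	}
	return 0;
}

int main(void)
{
	printf("first flow:   %d\n", flow_add(1, 2));	/* both fit: 0 */
	printf("second flow:  %d\n", flow_add(3, 4));	/* reply fails: -ENOMEM */
	printf("entries left: %d\n", used);		/* still 2, nothing leaked */
	return 0;
}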
Fixes: ac2a66665e23 ("netfilter: add generic flow table infrastructure") Signed-off-by: Taehee Yoo ap420073@gmail.com Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org --- net/netfilter/nf_flow_table_core.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index c0c72ae9df42f..604446c0264cc 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -185,14 +185,25 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = {
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) { - flow->timeout = (u32)jiffies; + int err;
- rhashtable_insert_fast(&flow_table->rhashtable, - &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, - nf_flow_offload_rhash_params); - rhashtable_insert_fast(&flow_table->rhashtable, - &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, - nf_flow_offload_rhash_params); + err = rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[0].node, + nf_flow_offload_rhash_params); + if (err < 0) + return err; + + err = rhashtable_insert_fast(&flow_table->rhashtable, + &flow->tuplehash[1].node, + nf_flow_offload_rhash_params); + if (err < 0) { + rhashtable_remove_fast(&flow_table->rhashtable, + &flow->tuplehash[0].node, + nf_flow_offload_rhash_params); + return err; + } + + flow->timeout = (u32)jiffies; return 0; } EXPORT_SYMBOL_GPL(flow_offload_add);
From: Jakub Jankowski shasta@toxcorp.com
[ Upstream commit f5e85ce8e733c2547827f6268136b70b802eabdb ]
Since commit bc7d811ace4a ("netfilter: nf_ct_h323: Convert CHECK_BOUND macro to function"), NAT traversal for H.323 doesn't work, failing to parse H323-UserInformation. nf_h323_error_boundary() compares the contents of the bitstring, not the addresses, preventing valid H.323 packets from being conntrack'd.
This looks like an oversight from when the CHECK_BOUND macro was converted to a function.
To fix it, stop dereferencing bs->cur and bs->end.
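The one-character nature of the bug is easy to reproduce outside the kernel: with the extra dereferences, the bounds check compares packet bytes instead of addresses, so whether a field "fits" depends on its contents. A minimal standalone demo; the bitstr struct is modeled on the decoder's, and the buffer contents are arbitrary:

#include <stdio.h>

struct bitstr {
	const unsigned char *cur;	/* current decode position */
	const unsigned char *end;	/* first byte past the valid region */
};

/* Broken check: compares the bytes behind the pointers. */
static int boundary_broken(const struct bitstr *bs, size_t bytes)
{
	return *bs->cur + bytes > *bs->end;
}

/* Fixed check: compares the addresses, i.e. the space actually left. */
static int boundary_fixed(const struct bitstr *bs, size_t bytes)
{
	return bs->cur + bytes > bs->end;
}

int main(void)
{
	/* Oversized buffer so the buggy *end read stays inside the array;
	 * the logical "packet" is only the first 8 bytes. */
	unsigned char buf[16] = { 0x90 };
	struct bitstr bs = { .cur = buf, .end = buf + 8 };

	buf[8] = 0x01;	/* byte that happens to follow the packet in memory */

	printf("fixed : 4 bytes out of bounds? %d\n", boundary_fixed(&bs, 4));
	printf("broken: 4 bytes out of bounds? %d\n", boundary_broken(&bs, 4));
	return 0;
}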
Fixes: bc7d811ace4a ("netfilter: nf_ct_h323: Convert CHECK_BOUND macro to function") Signed-off-by: Jakub Jankowski shasta@toxcorp.com Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org --- net/netfilter/nf_conntrack_h323_asn1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nf_conntrack_h323_asn1.c b/net/netfilter/nf_conntrack_h323_asn1.c index 1601275efe2d1..4c2ef42e189cb 100644 --- a/net/netfilter/nf_conntrack_h323_asn1.c +++ b/net/netfilter/nf_conntrack_h323_asn1.c @@ -172,7 +172,7 @@ static int nf_h323_error_boundary(struct bitstr *bs, size_t bytes, size_t bits) if (bits % BITS_PER_BYTE > 0) bytes++;
- if (*bs->cur + bytes > *bs->end) + if (bs->cur + bytes > bs->end) return 1;
return 0;
From: Serge Semin fancer.lancer@gmail.com
[ Upstream commit 93fa5b280761a4dbb14c5330f260380385ab2b49 ]
There are situations when memory regions coming from the dts may be too big for the platform's physical address space; this especially concerns XPA-capable systems. The bootloader may find more than 4GB of memory available and pass it to the kernel via the dts memory node, while the kernel is built without XPA/64BIT support. In this case the region may simply be truncated by the add_memory_region() method or by the u64->phys_addr_t type cast, but in the worst case the method can even drop the memory region entirely if it exceeds PHYS_ADDR_MAX. So let's make sure the memory regions retrieved from the dts are valid, and if some of them aren't, manually truncate them with a warning printed out.
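The two conditions in the diff below guard against a region whose end lies past the address-space limit and against base + size wrapping around. A standalone demonstration of the same checks, using UINT32_MAX as a stand-in for PHYS_ADDR_MAX on a 32-bit build; the example values are arbitrary:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PHYS_MAX ((uint64_t)UINT32_MAX)	/* stand-in for PHYS_ADDR_MAX */

static void add_region(uint64_t base, uint64_t size)
{
	if (base >= PHYS_MAX) {
		printf("skip    : base 0x%" PRIx64 " is beyond the address space\n",
		       base);
		return;
	}
	/* End past the limit, or base + size wrapped around. */
	if (base + size - 1 >= PHYS_MAX || base + size < base) {
		printf("truncate: size 0x%" PRIx64 " @ 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
		       size, base, PHYS_MAX - base);
		size = PHYS_MAX - base;
	}
	printf("add     : base 0x%" PRIx64 " size 0x%" PRIx64 "\n", base, size);
}

int main(void)
{
	add_region(0x80000000ULL, 0x40000000ULL);	/* fits as-is */
	add_region(0x80000000ULL, 0x100000000ULL);	/* end past 4GB: truncated */
	add_region(0x100000000ULL, 0x40000000ULL);	/* base past 4GB: skipped */
	return 0;
}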
Signed-off-by: Serge Semin fancer.lancer@gmail.com Signed-off-by: Paul Burton paul.burton@mips.com Cc: Ralf Baechle ralf@linux-mips.org Cc: James Hogan jhogan@kernel.org Cc: Mike Rapoport rppt@linux.ibm.com Cc: Andrew Morton akpm@linux-foundation.org Cc: Michal Hocko mhocko@suse.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Thomas Bogendoerfer tbogendoerfer@suse.de Cc: Huacai Chen chenhc@lemote.com Cc: Stefan Agner stefan@agner.ch Cc: Stephen Rothwell sfr@canb.auug.org.au Cc: Alexandre Belloni alexandre.belloni@bootlin.com Cc: Juergen Gross jgross@suse.com Cc: Serge Semin Sergey.Semin@t-platforms.ru Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Sasha Levin sashal@kernel.org --- arch/mips/kernel/prom.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c index 93b8e0b4332f7..b9d6c6ec41778 100644 --- a/arch/mips/kernel/prom.c +++ b/arch/mips/kernel/prom.c @@ -41,7 +41,19 @@ char *mips_get_machine_name(void) #ifdef CONFIG_USE_OF void __init early_init_dt_add_memory_arch(u64 base, u64 size) { - return add_memory_region(base, size, BOOT_MEM_RAM); + if (base >= PHYS_ADDR_MAX) { + pr_warn("Trying to add an invalid memory region, skipped\n"); + return; + } + + /* Truncate the passed memory region instead of type casting */ + if (base + size - 1 >= PHYS_ADDR_MAX || base + size < base) { + pr_warn("Truncate memory region %llx @ %llx to size %llx\n", + size, base, PHYS_ADDR_MAX - base); + size = PHYS_ADDR_MAX - base; + } + + add_memory_region(base, size, BOOT_MEM_RAM); }
int __init early_init_dt_reserve_memory_arch(phys_addr_t base,
From: Florian Westphal fw@strlen.de
[ Upstream commit edbd82c5fba009f68d20b5db585be1e667c605f6 ]
The following splat gets triggered when an nfnetlink monitor is running while the xtables-nft selftests are running:
net/netfilter/nf_tables_api.c:1272 suspicious rcu_dereference_check() usage! other info that might help us debug this:
1 lock held by xtables-nft-mul/27006: #0: 00000000e0f85be9 (&net->nft.commit_mutex){+.+.}, at: nf_tables_valid_genid+0x1a/0x50 Call Trace: nf_tables_fill_chain_info.isra.45+0x6cc/0x6e0 nf_tables_chain_notify+0xf8/0x1a0 nf_tables_commit+0x165c/0x1740
nf_tables_fill_chain_info() can be called either from dumps (rcu read locked) or from the transaction path, if a userspace process has subscribed to nftables notifications.
In the 'table dump' case, rcu_access_pointer() cannot be used: we do not hold the transaction mutex, so the pointer can be NULLed right after the check. Just fetch the value unconditionally, then have the helper return immediately if it is NULL.
In the notification case we don't hold the rcu read lock, but updates are prevented by the transaction mutex. Use rcu_dereference_check() to make lockdep aware of this.
Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org --- net/netfilter/nf_tables_api.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 25c2b98b9a960..eb3915b76ff47 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1147,6 +1147,9 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) u64 pkts, bytes; int cpu;
+ if (!stats) + return 0; + memset(&total, 0, sizeof(total)); for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); @@ -1204,6 +1207,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); const struct nf_hook_ops *ops = &basechain->ops; + struct nft_stats __percpu *stats; struct nlattr *nest;
nest = nla_nest_start(skb, NFTA_CHAIN_HOOK); @@ -1225,8 +1229,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) goto nla_put_failure;
- if (rcu_access_pointer(basechain->stats) && - nft_dump_stats(skb, rcu_dereference(basechain->stats))) + stats = rcu_dereference_check(basechain->stats, + lockdep_commit_lock_is_held(net)); + if (nft_dump_stats(skb, stats)) goto nla_put_failure; }
From: Guenter Roeck linux@roeck-us.net
[ Upstream commit f627ac0e12cd2736e60b9f5782ecec1d97251f77 ]
Watchdog pretimeout governors were enabled by the default governor selection using "select". As a result, the default governor was always built into the kernel, even if no watchdog driver was loaded. By using "depends on" instead of "select", we have better control, and the governors can all be built as modules. At the same time, set the default configuration option for pretimeout governors to match WATCHDOG_CORE (meaning all pretimeout governors are enabled by default if pretimeout support is enabled).
The practical impact of this change is minimal. Previously, selecting a default governor automatically enabled that governor. Now, a default governor can only be selected if that governor has been enabled. Consequently, the order of the entries is reversed: the governor selection now comes first, followed by the default governor selection.
Signed-off-by: Guenter Roeck linux@roeck-us.net Signed-off-by: Wim Van Sebroeck wim@linux-watchdog.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/watchdog/Kconfig | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-)
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 57f017d74a976..b65c907fc306a 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -2004,6 +2004,20 @@ config WATCHDOG_PRETIMEOUT_GOV
if WATCHDOG_PRETIMEOUT_GOV
+config WATCHDOG_PRETIMEOUT_GOV_NOOP + tristate "Noop watchdog pretimeout governor" + default WATCHDOG_CORE + help + Noop watchdog pretimeout governor, only an informational + message is added to kernel log buffer. + +config WATCHDOG_PRETIMEOUT_GOV_PANIC + tristate "Panic watchdog pretimeout governor" + default WATCHDOG_CORE + help + Panic watchdog pretimeout governor, on watchdog pretimeout + event put the kernel into panic. + choice prompt "Default Watchdog Pretimeout Governor" default WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC @@ -2014,7 +2028,7 @@ choice
config WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP bool "noop" - select WATCHDOG_PRETIMEOUT_GOV_NOOP + depends on WATCHDOG_PRETIMEOUT_GOV_NOOP help Use noop watchdog pretimeout governor by default. If noop governor is selected by a user, write a short message to @@ -2022,7 +2036,7 @@ config WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP
config WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC bool "panic" - select WATCHDOG_PRETIMEOUT_GOV_PANIC + depends on WATCHDOG_PRETIMEOUT_GOV_PANIC help Use panic watchdog pretimeout governor by default, if a watchdog pretimeout event happens, consider that @@ -2030,18 +2044,6 @@ config WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC
endchoice
-config WATCHDOG_PRETIMEOUT_GOV_NOOP - tristate "Noop watchdog pretimeout governor" - help - Noop watchdog pretimeout governor, only an informational - message is added to kernel log buffer. - -config WATCHDOG_PRETIMEOUT_GOV_PANIC - tristate "Panic watchdog pretimeout governor" - help - Panic watchdog pretimeout governor, on watchdog pretimeout - event put the kernel into panic. - endif # WATCHDOG_PRETIMEOUT_GOV
endif # WATCHDOG
From: Georg Hofmann georg@hofmannsweb.com
[ Upstream commit b07e228eee69601addba98b47b1a3850569e5013 ]
The documented behavior is: if max_hw_heartbeat_ms is implemented, the minimum of the set_timeout argument and max_hw_heartbeat_ms should be used. This patch implements that behavior. Previously only the first 7 bits were used and the input argument was returned.
Signed-off-by: Georg Hofmann georg@hofmannsweb.com Reviewed-by: Guenter Roeck linux@roeck-us.net Signed-off-by: Guenter Roeck linux@roeck-us.net Signed-off-by: Wim Van Sebroeck wim@linux-watchdog.org Signed-off-by: Sasha Levin sashal@kernel.org --- drivers/watchdog/imx2_wdt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c index 2b52514eaa86a..7e7bdcbbc741b 100644 --- a/drivers/watchdog/imx2_wdt.c +++ b/drivers/watchdog/imx2_wdt.c @@ -178,8 +178,10 @@ static void __imx2_wdt_set_timeout(struct watchdog_device *wdog, static int imx2_wdt_set_timeout(struct watchdog_device *wdog, unsigned int new_timeout) { - __imx2_wdt_set_timeout(wdog, new_timeout); + unsigned int actual;
+ actual = min(new_timeout, wdog->max_hw_heartbeat_ms * 1000); + __imx2_wdt_set_timeout(wdog, actual); wdog->timeout = new_timeout; return 0; }