April 2018 - Linux-stable-mirror

patch "usbip: vhci_hcd: check rhport before using in vhci_hub_control()" added to usb-linus

by gregkh＠linuxfoundation.org

This is a note to let you know that I've just added the patch titled usbip: vhci_hcd: check rhport before using in vhci_hub_control() to my usb git tree which can be found at git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git in the usb-linus branch. The patch will show up in the next release of the linux-next tree (usually sometime within the next 24 hours during the week.) The patch will hopefully also be merged in Linus's tree for the next -rc kernel release. If you have any questions about this process, please let me know. >From 5b22f676118ff25049382041da0db8012e57c9e8 Mon Sep 17 00:00:00 2001 From: Shuah Khan <shuahkh(a)osg.samsung.com> Date: Thu, 5 Apr 2018 16:31:49 -0600 Subject: usbip: vhci_hcd: check rhport before using in vhci_hub_control() Validate !rhport < 0 before using it to access port_status array. Signed-off-by: Shuah Khan <shuahkh(a)osg.samsung.com> Cc: stable <stable(a)vger.kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/usb/usbip/vhci_hcd.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index 20e3d4609583..d11f3f8dad40 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -354,6 +354,8 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, usbip_dbg_vhci_rh(" ClearHubFeature\n"); break; case ClearPortFeature: + if (rhport < 0) + goto error; switch (wValue) { case USB_PORT_FEAT_SUSPEND: if (hcd->speed == HCD_USB3) { @@ -511,11 +513,16 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, goto error; } + if (rhport < 0) + goto error; + vhci_hcd->port_status[rhport] |= USB_PORT_STAT_SUSPEND; break; case USB_PORT_FEAT_POWER: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_POWER\n"); + if (rhport < 0) + goto error; if (hcd->speed == HCD_USB3) vhci_hcd->port_status[rhport] |= USB_SS_PORT_STAT_POWER; else @@ -524,6 +531,8 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, case USB_PORT_FEAT_BH_PORT_RESET: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_BH_PORT_RESET\n"); + if (rhport < 0) + goto error; /* Applicable only for USB3.0 hub */ if (hcd->speed != HCD_USB3) { pr_err("USB_PORT_FEAT_BH_PORT_RESET req not " @@ -534,6 +543,8 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, case USB_PORT_FEAT_RESET: usbip_dbg_vhci_rh( " SetPortFeature: USB_PORT_FEAT_RESET\n"); + if (rhport < 0) + goto error; /* if it's already enabled, disable */ if (hcd->speed == HCD_USB3) { vhci_hcd->port_status[rhport] = 0; @@ -554,6 +565,8 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, default: usbip_dbg_vhci_rh(" SetPortFeature: default %d\n", wValue); + if (rhport < 0) + goto error; if (hcd->speed == HCD_USB3) { if ((vhci_hcd->port_status[rhport] & USB_SS_PORT_STAT_POWER) != 0) { -- 2.17.0

7 years, 7 months

1
0
0 0

patch "USB: Increment wakeup count on remote wakeup." added to usb-linus

by gregkh＠linuxfoundation.org

This is a note to let you know that I've just added the patch titled USB: Increment wakeup count on remote wakeup. to my usb git tree which can be found at git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/usb.git in the usb-linus branch. The patch will show up in the next release of the linux-next tree (usually sometime within the next 24 hours during the week.) The patch will hopefully also be merged in Linus's tree for the next -rc kernel release. If you have any questions about this process, please let me know. >From 83a62c51ba7b3c0bf45150c4eac7aefc6c785e94 Mon Sep 17 00:00:00 2001 From: Ravi Chandra Sadineni <ravisadineni(a)chromium.org> Date: Fri, 20 Apr 2018 11:08:21 -0700 Subject: USB: Increment wakeup count on remote wakeup. On chromebooks we depend on wakeup count to identify the wakeup source. But currently USB devices do not increment the wakeup count when they trigger the remote wake. This patch addresses the same. Resume condition is reported differently on USB 2.0 and USB 3.0 devices. On USB 2.0 devices, a wake capable device, if wake enabled, drives resume signal to indicate a remote wake (USB 2.0 spec section 7.1.7.7). The upstream facing port then sets C_PORT_SUSPEND bit and reports a port change event (USB 2.0 spec section 11.24.2.7.2.3). Thus if a port has resumed before driving the resume signal from the host and C_PORT_SUSPEND is set, then the device attached to the given port might be the reason for the last system wakeup. Increment the wakeup count for the same. On USB 3.0 devices, a function may signal that it wants to exit from device suspend by sending a Function Wake Device Notification to the host (USB3.0 spec section 8.5.6.4) Thus on receiving the Function Wake, increment the wakeup count. Signed-off-by: Ravi Chandra Sadineni <ravisadineni(a)chromium.org> Acked-by: Alan Stern <stern(a)rowland.harvard.edu> Cc: stable <stable(a)vger.kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> --- drivers/usb/core/hcd.c | 1 + drivers/usb/core/hub.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 777036ae6367..00bb8417050f 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2377,6 +2377,7 @@ void usb_hcd_resume_root_hub (struct usb_hcd *hcd) spin_lock_irqsave (&hcd_root_hub_lock, flags); if (hcd->rh_registered) { + pm_wakeup_event(&hcd->self.root_hub->dev, 0); set_bit(HCD_FLAG_WAKEUP_PENDING, &hcd->flags); queue_work(pm_wq, &hcd->wakeup_work); } diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index f6ea16e9f6bb..aa9968d90a48 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -653,12 +653,17 @@ void usb_wakeup_notification(struct usb_device *hdev, unsigned int portnum) { struct usb_hub *hub; + struct usb_port *port_dev; if (!hdev) return; hub = usb_hub_to_struct_hub(hdev); if (hub) { + port_dev = hub->ports[portnum - 1]; + if (port_dev && port_dev->child) + pm_wakeup_event(&port_dev->child->dev, 0); + set_bit(portnum, hub->wakeup_bits); kick_hub_wq(hub); } @@ -3434,8 +3439,11 @@ int usb_port_resume(struct usb_device *udev, pm_message_t msg) /* Skip the initial Clear-Suspend step for a remote wakeup */ status = hub_port_status(hub, port1, &portstatus, &portchange); - if (status == 0 && !port_is_suspended(hub, portstatus)) + if (status == 0 && !port_is_suspended(hub, portstatus)) { + if (portchange & USB_PORT_STAT_C_SUSPEND) + pm_wakeup_event(&udev->dev, 0); goto SuspendCleared; + } /* see 7.1.7.7; affects power usage, but not budgeting */ if (hub_is_superspeed(hub->hdev)) -- 2.17.0

7 years, 7 months

1
0
0 0

Backport of 2e898e4c0a38 for 4.4, 4.9, 4.14, and 4.16

by Nathan Chancellor

Hi Greg, As promised, here is the backport of 2e898e4c0a38 ("writeback: safer lock nesting") for 4.4, 4.9, 4.14, and 4.16, all on top of their latest versions upstream. Let me know if there are any issues! Nathan

7 years, 7 months

2
6
0 0

[PATCH 4.4] fanotify: fix logic of events on child

by Nathan Chancellor

From: Amir Goldstein <amir73il(a)gmail.com> commit 54a307ba8d3cd00a3902337ffaae28f436eeb1a4 upstream. When event on child inodes are sent to the parent inode mark and parent inode mark was not marked with FAN_EVENT_ON_CHILD, the event will not be delivered to the listener process. However, if the same process also has a mount mark, the event to the parent inode will be delivered regadless of the mount mark mask. This behavior is incorrect in the case where the mount mark mask does not contain the specific event type. For example, the process adds a mark on a directory with mask FAN_MODIFY (without FAN_EVENT_ON_CHILD) and a mount mark with mask FAN_CLOSE_NOWRITE (without FAN_ONDIR). A modify event on a file inside that directory (and inside that mount) should not create a FAN_MODIFY event, because neither of the marks requested to get that event on the file. Fixes: 1968f5eed54c ("fanotify: use both marks when possible") Cc: stable <stable(a)vger.kernel.org> Signed-off-by: Amir Goldstein <amir73il(a)gmail.com> Signed-off-by: Jan Kara <jack(a)suse.cz> [natechancellor: Fix small conflict due to lack of 3cd5eca8d7a2f] Signed-off-by: Nathan Chancellor <natechancellor(a)gmail.com> --- fs/notify/fanotify/fanotify.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e0e5f7c3c99f..8a459b179183 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -92,7 +92,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, u32 event_mask, void *data, int data_type) { - __u32 marks_mask, marks_ignored_mask; + __u32 marks_mask = 0, marks_ignored_mask = 0; struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" @@ -108,24 +108,20 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, !d_can_lookup(path->dentry)) return false; - if (inode_mark && vfsmnt_mark) { - marks_mask = (vfsmnt_mark->mask | inode_mark->mask); - marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); - } else if (inode_mark) { - /* - * if the event is for a child and this inode doesn't care about - * events on the child, don't send it! - */ - if ((event_mask & FS_EVENT_ON_CHILD) && - !(inode_mark->mask & FS_EVENT_ON_CHILD)) - return false; - marks_mask = inode_mark->mask; - marks_ignored_mask = inode_mark->ignored_mask; - } else if (vfsmnt_mark) { - marks_mask = vfsmnt_mark->mask; - marks_ignored_mask = vfsmnt_mark->ignored_mask; - } else { - BUG(); + /* + * if the event is for a child and this inode doesn't care about + * events on the child, don't send it! + */ + if (inode_mark && + (!(event_mask & FS_EVENT_ON_CHILD) || + (inode_mark->mask & FS_EVENT_ON_CHILD))) { + marks_mask |= inode_mark->mask; + marks_ignored_mask |= inode_mark->ignored_mask; + } + + if (vfsmnt_mark) { + marks_mask |= vfsmnt_mark->mask; + marks_ignored_mask |= vfsmnt_mark->ignored_mask; } if (d_is_dir(path->dentry) && -- 2.17.0

7 years, 7 months

2
1
0 0

[PATCH 4.4] writeback: safer lock nesting

by Nathan Chancellor

From: Greg Thelen <gthelen(a)google.com> commit 2e898e4c0a3897ccd434adac5abb8330194f527b upstream. lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if the page's memcg is undergoing move accounting, which occurs when a process leaves its memcg for a new one that has memory.move_charge_at_immigrate set. unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if the given inode is switching writeback domains. Switches occur when enough writes are issued from a new domain. This existing pattern is thus suspicious: lock_page_memcg(page); unlocked_inode_to_wb_begin(inode, &locked); ... unlocked_inode_to_wb_end(inode, locked); unlock_page_memcg(page); If both inode switch and process memcg migration are both in-flight then unlocked_inode_to_wb_end() will unconditionally enable interrupts while still holding the lock_page_memcg() irq spinlock. This suggests the possibility of deadlock if an interrupt occurs before unlock_page_memcg(). truncate __cancel_dirty_page lock_page_memcg unlocked_inode_to_wb_begin unlocked_inode_to_wb_end <interrupts mistakenly enabled> <interrupt> end_page_writeback test_clear_page_writeback lock_page_memcg <deadlock> unlock_page_memcg Due to configuration limitations this deadlock is not currently possible because we don't mix cgroup writeback (a cgroupv2 feature) and memory.move_charge_at_immigrate (a cgroupv1 feature). If the kernel is hacked to always claim inode switching and memcg moving_account, then this script triggers lockup in less than a minute: cd /mnt/cgroup/memory mkdir a b echo 1 > a/memory.move_charge_at_immigrate echo 1 > b/memory.move_charge_at_immigrate ( echo $BASHPID > a/cgroup.procs while true; do dd if=/dev/zero of=/mnt/big bs=1M count=256 done ) & while true; do sync done & sleep 1h & SLEEP=$! while true; do echo $SLEEP > a/cgroup.procs echo $SLEEP > b/cgroup.procs done The deadlock does not seem possible, so it's debatable if there's any reason to modify the kernel. I suggest we should to prevent future surprises. And Wang Long said "this deadlock occurs three times in our environment", so there's more reason to apply this, even to stable. Stable 4.4 has minor conflicts applying this patch. For a clean 4.4 patch see "[PATCH for-4.4] writeback: safer lock nesting" https://lkml.org/lkml/2018/4/11/146 Wang Long said "this deadlock occurs three times in our environment" [gthelen(a)google.com: v4] Link: http://lkml.kernel.org/r/20180411084653.254724-1-gthelen@google.com [akpm(a)linux-foundation.org: comment tweaks, struct initialization simplification] Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613 Link: http://lkml.kernel.org/r/20180410005908.167976-1-gthelen@google.com Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") Signed-off-by: Greg Thelen <gthelen(a)google.com> Reported-by: Wang Long <wanglong19(a)meituan.com> Acked-by: Wang Long <wanglong19(a)meituan.com> Acked-by: Michal Hocko <mhocko(a)suse.com> Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org> Cc: Johannes Weiner <hannes(a)cmpxchg.org> Cc: Tejun Heo <tj(a)kernel.org> Cc: Nicholas Piggin <npiggin(a)gmail.com> Cc: <stable(a)vger.kernel.org> [v4.2+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org> [natechancellor: Applied to 4.4 based on Greg's backport on lkml.org] Signed-off-by: Nathan Chancellor <natechancellor(a)gmail.com> --- fs/fs-writeback.c | 7 ++++--- include/linux/backing-dev-defs.h | 5 +++++ include/linux/backing-dev.h | 31 +++++++++++++++++-------------- mm/page-writeback.c | 18 +++++++++--------- 4 files changed, 35 insertions(+), 26 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 22b30249fbcb..0fe667875852 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -747,11 +747,12 @@ int inode_congested(struct inode *inode, int cong_bits) */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; - bool locked, congested; + struct wb_lock_cookie lock_cookie = {}; + bool congested; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 140c29635069..a307c37c2e6c 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -191,6 +191,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +struct wb_lock_cookie { + bool locked; + unsigned long flags; +}; + #ifdef CONFIG_CGROUP_WRITEBACK /** diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 89d3de3e096b..361274ce5815 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -366,7 +366,7 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode - * @lockedp: temp bool output param, to be passed to the end function + * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This @@ -374,12 +374,12 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * - * The caller must call unlocked_inode_to_wb_end() with *@lockdep - * afterwards and can't sleep during transaction. IRQ may or may not be - * disabled on return. + * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and + * can't sleep during the transaction. IRQs may or may not be disabled on + * return. */ static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); @@ -387,10 +387,10 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) * Paired with store_release in inode_switch_wb_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; - if (unlikely(*lockedp)) - spin_lock_irq(&inode->i_mapping->tree_lock); + if (unlikely(cookie->locked)) + spin_lock_irqsave(&inode->i_mapping->tree_lock, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock. @@ -402,12 +402,14 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode - * @locked: *@lockedp from unlocked_inode_to_wb_begin() + * @cookie: @cookie from unlocked_inode_to_wb_begin() */ -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { - if (unlikely(locked)) - spin_unlock_irq(&inode->i_mapping->tree_lock); + if (unlikely(cookie->locked)) + spin_unlock_irqrestore(&inode->i_mapping->tree_lock, + cookie->flags); rcu_read_unlock(); } @@ -454,12 +456,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) } static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6d0dbde4503b..3309dbda7ffa 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2510,13 +2510,13 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); } } EXPORT_SYMBOL(account_page_redirty); @@ -2622,15 +2622,15 @@ void cancel_dirty_page(struct page *page) struct inode *inode = mapping->host; struct bdi_writeback *wb; struct mem_cgroup *memcg; - bool locked; + struct wb_lock_cookie cookie = {}; memcg = mem_cgroup_begin_page_stat(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, memcg, wb); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); mem_cgroup_end_page_stat(memcg); } else { ClearPageDirty(page); @@ -2663,7 +2663,7 @@ int clear_page_dirty_for_io(struct page *page) struct inode *inode = mapping->host; struct bdi_writeback *wb; struct mem_cgroup *memcg; - bool locked; + struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. @@ -2701,14 +2701,14 @@ int clear_page_dirty_for_io(struct page *page) * exclusion. */ memcg = mem_cgroup_begin_page_stat(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); mem_cgroup_end_page_stat(memcg); return ret; } -- 2.17.0

7 years, 7 months

2
2
0 0

[PATCH 4.4] ext4: bugfix for mmaped pages in mpage_release_unused_pages()

by Nathan Chancellor

From: wangguang <wang.guang55(a)zte.com.cn> commit 4e800c0359d9a53e6bf0ab216954971b2515247f upstream. Pages clear buffers after ext4 delayed block allocation failed, However, it does not clean its pte_dirty flag. if the pages unmap ,in cording to the pte_dirty , unmap_page_range may try to call __set_page_dirty, which may lead to the bugon at mpage_prepare_extent_to_map:head = page_buffers(page);. This patch just call clear_page_dirty_for_io to clean pte_dirty at mpage_release_unused_pages for pages mmaped. Steps to reproduce the bug: （1） mmap a file in ext4 addr = (char *)mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); memset(addr, 'i', 4096); （2） return EIO at ext4_writepages->mpage_map_and_submit_extent->mpage_map_one_extent which causes this log message to be print: ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); (3）Unmap the addr cause warning at __set_page_dirty:WARN_ON_ONCE(warn && !PageUptodate(page)); (4) wait for a minute,then bugon happen. Cc: stable(a)vger.kernel.org Signed-off-by: wangguang <wangguang03(a)zte.com> Signed-off-by: Theodore Ts'o <tytso(a)mit.edu> [@nathanchance: Resolved conflict from lack of 09cbfeaf1a5a6] Signed-off-by: Nathan Chancellor <natechancellor(a)gmail.com> --- One more bug fix I came across, sorry I forgot to send it with the other two patches! --- fs/ext4/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f0cabc8c96cb..3bddd47660d8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1515,6 +1515,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { + if (page_mapped(page)) + clear_page_dirty_for_io(page); block_invalidatepage(page, 0, PAGE_CACHE_SIZE); ClearPageUptodate(page); } -- 2.17.0

7 years, 7 months

2
1
0 0

[PATCH v4.14] net: dsa: Discard frames from unused ports

by Andrew Lunn

[ Upstream commit fc5f33768cca7144f8d793205b229d46740d183b ] The Marvell switches under some conditions will pass a frame to the host with the port being the CPU port. Such frames are invalid, and should be dropped. Not dropping them can result in a crash when incrementing the receive statistics for an invalid port. This has been reworked for 4.14, which does not have the central dsa_master_find_slave() function, so each tag driver needs to check. Reported-by: Chris Healy <cphealy(a)gmail.com> Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol support") Signed-off-by: Andrew Lunn <andrew(a)lunn.ch> Reviewed-by: Florian Fainelli <f.fainelli(a)gmail.com> Signed-off-by: David S. Miller <davem(a)davemloft.net> --- net/dsa/tag_brcm.c | 3 +++ net/dsa/tag_dsa.c | 3 +++ net/dsa/tag_edsa.c | 3 +++ net/dsa/tag_ksz.c | 3 +++ net/dsa/tag_lan9303.c | 3 +++ net/dsa/tag_mtk.c | 3 +++ net/dsa/tag_qca.c | 3 +++ net/dsa/tag_trailer.c | 3 +++ 8 files changed, 24 insertions(+) diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index dbb016434ace..de92fc1fc3be 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -121,6 +121,9 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) return NULL; + if (unlikely(ds->cpu_port_mask & BIT(source_port))) + return NULL; + /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index fbf9ca954773..b3008a9bacf3 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -107,6 +107,9 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) return NULL; + if (unlikely(ds->cpu_port_mask & BIT(source_port))) + return NULL; + /* * Convert the DSA header to an 802.1q header if the 'tagged' * bit in the DSA header is set. If the 'tagged' bit is clear, diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 76367ba1b2e2..c86b6d90576d 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -120,6 +120,9 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) return NULL; + if (unlikely(ds->cpu_port_mask & BIT(source_port))) + return NULL; + /* * If the 'tagged' bit is set, convert the DSA tag to a 802.1q * tag and delete the ethertype part. If the 'tagged' bit is diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 010ca0a336c4..6c894692b9cd 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -92,6 +92,9 @@ static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev, if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) return NULL; + if (unlikely(ds->cpu_port_mask & BIT(source_port))) + return NULL; + pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN); skb->dev = ds->ports[source_port].netdev; diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 0b9826105e42..2d1603009e16 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -108,6 +108,9 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; } + if (unlikely(ds->cpu_port_mask & BIT(source_port))) + return NULL; + if (!ds->ports[source_port].netdev) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid netdev or device\n"); return NULL; diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index ec8ee5f43255..5c471854412d 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -81,6 +81,9 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (!ds->ports[port].netdev) return NULL; + if (unlikely(ds->cpu_port_mask & BIT(port))) + return NULL; + skb->dev = ds->ports[port].netdev; return skb; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 1d4c70711c0f..b8c05f1cf47d 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -104,6 +104,9 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (!ds->ports[port].netdev) return NULL; + if (unlikely(ds->cpu_port_mask & BIT(port))) + return NULL; + /* Update skb & forward the frame accordingly */ skb->dev = ds->ports[port].netdev; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d2fd4923aa3e..fcc9aa72877d 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -76,6 +76,9 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, if (source_port >= ds->num_ports || !ds->ports[source_port].netdev) return NULL; + if (unlikely(ds->cpu_port_mask & BIT(source_port))) + return NULL; + pskb_trim_rcsum(skb, skb->len - 4); skb->dev = ds->ports[source_port].netdev; -- 2.17.0

7 years, 7 months

3
2
0 0

FAILED: patch "[PATCH] writeback: safer lock nesting" failed to apply to 4.4-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 4.4-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ >From 2e898e4c0a3897ccd434adac5abb8330194f527b Mon Sep 17 00:00:00 2001 From: Greg Thelen <gthelen(a)google.com> Date: Fri, 20 Apr 2018 14:55:42 -0700 Subject: [PATCH] writeback: safer lock nesting lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if the page's memcg is undergoing move accounting, which occurs when a process leaves its memcg for a new one that has memory.move_charge_at_immigrate set. unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if the given inode is switching writeback domains. Switches occur when enough writes are issued from a new domain. This existing pattern is thus suspicious: lock_page_memcg(page); unlocked_inode_to_wb_begin(inode, &locked); ... unlocked_inode_to_wb_end(inode, locked); unlock_page_memcg(page); If both inode switch and process memcg migration are both in-flight then unlocked_inode_to_wb_end() will unconditionally enable interrupts while still holding the lock_page_memcg() irq spinlock. This suggests the possibility of deadlock if an interrupt occurs before unlock_page_memcg(). truncate __cancel_dirty_page lock_page_memcg unlocked_inode_to_wb_begin unlocked_inode_to_wb_end <interrupts mistakenly enabled> <interrupt> end_page_writeback test_clear_page_writeback lock_page_memcg <deadlock> unlock_page_memcg Due to configuration limitations this deadlock is not currently possible because we don't mix cgroup writeback (a cgroupv2 feature) and memory.move_charge_at_immigrate (a cgroupv1 feature). If the kernel is hacked to always claim inode switching and memcg moving_account, then this script triggers lockup in less than a minute: cd /mnt/cgroup/memory mkdir a b echo 1 > a/memory.move_charge_at_immigrate echo 1 > b/memory.move_charge_at_immigrate ( echo $BASHPID > a/cgroup.procs while true; do dd if=/dev/zero of=/mnt/big bs=1M count=256 done ) & while true; do sync done & sleep 1h & SLEEP=$! while true; do echo $SLEEP > a/cgroup.procs echo $SLEEP > b/cgroup.procs done The deadlock does not seem possible, so it's debatable if there's any reason to modify the kernel. I suggest we should to prevent future surprises. And Wang Long said "this deadlock occurs three times in our environment", so there's more reason to apply this, even to stable. Stable 4.4 has minor conflicts applying this patch. For a clean 4.4 patch see "[PATCH for-4.4] writeback: safer lock nesting" https://lkml.org/lkml/2018/4/11/146 Wang Long said "this deadlock occurs three times in our environment" [gthelen(a)google.com: v4] Link: http://lkml.kernel.org/r/20180411084653.254724-1-gthelen@google.com [akpm(a)linux-foundation.org: comment tweaks, struct initialization simplification] Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613 Link: http://lkml.kernel.org/r/20180410005908.167976-1-gthelen@google.com Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") Signed-off-by: Greg Thelen <gthelen(a)google.com> Reported-by: Wang Long <wanglong19(a)meituan.com> Acked-by: Wang Long <wanglong19(a)meituan.com> Acked-by: Michal Hocko <mhocko(a)suse.com> Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org> Cc: Johannes Weiner <hannes(a)cmpxchg.org> Cc: Tejun Heo <tj(a)kernel.org> Cc: Nicholas Piggin <npiggin(a)gmail.com> Cc: <stable(a)vger.kernel.org> [v4.2+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b12ba70a895..47d7c151fcba 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits) */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; - bool locked, congested; + struct wb_lock_cookie lock_cookie = {}; + bool congested; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index bfe86b54f6c1..0bd432a4d7bd 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -223,6 +223,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +struct wb_lock_cookie { + bool locked; + unsigned long flags; +}; + #ifdef CONFIG_CGROUP_WRITEBACK /** diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f6be4b0b6c18..72ca0f3d39f3 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -347,7 +347,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode - * @lockedp: temp bool output param, to be passed to the end function + * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, the i_pages lock or wb->list_lock. This @@ -355,12 +355,12 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * - * The caller must call unlocked_inode_to_wb_end() with *@lockdep - * afterwards and can't sleep during transaction. IRQ may or may not be - * disabled on return. + * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and + * can't sleep during the transaction. IRQs may or may not be disabled on + * return. */ static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); @@ -368,10 +368,10 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) * Paired with store_release in inode_switch_wb_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; - if (unlikely(*lockedp)) - xa_lock_irq(&inode->i_mapping->i_pages); + if (unlikely(cookie->locked)) + xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages @@ -383,12 +383,13 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode - * @locked: *@lockedp from unlocked_inode_to_wb_begin() + * @cookie: @cookie from unlocked_inode_to_wb_begin() */ -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { - if (unlikely(locked)) - xa_unlock_irq(&inode->i_mapping->i_pages); + if (unlikely(cookie->locked)) + xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); rcu_read_unlock(); } @@ -435,12 +436,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) } static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5c1a3279e63f..337c6afb3345 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2502,13 +2502,13 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_node_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); } } EXPORT_SYMBOL(account_page_redirty); @@ -2614,15 +2614,15 @@ void __cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; lock_page_memcg(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, wb); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); unlock_page_memcg(page); } else { ClearPageDirty(page); @@ -2654,7 +2654,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. @@ -2691,14 +2691,14 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) { dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); return ret; } return TestClearPageDirty(page);

7 years, 7 months

1
0
0 0

FAILED: patch "[PATCH] writeback: safer lock nesting" failed to apply to 4.9-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 4.9-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ >From 2e898e4c0a3897ccd434adac5abb8330194f527b Mon Sep 17 00:00:00 2001 From: Greg Thelen <gthelen(a)google.com> Date: Fri, 20 Apr 2018 14:55:42 -0700 Subject: [PATCH] writeback: safer lock nesting lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if the page's memcg is undergoing move accounting, which occurs when a process leaves its memcg for a new one that has memory.move_charge_at_immigrate set. unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if the given inode is switching writeback domains. Switches occur when enough writes are issued from a new domain. This existing pattern is thus suspicious: lock_page_memcg(page); unlocked_inode_to_wb_begin(inode, &locked); ... unlocked_inode_to_wb_end(inode, locked); unlock_page_memcg(page); If both inode switch and process memcg migration are both in-flight then unlocked_inode_to_wb_end() will unconditionally enable interrupts while still holding the lock_page_memcg() irq spinlock. This suggests the possibility of deadlock if an interrupt occurs before unlock_page_memcg(). truncate __cancel_dirty_page lock_page_memcg unlocked_inode_to_wb_begin unlocked_inode_to_wb_end <interrupts mistakenly enabled> <interrupt> end_page_writeback test_clear_page_writeback lock_page_memcg <deadlock> unlock_page_memcg Due to configuration limitations this deadlock is not currently possible because we don't mix cgroup writeback (a cgroupv2 feature) and memory.move_charge_at_immigrate (a cgroupv1 feature). If the kernel is hacked to always claim inode switching and memcg moving_account, then this script triggers lockup in less than a minute: cd /mnt/cgroup/memory mkdir a b echo 1 > a/memory.move_charge_at_immigrate echo 1 > b/memory.move_charge_at_immigrate ( echo $BASHPID > a/cgroup.procs while true; do dd if=/dev/zero of=/mnt/big bs=1M count=256 done ) & while true; do sync done & sleep 1h & SLEEP=$! while true; do echo $SLEEP > a/cgroup.procs echo $SLEEP > b/cgroup.procs done The deadlock does not seem possible, so it's debatable if there's any reason to modify the kernel. I suggest we should to prevent future surprises. And Wang Long said "this deadlock occurs three times in our environment", so there's more reason to apply this, even to stable. Stable 4.4 has minor conflicts applying this patch. For a clean 4.4 patch see "[PATCH for-4.4] writeback: safer lock nesting" https://lkml.org/lkml/2018/4/11/146 Wang Long said "this deadlock occurs three times in our environment" [gthelen(a)google.com: v4] Link: http://lkml.kernel.org/r/20180411084653.254724-1-gthelen@google.com [akpm(a)linux-foundation.org: comment tweaks, struct initialization simplification] Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613 Link: http://lkml.kernel.org/r/20180410005908.167976-1-gthelen@google.com Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") Signed-off-by: Greg Thelen <gthelen(a)google.com> Reported-by: Wang Long <wanglong19(a)meituan.com> Acked-by: Wang Long <wanglong19(a)meituan.com> Acked-by: Michal Hocko <mhocko(a)suse.com> Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org> Cc: Johannes Weiner <hannes(a)cmpxchg.org> Cc: Tejun Heo <tj(a)kernel.org> Cc: Nicholas Piggin <npiggin(a)gmail.com> Cc: <stable(a)vger.kernel.org> [v4.2+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b12ba70a895..47d7c151fcba 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits) */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; - bool locked, congested; + struct wb_lock_cookie lock_cookie = {}; + bool congested; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index bfe86b54f6c1..0bd432a4d7bd 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -223,6 +223,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +struct wb_lock_cookie { + bool locked; + unsigned long flags; +}; + #ifdef CONFIG_CGROUP_WRITEBACK /** diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f6be4b0b6c18..72ca0f3d39f3 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -347,7 +347,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode - * @lockedp: temp bool output param, to be passed to the end function + * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, the i_pages lock or wb->list_lock. This @@ -355,12 +355,12 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * - * The caller must call unlocked_inode_to_wb_end() with *@lockdep - * afterwards and can't sleep during transaction. IRQ may or may not be - * disabled on return. + * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and + * can't sleep during the transaction. IRQs may or may not be disabled on + * return. */ static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); @@ -368,10 +368,10 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) * Paired with store_release in inode_switch_wb_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; - if (unlikely(*lockedp)) - xa_lock_irq(&inode->i_mapping->i_pages); + if (unlikely(cookie->locked)) + xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages @@ -383,12 +383,13 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode - * @locked: *@lockedp from unlocked_inode_to_wb_begin() + * @cookie: @cookie from unlocked_inode_to_wb_begin() */ -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { - if (unlikely(locked)) - xa_unlock_irq(&inode->i_mapping->i_pages); + if (unlikely(cookie->locked)) + xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); rcu_read_unlock(); } @@ -435,12 +436,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) } static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5c1a3279e63f..337c6afb3345 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2502,13 +2502,13 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_node_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); } } EXPORT_SYMBOL(account_page_redirty); @@ -2614,15 +2614,15 @@ void __cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; lock_page_memcg(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, wb); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); unlock_page_memcg(page); } else { ClearPageDirty(page); @@ -2654,7 +2654,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. @@ -2691,14 +2691,14 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) { dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); return ret; } return TestClearPageDirty(page);

7 years, 7 months

1
0
0 0

FAILED: patch "[PATCH] writeback: safer lock nesting" failed to apply to 4.14-stable tree

by gregkh＠linuxfoundation.org

The patch below does not apply to the 4.14-stable tree. If someone wants it applied there, or to any other stable or longterm tree, then please email the backport, including the original git commit id to <stable(a)vger.kernel.org>. thanks, greg k-h ------------------ original commit in Linus's tree ------------------ >From 2e898e4c0a3897ccd434adac5abb8330194f527b Mon Sep 17 00:00:00 2001 From: Greg Thelen <gthelen(a)google.com> Date: Fri, 20 Apr 2018 14:55:42 -0700 Subject: [PATCH] writeback: safer lock nesting lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if the page's memcg is undergoing move accounting, which occurs when a process leaves its memcg for a new one that has memory.move_charge_at_immigrate set. unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if the given inode is switching writeback domains. Switches occur when enough writes are issued from a new domain. This existing pattern is thus suspicious: lock_page_memcg(page); unlocked_inode_to_wb_begin(inode, &locked); ... unlocked_inode_to_wb_end(inode, locked); unlock_page_memcg(page); If both inode switch and process memcg migration are both in-flight then unlocked_inode_to_wb_end() will unconditionally enable interrupts while still holding the lock_page_memcg() irq spinlock. This suggests the possibility of deadlock if an interrupt occurs before unlock_page_memcg(). truncate __cancel_dirty_page lock_page_memcg unlocked_inode_to_wb_begin unlocked_inode_to_wb_end <interrupts mistakenly enabled> <interrupt> end_page_writeback test_clear_page_writeback lock_page_memcg <deadlock> unlock_page_memcg Due to configuration limitations this deadlock is not currently possible because we don't mix cgroup writeback (a cgroupv2 feature) and memory.move_charge_at_immigrate (a cgroupv1 feature). If the kernel is hacked to always claim inode switching and memcg moving_account, then this script triggers lockup in less than a minute: cd /mnt/cgroup/memory mkdir a b echo 1 > a/memory.move_charge_at_immigrate echo 1 > b/memory.move_charge_at_immigrate ( echo $BASHPID > a/cgroup.procs while true; do dd if=/dev/zero of=/mnt/big bs=1M count=256 done ) & while true; do sync done & sleep 1h & SLEEP=$! while true; do echo $SLEEP > a/cgroup.procs echo $SLEEP > b/cgroup.procs done The deadlock does not seem possible, so it's debatable if there's any reason to modify the kernel. I suggest we should to prevent future surprises. And Wang Long said "this deadlock occurs three times in our environment", so there's more reason to apply this, even to stable. Stable 4.4 has minor conflicts applying this patch. For a clean 4.4 patch see "[PATCH for-4.4] writeback: safer lock nesting" https://lkml.org/lkml/2018/4/11/146 Wang Long said "this deadlock occurs three times in our environment" [gthelen(a)google.com: v4] Link: http://lkml.kernel.org/r/20180411084653.254724-1-gthelen@google.com [akpm(a)linux-foundation.org: comment tweaks, struct initialization simplification] Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613 Link: http://lkml.kernel.org/r/20180410005908.167976-1-gthelen@google.com Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates") Signed-off-by: Greg Thelen <gthelen(a)google.com> Reported-by: Wang Long <wanglong19(a)meituan.com> Acked-by: Wang Long <wanglong19(a)meituan.com> Acked-by: Michal Hocko <mhocko(a)suse.com> Reviewed-by: Andrew Morton <akpm(a)linux-foundation.org> Cc: Johannes Weiner <hannes(a)cmpxchg.org> Cc: Tejun Heo <tj(a)kernel.org> Cc: Nicholas Piggin <npiggin(a)gmail.com> Cc: <stable(a)vger.kernel.org> [v4.2+] Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org> diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4b12ba70a895..47d7c151fcba 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -745,11 +745,12 @@ int inode_congested(struct inode *inode, int cong_bits) */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; - bool locked, congested; + struct wb_lock_cookie lock_cookie = {}; + bool congested; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index bfe86b54f6c1..0bd432a4d7bd 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -223,6 +223,11 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +struct wb_lock_cookie { + bool locked; + unsigned long flags; +}; + #ifdef CONFIG_CGROUP_WRITEBACK /** diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f6be4b0b6c18..72ca0f3d39f3 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -347,7 +347,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode - * @lockedp: temp bool output param, to be passed to the end function + * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, the i_pages lock or wb->list_lock. This @@ -355,12 +355,12 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * - * The caller must call unlocked_inode_to_wb_end() with *@lockdep - * afterwards and can't sleep during transaction. IRQ may or may not be - * disabled on return. + * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and + * can't sleep during the transaction. IRQs may or may not be disabled on + * return. */ static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); @@ -368,10 +368,10 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) * Paired with store_release in inode_switch_wb_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; - if (unlikely(*lockedp)) - xa_lock_irq(&inode->i_mapping->i_pages); + if (unlikely(cookie->locked)) + xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages @@ -383,12 +383,13 @@ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode - * @locked: *@lockedp from unlocked_inode_to_wb_begin() + * @cookie: @cookie from unlocked_inode_to_wb_begin() */ -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { - if (unlikely(locked)) - xa_unlock_irq(&inode->i_mapping->i_pages); + if (unlikely(cookie->locked)) + xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); rcu_read_unlock(); } @@ -435,12 +436,13 @@ static inline struct bdi_writeback *inode_to_wb(struct inode *inode) } static inline struct bdi_writeback * -unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp) +unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } -static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) +static inline void unlocked_inode_to_wb_end(struct inode *inode, + struct wb_lock_cookie *cookie) { } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5c1a3279e63f..337c6afb3345 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2502,13 +2502,13 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); current->nr_dirtied--; dec_node_page_state(page, NR_DIRTIED); dec_wb_stat(wb, WB_DIRTIED); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); } } EXPORT_SYMBOL(account_page_redirty); @@ -2614,15 +2614,15 @@ void __cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; lock_page_memcg(page); - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) account_page_cleaned(page, mapping, wb); - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); unlock_page_memcg(page); } else { ClearPageDirty(page); @@ -2654,7 +2654,7 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - bool locked; + struct wb_lock_cookie cookie = {}; /* * Yes, Virginia, this is indeed insane. @@ -2691,14 +2691,14 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - wb = unlocked_inode_to_wb_begin(inode, &locked); + wb = unlocked_inode_to_wb_begin(inode, &cookie); if (TestClearPageDirty(page)) { dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } - unlocked_inode_to_wb_end(inode, locked); + unlocked_inode_to_wb_end(inode, &cookie); return ret; } return TestClearPageDirty(page);

7 years, 7 months

1
0
0 0

2025

2024

2023

2022

2021

2020

2019

2018

2017

Linux-stable-mirror April 2018