In the CQE request completion path, the request type is checked after
the request has already been completed. This returns the wrong request
type: the ASYNC issue type is reported for DCMD requests. Because of
this mismatch, the mq->cqe_busy flag is never cleared and the driver
never invokes blk_mq_run_hw_queue(), so requests are no longer
dispatched to the LLD from the block layer. All of this eventually
results in an I/O hang.
So, get the request type before completing the request.
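For illustration, an abridged sketch of the corrected ordering
(simplified and not compilable as-is; only the lines relevant to the
race are shown):

static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
{
	/*
	 * Read the issue type while the request is still valid. Once
	 * the request has been completed it may be recycled, and
	 * mmc_issue_type() can then misreport a DCMD request as ASYNC.
	 */
	enum mmc_issue_type issue_type = mmc_issue_type(mq, req);

	/* ... error handling and blk_mq_end_request() ... */

	spin_lock_irqsave(&mq->lock, flags);
	/* Decrement the in-flight counter for the saved type. */
	mq->in_flight[issue_type] -= 1;
	put_card = (mmc_tot_in_flight(mq) == 0);
	spin_unlock_irqrestore(&mq->lock, flags);
	/* ... */
}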
Cc: <stable(a)vger.kernel.org> # v4.19+
Signed-off-by: Veerabhadrarao Badiganti <vbadigan(a)codeaurora.org>
---
drivers/mmc/core/block.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 8499b56..c5367e2 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -1370,6 +1370,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
struct mmc_request *mrq = &mqrq->brq.mrq;
struct request_queue *q = req->q;
struct mmc_host *host = mq->card->host;
+ enum mmc_issue_type issue_type = mmc_issue_type(mq, req);
unsigned long flags;
bool put_card;
int err;
@@ -1399,7 +1400,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
spin_lock_irqsave(&mq->lock, flags);
- mq->in_flight[mmc_issue_type(mq, req)] -= 1;
+ mq->in_flight[issue_type] -= 1;
put_card = (mmc_tot_in_flight(mq) == 0);
--
Hi,
I believe some patches are needed to fix build issues on Hexagon:
ac32292c8552f7e8517be184e65dd09786e991f9 hexagon: clean up ioremap
7312b70699252074d753c5005fc67266c547bbe3 hexagon: define ioremap_uc
The same applies to stable v5.4.
Best,
Tuowen
The following race occurs while accessing the dmabuf object exported as
file:
P1                              P2
dma_buf_release()               dmabuffs_dname()
                                [say lsof reading /proc/<P1 pid>/fd/<num>]
                                read dmabuf stored in dentry->d_fsdata
Free the dmabuf object
                                Start accessing the dmabuf structure
In the above sequence, the dmabuf object freed in P1 is accessed from
P2, resulting in a use-after-free. The reported stack dump is included
below.
We read the dmabuf object stored in dentry->d_fsdata, but there is no
binding between the dentry and the dmabuf, which means the dmabuf can
be freed while it is being read from ->d_fsdata and is still in use.
Review of patch v1 concluded that protecting the in-use dmabuf with an
extra refcount is not a viable solution, as the exported dmabuf is
already under the file's refcount and keeping multiple refcounts on
the same object coordinated is not possible.
Since the dmabuf stored in ->d_fsdata is only read to retrieve the
user-supplied name, store the name itself in ->d_fsdata instead; this
avoids reading the dmabuf altogether.
Call Trace:
kasan_report+0x12/0x20
__asan_report_load8_noabort+0x14/0x20
dmabuffs_dname+0x4f4/0x560
tomoyo_realpath_from_path+0x165/0x660
tomoyo_get_realpath
tomoyo_check_open_permission+0x2a3/0x3e0
tomoyo_file_open
tomoyo_file_open+0xa9/0xd0
security_file_open+0x71/0x300
do_dentry_open+0x37a/0x1380
vfs_open+0xa0/0xd0
path_openat+0x12ee/0x3490
do_filp_open+0x192/0x260
do_sys_openat2+0x5eb/0x7e0
do_sys_open+0xf2/0x180
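A condensed view of the resulting ->d_fsdata lifecycle, abridged from
the diff below (error paths omitted):

/* dma_buf_set_name(): publish the user-supplied name, not the dmabuf */
dmabuf->file->f_path.dentry->d_fsdata = name;

/* dmabuffs_dname(): read the name under d_lock */
spin_lock(&dentry->d_lock);
if (dentry->d_fsdata)
	ret = strlcpy(name, dentry->d_fsdata, DMA_BUF_NAME_LEN);
spin_unlock(&dentry->d_lock);

/* dma_buf_release(): clear the pointer under the same lock, so a
 * concurrent dmabuffs_dname() sees either the name or NULL, never
 * a dangling pointer */
spin_lock(&dentry->d_lock);
dentry->d_fsdata = NULL;
spin_unlock(&dentry->d_lock);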
Fixes: bb2bb9030425 ("dma-buf: add DMA_BUF_SET_NAME ioctls")
Reported-by: syzbot+3643a18836bce555bff6(a)syzkaller.appspotmail.com
Cc: <stable(a)vger.kernel.org> [5.3+]
Signed-off-by: Charan Teja Reddy <charante(a)codeaurora.org>
---
Changes in v2:
- Pass the user passed name in ->d_fsdata instead of dmabuf
- Improve the commit message
Changes in v1: (https://patchwork.kernel.org/patch/11514063/)
drivers/dma-buf/dma-buf.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 01ce125..0071f7d 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -25,6 +25,7 @@
#include <linux/mm.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
+#include <linux/dcache.h>
#include <uapi/linux/dma-buf.h>
#include <uapi/linux/magic.h>
@@ -40,15 +41,13 @@ struct dma_buf_list {
static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
{
- struct dma_buf *dmabuf;
char name[DMA_BUF_NAME_LEN];
size_t ret = 0;
- dmabuf = dentry->d_fsdata;
- dma_resv_lock(dmabuf->resv, NULL);
- if (dmabuf->name)
- ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN);
- dma_resv_unlock(dmabuf->resv);
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_fsdata)
+ ret = strlcpy(name, dentry->d_fsdata, DMA_BUF_NAME_LEN);
+ spin_unlock(&dentry->d_lock);
return dynamic_dname(dentry, buffer, buflen, "/%s:%s",
dentry->d_name.name, ret > 0 ? name : "");
@@ -80,12 +79,16 @@ static int dma_buf_fs_init_context(struct fs_context *fc)
static int dma_buf_release(struct inode *inode, struct file *file)
{
struct dma_buf *dmabuf;
+ struct dentry *dentry = file->f_path.dentry;
if (!is_dma_buf_file(file))
return -EINVAL;
dmabuf = file->private_data;
+ spin_lock(&dentry->d_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock);
BUG_ON(dmabuf->vmapping_counter);
/*
@@ -343,6 +346,7 @@ static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf)
}
kfree(dmabuf->name);
dmabuf->name = name;
+ dmabuf->file->f_path.dentry->d_fsdata = name;
out_unlock:
dma_resv_unlock(dmabuf->resv);
@@ -446,7 +450,6 @@ static struct file *dma_buf_getfile(struct dma_buf *dmabuf, int flags)
goto err_alloc_file;
file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
file->private_data = dmabuf;
- file->f_path.dentry->d_fsdata = dmabuf;
return file;
--
From: Henry Willard <henry.willard(a)oracle.com>
Subject: mm: limit boost_watermark on small zones
Commit 1c30844d2dfe ("mm: reclaim small amounts of memory when an external
fragmentation event occurs") adds a boost_watermark() function which
increases the min watermark in a zone by at least pageblock_nr_pages or
the number of pages in a page block. On Arm64, with 64K pages and 512M
huge pages, this is 8192 pages, or 512M. It does this regardless of the
number of pages managed in the zone or the likelihood of success.
This can put the zone immediately under water in terms of allocating pages
from the zone, and can cause a small machine to fail immediately due to
OOM. Unlike set_recommended_min_free_kbytes(), which substantially
increases min_free_kbytes and is tied to THP, boost_watermark() can be
called even if THP is not active. The problem is most likely to appear
on architectures such as Arm64 where pageblock_nr_pages is very large.
It is desirable to run the kdump capture kernel in as small a space as
possible to avoid wasting memory. In some architectures, such as Arm64,
there are restrictions on where the capture kernel can run, and therefore,
the space available. A capture kernel running in 768M can fail due to OOM
immediately after boost_watermark() sets the min in zone DMA32, where
most of the memory is, to 512M. It fails even though there is over 500M of
free memory. With boost_watermark() suppressed, the capture kernel can run
successfully in 448M.
This patch limits boost_watermark() to boosting a zone's min watermark only
when there are enough pages that the boost will produce positive results.
In this case that is estimated to be four times as many pages as
pageblock_nr_pages.
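To make the threshold concrete, a worked example using the Arm64
configuration described above (64K base pages, 512M huge pages):

	pageblock_nr_pages = 512M / 64K              = 8192 pages
	boost threshold    = 4 * pageblock_nr_pages  = 32768 pages = 2G

A zone must therefore manage more than 2G of memory before
boost_watermark() will raise its min watermark; the DMA32 zone of a
768M capture kernel falls well below that, so the boost is skipped.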
Mel said:
: There is no harm in marking it stable. Clearly it does not happen very
: often but it's not impossible. 32-bit x86 is a lot less common now
: which would previously have been vulnerable to triggering this easily.
: ppc64 has a larger base page size but typically only has one zone.
: arm64 is likely the most vulnerable, particularly when CMA is
: configured with a small movable zone.
Link: http://lkml.kernel.org/r/1588294148-6586-1-git-send-email-henry.willard@ora…
Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs")
Signed-off-by: Henry Willard <henry.willard(a)oracle.com>
Acked-by: Mel Gorman <mgorman(a)techsingularity.net>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/page_alloc.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/mm/page_alloc.c~mm-limit-boost_watermark-on-small-zones
+++ a/mm/page_alloc.c
@@ -2401,6 +2401,14 @@ static inline void boost_watermark(struc
if (!watermark_boost_factor)
return;
+ /*
+ * Don't bother in zones that are unlikely to produce results.
+ * On small machines, including kdump capture kernels running
+ * in a small area, boosting the watermark can cause an out of
+ * memory situation immediately.
+ */
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
+ return;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
_
From: Roman Penyaev <rpenyaev(a)suse.de>
Subject: epoll: atomically remove wait entry on wake up
This patch does two things:
1. fixes lost wakeup introduced by:
339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll")
2. improves performance for events delivery.
The description of the problem is the following: if N (>1) threads are
waiting on ep->wq for new events and M (>1) events arrive, it is quite
likely that more than one wakeup hits the same wait queue entry,
because there is a sizable window between the
__add_wait_queue_exclusive() and the following __remove_wait_queue()
calls in ep_poll(). This can lead to lost wakeups, because the thread
that was woken up may not handle all the events in ->rdllist. (The
problem is described in more detail here:
https://lkml.org/lkml/2019/10/7/905.)
The idea of this patch is to use init_wait() instead of
init_waitqueue_entry(). Internally, init_wait() sets
autoremove_wake_function() as the callback, which removes the wait
entry from the list atomically (under the wait queue lock), so the
next incoming wakeup hits the next wait entry in the wait queue,
preventing lost wakeups.
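For reference, autoremove_wake_function(), as implemented in
kernel/sched/wait.c (shown abridged), unlinks the entry under the wait
queue lock only when the wakeup actually succeeds:

int autoremove_wake_function(struct wait_queue_entry *wq_entry,
			     unsigned mode, int sync, void *key)
{
	/* Wake the task exactly as default_wake_function() would ... */
	int ret = default_wake_function(wq_entry, mode, sync, key);

	/* ... and on success atomically drop the entry from the wait
	 * queue, so the next wakeup targets the next waiter. */
	if (ret)
		list_del_init(&wq_entry->entry);

	return ret;
}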
The problem is readily reproduced by the epoll60 test case [1].
Removing the wait entry on wakeup also has performance benefits,
because there is no need to take ep->lock and remove the wait entry
from the queue after a successful wakeup. Here is the timing output of
the epoll60 test case:
With explicit wakeup from ep_scan_ready_list() (the state of the
code prior 339ddb53d373):
real 0m6.970s
user 0m49.786s
sys 0m0.113s
After this patch:
real 0m5.220s
user 0m36.879s
sys 0m0.019s
The other testcase is the stress-epoll [2], where one thread consumes
all the events and other threads produce many events:
With explicit wakeup from ep_scan_ready_list() (the state of the
code prior 339ddb53d373):
threads    events/ms    run-time ms
      8         5427           1474
     16         6163           2596
     32         6824           4689
     64         7060           9064
    128         6991          18309
After this patch:
threads    events/ms    run-time ms
      8         5598           1429
     16         7073           2262
     32         7502           4265
     64         7640           8376
    128         7634          16767
("events/ms" represents event bandwidth, so higher is better;
"run-time ms" is the overall time spent running the benchmark, so
lower is better)
[1] tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
[2] https://github.com/rouming/test-tools/blob/master/stress-epoll.c
Link: http://lkml.kernel.org/r/20200430130326.1368509-2-rpenyaev@suse.de
Signed-off-by: Roman Penyaev <rpenyaev(a)suse.de>
Reviewed-by: Jason Baron <jbaron(a)akamai.com>
Cc: Khazhismel Kumykov <khazhy(a)google.com>
Cc: Alexander Viro <viro(a)zeniv.linux.org.uk>
Cc: Heiher <r(a)hev.cc>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/eventpoll.c | 43 ++++++++++++++++++++++++-------------------
1 file changed, 24 insertions(+), 19 deletions(-)
--- a/fs/eventpoll.c~epoll-atomically-remove-wait-entry-on-wake-up
+++ a/fs/eventpoll.c
@@ -1822,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep,
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
- bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
@@ -1867,21 +1866,23 @@ fetch_events:
*/
ep_reset_busy_poll_napi_id(ep);
- /*
- * We don't have any available event to return to the caller. We need
- * to sleep here, and we will be woken by ep_poll_callback() when events
- * become available.
- */
- if (!waiter) {
- waiter = true;
- init_waitqueue_entry(&wait, current);
-
+ do {
+ /*
+ * Internally init_wait() uses autoremove_wake_function(),
+ * thus wait entry is removed from the wait queue on each
+ * wakeup. Why it is important? In case of several waiters
+ * each new wakeup will hit the next waiter, giving it the
+ * chance to harvest new event. Otherwise wakeup can be
+ * lost. This is also good performance-wise, because on
+ * normal wakeup path no need to call __remove_wait_queue()
+ * explicitly, thus ep->lock is not taken, which halts the
+ * event delivery.
+ */
+ init_wait(&wait);
write_lock_irq(&ep->lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
- }
- for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
@@ -1911,10 +1912,20 @@ fetch_events:
timed_out = 1;
break;
}
- }
+
+ /* We were woken up, thus go and try to harvest some events */
+ eavail = 1;
+
+ } while (0);
__set_current_state(TASK_RUNNING);
+ if (!list_empty_careful(&wait.entry)) {
+ write_lock_irq(&ep->lock);
+ __remove_wait_queue(&ep->wq, &wait);
+ write_unlock_irq(&ep->lock);
+ }
+
send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
@@ -1925,12 +1936,6 @@ send_events:
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
- if (waiter) {
- write_lock_irq(&ep->lock);
- __remove_wait_queue(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
- }
-
return res;
}
_
From: Khazhismel Kumykov <khazhy(a)google.com>
Subject: eventpoll: fix missing wakeup for ovflist in ep_poll_callback
Before 339ddb53d373, when an event was added to the ovflist the wakeup
came from ep_scan_ready_list(), and ep_poll_callback() issued none
itself. With that wakeup removed, an event added to the ovflist here
may never produce a wakeup. Rather than adding back the
ep_scan_ready_list() wakeup - which was resulting in unnecessary
wakeups - trigger the wake-up in ep_poll_callback() instead.
We noticed that one of our workloads was missing wakeups starting with
339ddb53d373, and upon manual inspection this wakeup seemed to be the
missing one. With this patch applied, we no longer see missing
wakeups. I haven't yet tried to make a small reproducer, but the
existing kselftests in filesystems/epoll passed for me with this patch.
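The key effect of the restructure, abridged from the diff below, is
that the ovflist case no longer jumps past the wakeup logic at the end
of ep_poll_callback():

	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
		/* Events are being transferred to user space; park
		 * this one on ep->ovflist to be requeued later. */
		if (chain_epi_lockless(epi))
			ep_pm_stay_awake_rcu(epi);
	} else if (!ep_is_linked(epi)) {
		/* Usual case: add the event to the ready list. */
		if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
			ep_pm_stay_awake_rcu(epi);
	}
	/* Both branches now fall through to the waiter wakeup below,
	 * where previously the ovflist path did "goto out_unlock". */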
[khazhy(a)google.com: use if/elif instead of goto + cleanup suggested by Roman]
Link: http://lkml.kernel.org/r/20200424190039.192373-1-khazhy@google.com
Link: http://lkml.kernel.org/r/20200424025057.118641-1-khazhy@google.com
Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll")
Signed-off-by: Khazhismel Kumykov <khazhy(a)google.com>
Reviewed-by: Roman Penyaev <rpenyaev(a)suse.de>
Cc: Alexander Viro <viro(a)zeniv.linux.org.uk>
Cc: Roman Penyaev <rpenyaev(a)suse.de>
Cc: Heiher <r(a)hev.cc>
Cc: Jason Baron <jbaron(a)akamai.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
fs/eventpoll.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
--- a/fs/eventpoll.c~eventpoll-fix-missing-wakeup-for-ovflist-in-ep_poll_callback
+++ a/fs/eventpoll.c
@@ -1171,6 +1171,10 @@ static inline bool chain_epi_lockless(st
{
struct eventpoll *ep = epi->ep;
+ /* Fast preliminary check */
+ if (epi->next != EP_UNACTIVE_PTR)
+ return false;
+
/* Check that the same epi has not been just chained from another CPU */
if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
return false;
@@ -1237,16 +1241,12 @@ static int ep_poll_callback(wait_queue_e
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (epi->next == EP_UNACTIVE_PTR &&
- chain_epi_lockless(epi))
+ if (chain_epi_lockless(epi))
+ ep_pm_stay_awake_rcu(epi);
+ } else if (!ep_is_linked(epi)) {
+ /* In the usual case, add event to ready list. */
+ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
ep_pm_stay_awake_rcu(epi);
- goto out_unlock;
- }
-
- /* If this file is already in the ready list we exit soon */
- if (!ep_is_linked(epi) &&
- list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) {
- ep_pm_stay_awake_rcu(epi);
}
/*
_