blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
This patch reverts the following commits: * d6a51a97c0b2 ("blk-mq: replace and kill blk_mq_request_issue_directly") # v5.0. * 5b7a6f128aad ("blk-mq: issue directly with bypass 'false' in blk_mq_sched_insert_requests") # v5.0. * 7f556a44e61d ("blk-mq: refactor the code of issue request directly") # v5.0.
Cc: Christoph Hellwig hch@infradead.org Cc: Ming Lei ming.lei@redhat.com Cc: Jianchao Wang jianchao.w.wang@oracle.com Cc: Hannes Reinecke hare@suse.com Cc: Johannes Thumshirn jthumshirn@suse.de Cc: James Smart james.smart@broadcom.com Cc: Dongli Zhang dongli.zhang@oracle.com Cc: Laurence Oberman loberman@redhat.com Cc: stable@vger.kernel.org Reported-by: Laurence Oberman loberman@redhat.com Tested-by: Laurence Oberman loberman@redhat.com Fixes: 7f556a44e61d ("blk-mq: refactor the code of issue request directly") # v5.0. Signed-off-by: Bart Van Assche bvanassche@acm.org --- block/blk-core.c | 4 +- block/blk-mq-sched.c | 8 +-- block/blk-mq.c | 122 ++++++++++++++++++++++--------------------- block/blk-mq.h | 6 +-- 4 files changed, 71 insertions(+), 69 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 2921af6f8d33..5bca56016575 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1232,8 +1232,6 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { - blk_qc_t unused; - if (blk_cloned_rq_check_limits(q, rq)) return BLK_STS_IOERR;
@@ -1249,7 +1247,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, true); + return blk_mq_request_issue_directly(rq, true); } EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 40905539afed..aa6bc5c02643 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -423,10 +423,12 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, * busy in case of 'none' scheduler, and this way may save * us one extra enqueue & dequeue to sw queue. */ - if (!hctx->dispatch_busy && !e && !run_queue_async) + if (!hctx->dispatch_busy && !e && !run_queue_async) { blk_mq_try_issue_list_directly(hctx, list); - else - blk_mq_insert_requests(hctx, ctx, list); + if (list_empty(list)) + return; + } + blk_mq_insert_requests(hctx, ctx, list); }
blk_mq_run_hw_queue(hctx, run_queue_async); diff --git a/block/blk-mq.c b/block/blk-mq.c index 652d0c6d5945..403984a557bb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1808,74 +1808,76 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; }
-blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, - bool bypass, bool last) + bool bypass_insert, bool last) { struct request_queue *q = rq->q; bool run_queue = true; - blk_status_t ret = BLK_STS_RESOURCE; - int srcu_idx; - bool force = false;
- hctx_lock(hctx, &srcu_idx); /* - * hctx_lock is needed before checking quiesced flag. + * RCU or SRCU read lock is needed before checking quiesced flag. * - * When queue is stopped or quiesced, ignore 'bypass', insert - * and return BLK_STS_OK to caller, and avoid driver to try to - * dispatch again. + * When queue is stopped or quiesced, ignore 'bypass_insert' from + * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, + * and avoid driver to try to dispatch again. */ - if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) { + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; - bypass = false; - goto out_unlock; + bypass_insert = false; + goto insert; }
- if (unlikely(q->elevator && !bypass)) - goto out_unlock; + if (q->elevator && !bypass_insert) + goto insert;
if (!blk_mq_get_dispatch_budget(hctx)) - goto out_unlock; + goto insert;
if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); - goto out_unlock; + goto insert; }
- /* - * Always add a request that has been through - *.queue_rq() to the hardware dispatch list. - */ - force = true; - ret = __blk_mq_issue_directly(hctx, rq, cookie, last); -out_unlock: + return __blk_mq_issue_directly(hctx, rq, cookie, last); +insert: + if (bypass_insert) + return BLK_STS_RESOURCE; + + blk_mq_request_bypass_insert(rq, run_queue); + return BLK_STS_OK; +} + +static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_qc_t *cookie) +{ + blk_status_t ret; + int srcu_idx; + + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + + hctx_lock(hctx, &srcu_idx); + + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) + blk_mq_request_bypass_insert(rq, true); + else if (ret != BLK_STS_OK) + blk_mq_end_request(rq, ret); + + hctx_unlock(hctx, srcu_idx); +} + +blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) +{ + blk_status_t ret; + int srcu_idx; + blk_qc_t unused_cookie; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + hctx_lock(hctx, &srcu_idx); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); hctx_unlock(hctx, srcu_idx); - switch (ret) { - case BLK_STS_OK: - break; - case BLK_STS_DEV_RESOURCE: - case BLK_STS_RESOURCE: - if (force) { - blk_mq_request_bypass_insert(rq, run_queue); - /* - * We have to return BLK_STS_OK for the DM - * to avoid livelock. Otherwise, we return - * the real result to indicate whether the - * request is direct-issued successfully. - */ - ret = bypass ? BLK_STS_OK : ret; - } else if (!bypass) { - blk_mq_sched_insert_request(rq, false, - run_queue, false); - } - break; - default: - if (!bypass) - blk_mq_end_request(rq, ret); - break; - }
return ret; } @@ -1883,20 +1885,22 @@ blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - blk_qc_t unused; - blk_status_t ret = BLK_STS_OK; - while (!list_empty(list)) { + blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist); - if (ret == BLK_STS_OK) - ret = blk_mq_try_issue_directly(hctx, rq, &unused, - false, + ret = blk_mq_request_issue_directly(rq, list_empty(list)); + if (ret != BLK_STS_OK) { + if (ret == BLK_STS_RESOURCE || + ret == BLK_STS_DEV_RESOURCE) { + blk_mq_request_bypass_insert(rq, list_empty(list)); - else - blk_mq_sched_insert_request(rq, false, true, false); + break; + } + blk_mq_end_request(rq, ret); + } }
/* @@ -1904,7 +1908,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, * the driver there was more coming, but that turned out to * be a lie. */ - if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs) + if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs) hctx->queue->mq_ops->commit_rqs(hctx); }
@@ -2017,13 +2021,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (same_queue_rq) { data.hctx = same_queue_rq->mq_hctx; blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie, false, true); + &cookie); } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true); + blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); diff --git a/block/blk-mq.h b/block/blk-mq.h index d704fc7766f4..423ea88ab6fb 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -70,10 +70,8 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list);
-blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, - bool bypass, bool last); +/* Used by blk_insert_cloned_request() to issue request directly */ +blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last); void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list);
On Thu, Apr 04, 2019 at 10:08:43AM -0700, Bart Van Assche wrote:
blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
This patch reverts the following commits:
- d6a51a97c0b2 ("blk-mq: replace and kill blk_mq_request_issue_directly") # v5.0.
- 5b7a6f128aad ("blk-mq: issue directly with bypass 'false' in blk_mq_sched_insert_requests") # v5.0.
- 7f556a44e61d ("blk-mq: refactor the code of issue request directly") # v5.0.
I am fine to revert them now.
However, could you revert them one by one so that they can be reviewed easily?
Thanks, Ming
On Fri, 2019-04-05 at 05:43 +0800, Ming Lei wrote:
On Thu, Apr 04, 2019 at 10:08:43AM -0700, Bart Van Assche wrote:
blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
This patch reverts the following commits:
- d6a51a97c0b2 ("blk-mq: replace and kill blk_mq_request_issue_directly") # v5.0.
- 5b7a6f128aad ("blk-mq: issue directly with bypass 'false' in blk_mq_sched_insert_requests") # v5.0.
- 7f556a44e61d ("blk-mq: refactor the code of issue request directly") # v5.0.
I am fine to revert them now.
However, could you revert them one by one so that they can be reviewed easily?
Anyone who wants to verify this patch can do that by reverting the three commits mentioned above and by using diff to compare the revert result with this patch.
Thanks,
Bart.
On Fri, 2019-04-05 at 05:43 +0800, Ming Lei wrote:
On Thu, Apr 04, 2019 at 10:08:43AM -0700, Bart Van Assche wrote:
blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
This patch reverts the following commits:
- d6a51a97c0b2 ("blk-mq: replace and kill
blk_mq_request_issue_directly") # v5.0.
- 5b7a6f128aad ("blk-mq: issue directly with bypass 'false' in
blk_mq_sched_insert_requests") # v5.0.
- 7f556a44e61d ("blk-mq: refactor the code of issue request
directly") # v5.0.
I am fine to revert them now.
However, could you revert them one by one so that they can be reviewed easily?
Thanks, Ming
Hello Ming I did them 1 by 1 here for you
commit cff8d4710f8cc0817950942252bcf0809a95aa4a Author: Laurence Oberman loberman@ibclient.lab.eng.bos.redhat.com Date: Thu Apr 4 19:51:33 2019 -0400
Revert "blk-mq: replace and kill blk_mq_request_issue_directly"
This reverts commit d6a51a97c0b2e21fec224746c2683ff739bcf4ae.
diff --git a/block/blk-core.c b/block/blk-core.c index 4673ebe..a55389b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1245,8 +1245,6 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { - blk_qc_t unused; - if (blk_cloned_rq_check_limits(q, rq)) return BLK_STS_IOERR;
@@ -1262,7 +1260,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, true); + return blk_mq_request_issue_directly(rq, true); } EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
diff --git a/block/blk-mq.c b/block/blk-mq.c index 3ff3d7b..536b176 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1800,7 +1800,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; }
-blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, +static blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, bool bypass, bool last) @@ -1872,6 +1872,13 @@ blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; }
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) +{ + blk_qc_t unused; + + return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, last); +} + void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { diff --git a/block/blk-mq.h b/block/blk-mq.h index d704fc7..423ea88 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -70,10 +70,8 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list);
-blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, - bool bypass, bool last); +/* Used by blk_insert_cloned_request() to issue request directly */ +blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last); void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list);
commit 9b7f3de27504aefae8304dfd9640a1477599de8b Author: Laurence Oberman loberman@ibclient.lab.eng.bos.redhat.com Date: Thu Apr 4 19:53:23 2019 -0400
Revert "blk-mq: issue directly with bypass 'false' in blk_mq_sched_insert_requests"
This reverts commit 5b7a6f128aad761b471ca0ff620b4841b38e596f.
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4090553..aa6bc5c 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -423,10 +423,12 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, * busy in case of 'none' scheduler, and this way may save * us one extra enqueue & dequeue to sw queue. */ - if (!hctx->dispatch_busy && !e && !run_queue_async) + if (!hctx->dispatch_busy && !e && !run_queue_async) { blk_mq_try_issue_list_directly(hctx, list); - else - blk_mq_insert_requests(hctx, ctx, list); + if (list_empty(list)) + return; + } + blk_mq_insert_requests(hctx, ctx, list); }
blk_mq_run_hw_queue(hctx, run_queue_async); diff --git a/block/blk-mq.c b/block/blk-mq.c index 536b176..df2150e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1882,20 +1882,22 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - blk_qc_t unused; - blk_status_t ret = BLK_STS_OK; - while (!list_empty(list)) { + blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist); - if (ret == BLK_STS_OK) - ret = blk_mq_try_issue_directly(hctx, rq, &unused, - false, + ret = blk_mq_request_issue_directly(rq, list_empty(list)); + if (ret != BLK_STS_OK) { + if (ret == BLK_STS_RESOURCE || + ret == BLK_STS_DEV_RESOURCE) { + blk_mq_request_bypass_insert(rq, list_empty(list )); - else - blk_mq_sched_insert_request(rq, false, true, false); + break; + } + blk_mq_end_request(rq, ret); + } }
/* @@ -1903,7 +1905,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, * the driver there was more coming, but that turned out to * be a lie. */ - if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs) + if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs) hctx->queue->mq_ops->commit_rqs(hctx); }
commit 6db32dc1b5cb3d236f26f77947094f1d895ba557 Author: Laurence Oberman loberman@ibclient.lab.eng.bos.redhat.com Date: Thu Apr 4 19:54:28 2019 -0400
Revert "blk-mq: refactor the code of issue request directly"
This reverts commit 7f556a44e61d0b62d78db9a2662a5f0daef010f2.
diff --git a/block/blk-mq.c b/block/blk-mq.c index df2150e..afac6c2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1800,83 +1800,78 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; }
-static blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, - bool bypass, bool last) + bool bypass_insert, bool last) { struct request_queue *q = rq->q; bool run_queue = true; - blk_status_t ret = BLK_STS_RESOURCE; - int srcu_idx; - bool force = false;
- hctx_lock(hctx, &srcu_idx); /* - * hctx_lock is needed before checking quiesced flag. + * RCU or SRCU read lock is needed before checking quiesced flag. * - * When queue is stopped or quiesced, ignore 'bypass', insert - * and return BLK_STS_OK to caller, and avoid driver to try to - * dispatch again. + * When queue is stopped or quiesced, ignore 'bypass_insert' from + * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, + * and avoid driver to try to dispatch again. */ - if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) { + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; - bypass = false; - goto out_unlock; + bypass_insert = false; + goto insert; }
- if (unlikely(q->elevator && !bypass)) - goto out_unlock; + if (q->elevator && !bypass_insert) + goto insert;
if (!blk_mq_get_dispatch_budget(hctx)) - goto out_unlock; + goto insert;
if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); - goto out_unlock; + goto insert; }
- /* - * Always add a request that has been through - *.queue_rq() to the hardware dispatch list. - */ - force = true; - ret = __blk_mq_issue_directly(hctx, rq, cookie, last); -out_unlock: - hctx_unlock(hctx, srcu_idx); - switch (ret) { - case BLK_STS_OK: - break; - case BLK_STS_DEV_RESOURCE: - case BLK_STS_RESOURCE: - if (force) { - blk_mq_request_bypass_insert(rq, run_queue); - /* - * We have to return BLK_STS_OK for the DM - * to avoid livelock. Otherwise, we return - * the real result to indicate whether the - * request is direct-issued successfully. - */ - ret = bypass ? BLK_STS_OK : ret; - } else if (!bypass) { - blk_mq_sched_insert_request(rq, false, - run_queue, false); - } - break; - default: - if (!bypass) - blk_mq_end_request(rq, ret); - break; - } + return __blk_mq_issue_directly(hctx, rq, cookie, last); +insert: + if (bypass_insert) + return BLK_STS_RESOURCE;
- return ret; + blk_mq_request_bypass_insert(rq, run_queue); + return BLK_STS_OK; +} + +static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_qc_t *cookie) +{ + blk_status_t ret; + int srcu_idx; + + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + + hctx_lock(hctx, &srcu_idx); + + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) + blk_mq_request_bypass_insert(rq, true); + else if (ret != BLK_STS_OK) + blk_mq_end_request(rq, ret); + + hctx_unlock(hctx, srcu_idx); }
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { - blk_qc_t unused; + blk_status_t ret; + int srcu_idx; + blk_qc_t unused_cookie; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, last); + hctx_lock(hctx, &srcu_idx); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); + hctx_unlock(hctx, srcu_idx); + + return ret; }
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2018,13 +2013,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) if (same_queue_rq) { data.hctx = same_queue_rq->mq_hctx; blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie, false, true); + &cookie); } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true); + blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio);
On Thu, Apr 04, 2019 at 10:08:43AM -0700, Bart Van Assche wrote:
blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
And what are we going to do about 5.1+?
On Fri, 2019-04-05 at 08:06 +0200, Christoph Hellwig wrote:
On Thu, Apr 04, 2019 at 10:08:43AM -0700, Bart Van Assche wrote:
blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
And what are we going to do about 5.1+?
Hello Christoph All the testing of the reverts etc. was agains 5.1-rc3. I believe the intention is to revert for 5.1x+ and earlier. Regards Laurence
On 4/5/19 5:43 AM, Laurence Oberman wrote:
On Fri, 2019-04-05 at 08:06 +0200, Christoph Hellwig wrote:
On Thu, Apr 04, 2019 at 10:08:43AM -0700, Bart Van Assche wrote:
blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
And what are we going to do about 5.1+?
All the testing of the reverts etc. was agains 5.1-rc3. I believe the intention is to revert for 5.1x+ and earlier.
I think the revert should happen for the v5.0 and v5.1 kernels, hence the "Cc: stable" tag.
Bart.
On 4/4/19 11:06 PM, Christoph Hellwig wrote:
On Thu, Apr 04, 2019 at 10:08:43AM -0700, Bart Van Assche wrote:
blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
And what are we going to do about 5.1+?
Hi Christoph,
This patch was developed and tested on my setup against Jens' for-next branch. So this patch should apply fine on kernel v5.1-rc3.
Bart.
On 4/5/19 8:00 AM, Bart Van Assche wrote:
On 4/4/19 11:06 PM, Christoph Hellwig wrote:
On Thu, Apr 04, 2019 at 10:08:43AM -0700, Bart Van Assche wrote:
blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
And what are we going to do about 5.1+?
Hi Christoph,
This patch was developed and tested on my setup against Jens' for-next branch. So this patch should apply fine on kernel v5.1-rc3.
I'm inclined to apply the bundled up version. It applies with a bit of help and merges cleanly between the 5.1 and 5.2 branches.
Bart Van Assche bvanassche@acm.org writes:
blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued. If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0.
This patch reverts the following commits:
- d6a51a97c0b2 ("blk-mq: replace and kill blk_mq_request_issue_directly") # v5.0.
- 5b7a6f128aad ("blk-mq: issue directly with bypass 'false' in blk_mq_sched_insert_requests") # v5.0.
- 7f556a44e61d ("blk-mq: refactor the code of issue request directly") # v5.0.
At least when this patch is cherry-picked back on top of 5.0.7, it doesn't seem to fully fix the problem for us (which is ppc64le). We hit this at some point while eudev is finding disks and we have a process mounting them to see what's there:
cpu 0x4a: Vector: 400 (Instruction Access) at [c000203ff398b100] pc: c0000000021fe700 lr: c0000000002001d8: blk_mq_complete_request+0x34/0x138 sp: c000203ff398b390 msr: 9000000010009033 current = 0xc000203ff3901b00 paca = 0xc000203fff7f3680 irqmask: 0x03 irq_happened: 0x01 pid = 658, comm = kworker/u257:2 Linux version 5.0.7-openpower1 (hostboot@gfw275.aus.stglabs.ibm.com) (gcc version 6.5.0 (Buildroot 2019.02.1-00016-ge01dcd0)) #2 SMP Mon Apr 8 09:14:53 CDT 2019
[link register ] c0000000002001d8 blk_mq_complete_request+0x34/0x138 [c000203ff398b390] c000203ff398b3d0 (unreliable) [c000203ff398b3d0] c0000000002d0428 scsi_mq_done+0x48/0x6c [c000203ff398b410] c0000000002ee8d8 ata_scsi_simulate+0x78/0x29c [c000203ff398b460] c0000000002eee74 ata_scsi_queuecmd+0xa8/0x2cc [c000203ff398b4a0] c0000000002d2ac4 scsi_queue_rq+0x7bc/0x810 [c000203ff398b540] c000000000202570 blk_mq_dispatch_rq_list+0x474/0x5c0 [c000203ff398b600] c000000000207560 blk_mq_sched_dispatch_requests+0x114/0x18c [c000203ff398b660] c0000000002009e4 __blk_mq_run_hw_queue+0xe0/0xf8 [c000203ff398b6e0] c000000000200a5c __blk_mq_delay_run_hw_queue+0x60/0x184 [c000203ff398b740] c000000000200c20 blk_mq_run_hw_queue+0x70/0xe4 [c000203ff398b790] c0000000002077ec blk_mq_sched_insert_request+0x68/0x194 [c000203ff398b7f0] c0000000001fc1d8 blk_execute_rq_nowait+0x78/0x8c [c000203ff398b810] c0000000001fc234 blk_execute_rq+0x48/0x90 [c000203ff398b860] c0000000002cfa10 __scsi_execute+0xd8/0x1ac [c000203ff398b8d0] c0000000002cc31c ioctl_internal_command.constprop.2+0x50/0x144 [c000203ff398b980] c0000000002cc474 scsi_set_medium_removal+0x64/0x9c [c000203ff398b9c0] c0080000101f2754 sd_open+0xe8/0x148 [sd_mod] [c000203ff398ba00] c00000000018b730 __blkdev_get+0x198/0x3e4 [c000203ff398ba70] c00000000018b9cc blkdev_get+0x50/0x35c [c000203ff398bb10] c00000000020b140 __device_add_disk+0x468/0x488 [c000203ff398bbd0] c0080000101f5484 sd_probe_async+0xd4/0x180 [sd_mod] [c000203ff398bc60] c00000000009a128 async_run_entry_fn+0x68/0x138 [c000203ff398bca0] c000000000090858 process_one_work+0x204/0x334 [c000203ff398bd30] c000000000090ca4 worker_thread+0x2d0/0x394 [c000203ff398bdb0] c000000000096cb4 kthread+0x14c/0x154 [c000203ff398be20] c00000000000b72c ret_from_kernel_thread+0x5c/0x70
4a:mon> r
R00 = c0000000002d0428 R16 = 0000000000000001 R01 = c000203ff398b390 R17 = 0000000000000000 R02 = c000000001d5af00 R18 = 0000000000000004 R03 = c000003feb215600 R19 = 0000000000000000 R04 = c000003feb1a1900 R20 = c000003feb215600 R05 = 0000000000000005 R21 = c000003feb660000 R06 = 0000000000000020 R22 = 0000000000000402 R07 = 0000000000000000 R23 = 0000000000000000 R08 = c000003fecf86300 R24 = c000003fecf22800 R09 = 0000000000000001 R25 = c000003fecf2e800 R10 = 0000000000000000 R26 = c000003feb0f5000 R11 = c0080000101f5b20 R27 = c000003feb215720 R12 = c000203ffc1ff500 R28 = c000003fecf2e800 R13 = c000203fff7f3680 R29 = c000003feb2158b0 R14 = c000000000096b70 R30 = c000003feb660000 R15 = c000203ff3c10000 R31 = c000003feb215720 pc = c0000000021fe700 cfar= c0000000002001d4 blk_mq_complete_request+0x30/0x138 lr = c0000000002001d8 blk_mq_complete_request+0x34/0x138 msr = 9000000010009033 cr = 24022222 ctr = c0000000002d03e0 xer = 0000000000000000 trap = 400 4a:mon> special_registers= S
msr = 9000000000001033 sprg0 = 0000000000000000 pvr = 00000000004e1202 sprg1 = 0000000000000000 dec = ffffffe807402ddf sprg2 = 0000000000000000 sp = c000203ff398ab60 sprg3 = 000000000008004a toc = c000000001d5af00 dar = 00000000100be2af srr0 = c0000000021fe700 srr1 = 9000000010009033 dsisr = 40000000 dscr = 0000000000000010 ppr = 0010000000000000 pir = 0000080a amr = 0000000000000000 uamor = 0000000000000000 sdr1 = 0000000048022224 hdar = 0000000000000000 hdsisr = 00000000 hsrr0 = c000000000051708 hsrr1 = 9000000000001033 hdec = ffffffe109d0f8da lpcr = 0040400001d2f012 pcr = 0000000000000000 lpidr = 00000000 hsprg0 = c000203fff7f3680 hsprg1 = c000203fff7f3680 amor = 0000000000000000 dabr = 0000000048022224 dabrx = c000000000051708 dpdes = 0000000000000000 tir = 0000000000000002 cir = 00000000 fscr = 0000000000000180 tar = 0000000000000000 pspb = 00000000 mmcr0 = 0000000000000000 mmcr1 = 0000000000000000 mmcr2 = 0000000000000000 pmc1 = 00000000 pmc2 = 00000000 pmc3 = 00000000 pmc4 = 00000000 mmcra = 0000000000000000 siar = 0000000000000000 pmc5 = 80000031 sdar = 0000000000000000 sier = 0000000000000000 pmc6 = 80000001 ebbhr = 0000000000000000 ebbrr = 0000000000000000 bescr = 0000000000000000 iamr = 0000000000000000 hfscr = 000000000000059f dhdes = c000000000051708 rpr = 00000103070f1f3f dawr = 0000000000000000 dawrx = 0000000000000000 ciabr = 0000000000000000 pidr = 000000000000001a tidr = 0000000000000000 asdr = 0000000000000000 psscr = 2000000000300332 ptcr = 0000203fff7e0004
For reference, the backtrace we got with near pure 5.0.6 (one patch in xhci):
cpu 0x27: Vector: 380 (Data Access Out of Range) at [c000001ff537b2d0] pc: c0000000001fe754: blk_add_timer+0x2c/0xa4 lr: c0000000002001d4: blk_mq_start_request+0xa8/0xe0 sp: c000001ff537b560 msr: 9000000000009033 dar: 43f900c0000020d3 current = 0xc000001ff52dd880 paca = 0xc000001ffffe1f80 irqmask: 0x03 irq_happened: 0x01 pid = 812, comm = kworker/u321:1 Linux version 5.0.6-openpower1 (hostboot@gfw273.aus.stglabs.ibm.com) (gcc version 6.5.0 (Buildroot 2019.02.1-00015-gc5f183f)) #2 SMP Wed Apr 3 04:45:56 CDT 2019 enter ? for help [c000001ff537b590] c0000000002001d4 blk_mq_start_request+0xa8/0xe0 [c000001ff537b5c0] c0000000002d25b0 scsi_queue_rq+0x508/0x810 [c000001ff537b660] c0000000002023c0 blk_mq_dispatch_rq_list+0x474/0x5c0 [c000001ff537b720] c000000000207360 blk_mq_sched_dispatch_requests+0x114/0x18c [c000001ff537b780] c000000000200834 __blk_mq_run_hw_queue+0xe0/0xf8 [c000001ff537b800] c0000000002008ac __blk_mq_delay_run_hw_queue+0x60/0x184 [c000001ff537b860] c000000000200a70 blk_mq_run_hw_queue+0x70/0xe4 [c000001ff537b8b0] c0000000002075ec blk_mq_sched_insert_request+0x68/0x194 [c000001ff537b910] c0000000001fc028 blk_execute_rq_nowait+0x78/0x8c [c000001ff537b930] c0000000001fc084 blk_execute_rq+0x48/0x90 [c000001ff537b980] c0000000002cf7b0 __scsi_execute+0xd8/0x1ac [c000001ff537b9f0] c0000000002d3bd0 scsi_probe_and_add_lun+0x1ec/0xa48 [c000001ff537bb40] c0000000002d4778 __scsi_add_device+0xf4/0xf8 [c000001ff537bba0] c0000000002ef068 ata_scsi_scan_host+0xf4/0x1fc [c000001ff537bc30] c0000000002e94b4 async_port_probe+0x6c/0x78 [c000001ff537bc60] c00000000009a0b0 async_run_entry_fn+0x68/0x138 [c000001ff537bca0] c0000000000907e0 process_one_work+0x204/0x334 [c000001ff537bd30] c000000000090c2c worker_thread+0x2d0/0x394 [c000001ff537bdb0] c000000000096c3c kthread+0x14c/0x154 [c000001ff537be20] c00000000000b72c ret_from_kernel_thread+0x5c/0x70 27:mon>
On 4/9/19 1:13 PM, Stewart Smith wrote:
cpu 0x4a: Vector: 400 (Instruction Access) at [c000203ff398b100] pc: c0000000021fe700 lr: c0000000002001d8: blk_mq_complete_request+0x34/0x138
Would you please figure out what is the source code here with gdb ?
From the backtrace, it seems not the same issue, because no dm-mpath is involved here.
Thanks Jianchao
sp: c000203ff398b390
msr: 9000000010009033 current = 0xc000203ff3901b00 paca = 0xc000203fff7f3680 irqmask: 0x03 irq_happened: 0x01 pid = 658, comm = kworker/u257:2 Linux version 5.0.7-openpower1 (hostboot@gfw275.aus.stglabs.ibm.com) (gcc version 6.5.0 (Buildroot 2019.02.1-00016-ge01dcd0)) #2 SMP Mon Apr 8 09:14:53 CDT 2019
[link register ] c0000000002001d8 blk_mq_complete_request+0x34/0x138 [c000203ff398b390] c000203ff398b3d0 (unreliable) [c000203ff398b3d0] c0000000002d0428 scsi_mq_done+0x48/0x6c [c000203ff398b410] c0000000002ee8d8 ata_scsi_simulate+0x78/0x29c [c000203ff398b460] c0000000002eee74 ata_scsi_queuecmd+0xa8/0x2cc [c000203ff398b4a0] c0000000002d2ac4 scsi_queue_rq+0x7bc/0x810 [c000203ff398b540] c000000000202570 blk_mq_dispatch_rq_list+0x474/0x5c0 [c000203ff398b600] c000000000207560 blk_mq_sched_dispatch_requests+0x114/0x18c [c000203ff398b660] c0000000002009e4 __blk_mq_run_hw_queue+0xe0/0xf8 [c000203ff398b6e0] c000000000200a5c __blk_mq_delay_run_hw_queue+0x60/0x184 [c000203ff398b740] c000000000200c20 blk_mq_run_hw_queue+0x70/0xe4 [c000203ff398b790] c0000000002077ec blk_mq_sched_insert_request+0x68/0x194 [c000203ff398b7f0] c0000000001fc1d8 blk_execute_rq_nowait+0x78/0x8c [c000203ff398b810] c0000000001fc234 blk_execute_rq+0x48/0x90 [c000203ff398b860] c0000000002cfa10 __scsi_execute+0xd8/0x1ac [c000203ff398b8d0] c0000000002cc31c ioctl_internal_command.constprop.2+0x50/0x144 [c000203ff398b980] c0000000002cc474 scsi_set_medium_removal+0x64/0x9c [c000203ff398b9c0] c0080000101f2754 sd_open+0xe8/0x148 [sd_mod] [c000203ff398ba00] c00000000018b730 __blkdev_get+0x198/0x3e4 [c000203ff398ba70] c00000000018b9cc blkdev_get+0x50/0x35c [c000203ff398bb10] c00000000020b140 __device_add_disk+0x468/0x488 [c000203ff398bbd0] c0080000101f5484 sd_probe_async+0xd4/0x180 [sd_mod] [c000203ff398bc60] c00000000009a128 async_run_entry_fn+0x68/0x138 [c000203ff398bca0] c000000000090858 process_one_work+0x204/0x334 [c000203ff398bd30] c000000000090ca4 worker_thread+0x2d0/0x394 [c000203ff398bdb0] c000000000096cb4 kthread+0x14c/0x154 [c000203ff398be20] c00000000000b72c ret_from_kernel_thread+0x5c/0x70
"jianchao.wang" jianchao.w.wang@oracle.com writes:
On 4/9/19 1:13 PM, Stewart Smith wrote:
cpu 0x4a: Vector: 400 (Instruction Access) at [c000203ff398b100] pc: c0000000021fe700 lr: c0000000002001d8: blk_mq_complete_request+0x34/0x138
Would you please figure out what is the source code here with gdb ?
From the backtrace, it seems not the same issue, because no dm-mpath is involved here.
It turns out it's not the same issue, we just managed to hit it at the same time as the original issue.
It's actually a bug in the STRICT_KERNEL_RWX=y code on ppc64le with CONFIG_HUGETLB_PAGE=n that manifests as corrupting random kernel memory and we just happen to hit it in this codepath and only once we brought in "Revert v5.0 blk_mq_request_issue_directly".
More details over at https://github.com/linuxppc/issues/issues/237 and there should be a fix for STRICT_KERNEL_RWX coming at some point.
On 4/8/19 10:13 PM, Stewart Smith wrote:
At least when this patch is cherry-picked back on top of 5.0.7, it doesn't seem to fully fix the problem for us (which is ppc64le). We hit this at some point while eudev is finding disks and we have a process mounting them to see what's there:
cpu 0x4a: Vector: 400 (Instruction Access) at [c000203ff398b100] pc: c0000000021fe700 lr: c0000000002001d8: blk_mq_complete_request+0x34/0x138 sp: c000203ff398b390 msr: 9000000010009033 current = 0xc000203ff3901b00 paca = 0xc000203fff7f3680 irqmask: 0x03 irq_happened: 0x01 pid = 658, comm = kworker/u257:2 Linux version 5.0.7-openpower1 (hostboot@gfw275.aus.stglabs.ibm.com) (gcc version 6.5.0 (Buildroot 2019.02.1-00016-ge01dcd0)) #2 SMP Mon Apr 8 09:14:53 CDT 2019
Can you bisect this crash?
Thanks,
Bart.
linux-stable-mirror@lists.linaro.org