Hello,
On Wed, Apr 11, 2018 at 05:24:49PM +0300, Sagi Grimberg wrote:
Ah, yeah, I was moving it out of add_timer but forgot to actually add it to the issue path. Fixed patch below.
BTW, no matter what we do with the request handover between the normal and timeout paths, we'd need something similar to this missed-completion tracking. Otherwise, we can narrow the race window, but we can't close it.
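To make that concrete, here's a toy userspace model of the record-and-replay idea (just a sketch: plain pthreads, made-up names, none of this is the kernel code). While the timeout path owns the request, a racing completion can only record that it fired; the timeout path replays it once the request is handed back.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_request {
	pthread_mutex_t lock;	/* stands in for the gstate/RCU machinery */
	bool aborted;		/* timeout path currently owns the request */
	bool missed_completion;	/* a completion fired while aborted */
	bool completed;
};

/* like blk_mq_complete_request(): complete now, or record the miss */
static void toy_complete(struct toy_request *rq)
{
	pthread_mutex_lock(&rq->lock);
	if (!rq->aborted)
		rq->completed = true;
	else
		rq->missed_completion = true;
	pthread_mutex_unlock(&rq->lock);
}

static void *driver_irq(void *arg)
{
	toy_complete(arg);	/* may land inside the timeout window */
	return NULL;
}

int main(void)
{
	struct toy_request rq = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.aborted = true,	/* timeout handling in progress */
	};
	pthread_t irq;

	pthread_create(&irq, NULL, driver_irq, &rq);
	pthread_join(&irq, NULL);

	/* timeout path: hand the request back ... */
	pthread_mutex_lock(&rq.lock);
	rq.aborted = false;
	pthread_mutex_unlock(&rq.lock);

	/* ... and replay whatever fired inside the window */
	if (rq.missed_completion)
		toy_complete(&rq);

	printf("completed=%d\n", rq.completed);
	return 0;
}

However much the window is narrowed, it's the recording plus the replay, not the narrowing, that guarantees the completion isn't lost.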
Thanks.
Just noticed this one; it looks interesting to me as well. Israel, can you run your test with this patch?
The following is the combined patch, with one debug message added to see whether missed completions are actually happening.
Thanks.
Index: work/block/blk-mq.c
===================================================================
--- work.orig/block/blk-mq.c
+++ work/block/blk-mq.c
@@ -642,6 +642,8 @@ void blk_mq_complete_request(struct requ
 	hctx_lock(hctx, &srcu_idx);
 	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
 		__blk_mq_complete_request(rq);
+	else
+		rq->missed_completion = true;
 	hctx_unlock(hctx, srcu_idx);
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
@@ -684,6 +686,7 @@ void blk_mq_start_request(struct request
 
 	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
 	blk_add_timer(rq);
+	rq->missed_completion = false;
 
 	write_seqcount_end(&rq->gstate_seq);
 	preempt_enable();
@@ -818,7 +821,8 @@ struct blk_mq_timeout_data {
 	unsigned int nr_expired;
 };
 
-static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct blk_mq_hw_ctx *hctx, struct request *req,
+				int *nr_resets, bool reserved)
 {
 	const struct blk_mq_ops *ops = req->q->mq_ops;
 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
@@ -833,13 +837,10 @@ static void blk_mq_rq_timed_out(struct r
 		__blk_mq_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
-		/*
-		 * As nothing prevents from completion happening while
-		 * ->aborted_gstate is set, this may lead to ignored
-		 * completions and further spurious timeouts.
-		 */
-		blk_mq_rq_update_aborted_gstate(req, 0);
 		blk_add_timer(req);
+		req->rq_flags |= RQF_MQ_TIMEOUT_RESET;
+		(*nr_resets)++;
+		hctx->need_sync_rcu = true;
 		break;
 	case BLK_EH_NOT_HANDLED:
 		break;
@@ -876,13 +877,35 @@ static void blk_mq_check_expired(struct
 	    time_after_eq(jiffies, deadline)) {
 		blk_mq_rq_update_aborted_gstate(rq, gstate);
 		data->nr_expired++;
-		hctx->nr_expired++;
+		hctx->need_sync_rcu = true;
 	} else if (!data->next_set || time_after(data->next, deadline)) {
 		data->next = deadline;
 		data->next_set = 1;
 	}
 }
 
+static void blk_mq_timeout_sync_rcu(struct request_queue *q, bool clear)
+{
+	struct blk_mq_hw_ctx *hctx;
+	bool has_rcu = false;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (!hctx->need_sync_rcu)
+			continue;
+
+		if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+			has_rcu = true;
+		else
+			synchronize_srcu(hctx->srcu);
+
+		if (clear)
+			hctx->need_sync_rcu = false;
+	}
+	if (has_rcu)
+		synchronize_rcu();
+}
+
 static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
@@ -895,7 +918,45 @@ static void blk_mq_terminate_expired(str
 	 */
 	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
 	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
-		blk_mq_rq_timed_out(rq, reserved);
+		blk_mq_rq_timed_out(hctx, rq, priv, reserved);
+}
+
+static void blk_mq_timeout_reset_return(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *priv, bool reserved)
+{
+	/*
+	 * @rq's timer reset has gone through rcu synchronization and is
+	 * visible now.  Allow normal completions again by resetting
+	 * ->aborted_gstate.  Don't clear RQF_MQ_TIMEOUT_RESET here as
+	 * blk_mq_timeout_reset_cleanup() needs it again and there's no
+	 * memory ordering around ->aborted_gstate making it the only field
+	 * safe to update.  Let blk_add_timer() clear it later when the
+	 * request is recycled or times out again.
+	 */
+	if (rq->rq_flags & RQF_MQ_TIMEOUT_RESET)
+		blk_mq_rq_update_aborted_gstate(rq, 0);
+}
+
+static void blk_mq_timeout_reset_cleanup(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *priv, bool reserved)
+{
+	int cnt = 0;
+
+	/*
+	 * @rq is now fully returned to the normal path.  If normal
+	 * completion raced timeout handling, execute the missed completion
+	 * here.  This is safe because 1. ->missed_completion can no longer
+	 * be asserted because nothing is timing out right now and 2. if
+	 * ->missed_completion is set, @rq is safe from recycling because
+	 * nobody could have completed it.
+	 */
+	if ((rq->rq_flags & RQF_MQ_TIMEOUT_RESET) && rq->missed_completion) {
+		blk_mq_complete_request(rq);
+		cnt++;
+	}
+
+	printk("XXX blk_mq_timeout_reset_cleanup(): executed %d missed completions\n",
+	       cnt);
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)
@@ -930,7 +991,7 @@ static void blk_mq_timeout_work(struct w
 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
 	if (data.nr_expired) {
-		bool has_rcu = false;
+		int nr_resets = 0;
 
 		/*
 		 * Wait till everyone sees ->aborted_gstate.  The
@@ -938,22 +999,25 @@ static void blk_mq_timeout_work(struct w
 		 * becomes a problem, we can add per-hw_ctx rcu_head and
 		 * wait in parallel.
 		 */
-		queue_for_each_hw_ctx(q, hctx, i) {
-			if (!hctx->nr_expired)
-				continue;
-
-			if (!(hctx->flags & BLK_MQ_F_BLOCKING))
-				has_rcu = true;
-			else
-				synchronize_srcu(hctx->srcu);
-
-			hctx->nr_expired = 0;
-		}
-		if (has_rcu)
-			synchronize_rcu();
+		blk_mq_timeout_sync_rcu(q, true);
 
 		/* terminate the ones we won */
-		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired,
+					   &nr_resets);
+
+		/*
+		 * For BLK_EH_RESET_TIMER, release the requests after
+		 * blk_add_timer() from above is visible to avoid timer
+		 * reset racing against recycling.
+		 */
+		if (nr_resets) {
+			blk_mq_timeout_sync_rcu(q, false);
+			blk_mq_queue_tag_busy_iter(q,
+					blk_mq_timeout_reset_return, NULL);
+			blk_mq_timeout_sync_rcu(q, true);
+			blk_mq_queue_tag_busy_iter(q,
+					blk_mq_timeout_reset_cleanup, NULL);
+		}
 	}
 
 	if (data.next_set) {
Index: work/include/linux/blk-mq.h
===================================================================
--- work.orig/include/linux/blk-mq.h
+++ work/include/linux/blk-mq.h
@@ -51,7 +51,7 @@ struct blk_mq_hw_ctx {
 	unsigned int		queue_num;
 
 	atomic_t		nr_active;
-	unsigned int		nr_expired;
+	bool			need_sync_rcu;
 
 	struct hlist_node	cpuhp_dead;
 	struct kobject		kobj;
Index: work/block/blk-timeout.c
===================================================================
--- work.orig/block/blk-timeout.c
+++ work/block/blk-timeout.c
@@ -216,7 +216,7 @@ void blk_add_timer(struct request *req)
 		req->timeout = q->rq_timeout;
 
 	blk_rq_set_deadline(req, jiffies + req->timeout);
-	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
+	req->rq_flags &= ~(RQF_MQ_TIMEOUT_EXPIRED | RQF_MQ_TIMEOUT_RESET);
 
 	/*
 	 * Only the non-mq case needs to add the request to a protected list.
Index: work/include/linux/blkdev.h
===================================================================
--- work.orig/include/linux/blkdev.h
+++ work/include/linux/blkdev.h
@@ -127,8 +127,10 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
 /* timeout is expired */
 #define RQF_MQ_TIMEOUT_EXPIRED	((__force req_flags_t)(1 << 20))
+/* timeout is being reset */
+#define RQF_MQ_TIMEOUT_RESET	((__force req_flags_t)(1 << 21))
 /* already slept for hybrid poll */
-#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 21))
+#define RQF_MQ_POLL_SLEPT	((__force req_flags_t)(1 << 22))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
@@ -225,6 +227,8 @@ struct request {
 
 	unsigned int extra_len;	/* length of alignment and padding */
 
+	bool missed_completion;
+
 	/*
 	 * On blk-mq, the lower bits of ->gstate (generation number and
 	 * state) carry the MQ_RQ_* state value and the upper bits the
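In case it helps review, here's a compact userspace model of the three-step handover blk_mq_timeout_work() now performs (again only a sketch, not kernel code: the grace period is faked with a rwlock, whose write side waits out all read-side sections the way synchronize_rcu()/synchronize_srcu() do, and all names are invented):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* fake grace period: write-locking waits out every read-side section */
static pthread_rwlock_t gp = PTHREAD_RWLOCK_INITIALIZER;
static atomic_bool aborted = true;	/* like ->aborted_gstate blocking completions */
static atomic_bool missed;		/* like rq->missed_completion */
static atomic_bool completed;

static void toy_synchronize(void)	/* like blk_mq_timeout_sync_rcu() */
{
	pthread_rwlock_wrlock(&gp);
	pthread_rwlock_unlock(&gp);
}

static void *completion_path(void *arg)	/* like blk_mq_complete_request() */
{
	(void)arg;
	pthread_rwlock_rdlock(&gp);	/* like hctx_lock() */
	if (atomic_load(&aborted))
		atomic_store(&missed, true);
	else
		atomic_store(&completed, true);
	pthread_rwlock_unlock(&gp);	/* like hctx_unlock() */
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* a completion races the timeout worker somewhere in here */
	pthread_create(&t, NULL, completion_path, NULL);

	/* 1. timers were reset (blk_add_timer()); make that visible
	 *    before completions are let back in */
	toy_synchronize();

	/* 2. re-enable completions (like clearing ->aborted_gstate) */
	atomic_store(&aborted, false);
	toy_synchronize();

	/* 3. every racer has either completed or recorded a miss by now */
	if (atomic_load(&missed) && !atomic_load(&completed))
		atomic_store(&completed, true);	/* replay */

	pthread_join(&t, NULL);
	printf("completed=%d\n", (int)atomic_load(&completed));
	return 0;
}

Whatever the interleaving, this prints completed=1 and never completes the request twice, which is the property the extra sync+iterate passes in blk_mq_timeout_work() are buying.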