From 59f082e464ae0533813d5a1c3149b22563da93d7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 1 Feb 2017 09:53:14 -0800 Subject: blk-mq: allocate blk_mq_tags and requests in correct node blk_mq_tags/requests of specific hardware queue are mostly used in specific cpus, which might not be in the same numa node as disk. For example, a nvme card is in node 0. half hardware queue will be used by node 0, the other node 1. Signed-off-by: Shaohua Li Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9e6b064e5339..d84c66fb37b7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1713,16 +1713,20 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int reserved_tags) { struct blk_mq_tags *tags; + int node; + + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, - set->numa_node, + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, - set->numa_node); + node); if (!tags->rqs) { blk_mq_free_tags(tags); return NULL; @@ -1730,7 +1734,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, - set->numa_node); + node); if (!tags->static_rqs) { kfree(tags->rqs); blk_mq_free_tags(tags); @@ -1750,6 +1754,11 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, { unsigned int i, j, entries_per_page, max_order = 4; size_t rq_size, left; + int node; + + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; INIT_LIST_HEAD(&tags->page_list); @@ -1771,7 +1780,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, this_order--; do { - page = alloc_pages_node(set->numa_node, + page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) @@ -1804,7 +1813,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, rq, hctx_idx, i, - set->numa_node)) { + node)) { tags->static_rqs[i] = NULL; goto fail; } -- cgit v1.2.3 From 415b806de5576b656f3ff94366589af9a161d0c8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 27 Feb 2017 10:04:39 -0700 Subject: blk-mq-sched: Allocate sched reserved tags as specified in the original queue tagset Signed-off-by: Sagi Grimberg Modified by me to also check at driver tag allocation time if the original request was reserved, so we can be sure to allocate a properly reserved tag at that point in time, too. Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index d84c66fb37b7..4df5fb42c74f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -852,6 +852,9 @@ done: return true; } + if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) + data.flags |= BLK_MQ_REQ_RESERVED; + rq->tag = blk_mq_get_tag(&data); if (rq->tag >= 0) { if (blk_mq_tag_busy(data.hctx)) { -- cgit v1.2.3 From 6d2809d51a5079f01a416d91dd63b0766cb685d0 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 27 Feb 2017 10:28:27 -0800 Subject: blk-mq: make blk_mq_alloc_request_hctx() allocate a scheduler request blk_mq_alloc_request_hctx() allocates a driver request directly, unlike its blk_mq_alloc_request() counterpart. It also crashes because it doesn't update the tags->rqs map. Fix it by making it allocate a scheduler request. Reported-by: Sagi Grimberg Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Tested-by: Sagi Grimberg --- block/blk-mq.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4df5fb42c74f..85a7047d8b03 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -273,10 +273,9 @@ EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, unsigned int flags, unsigned int hctx_idx) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; + struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - struct blk_mq_alloc_data alloc_data; + unsigned int cpu; int ret; /* @@ -299,25 +298,23 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ - hctx = q->queue_hw_ctx[hctx_idx]; - if (!blk_mq_hw_queue_mapped(hctx)) { - ret = -EXDEV; - goto out_queue_exit; + alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; + if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { + blk_queue_exit(q); + return ERR_PTR(-EXDEV); } - ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); + cpu = cpumask_first(alloc_data.hctx->cpumask); + alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq) { - ret = -EWOULDBLOCK; - goto out_queue_exit; - } - - return rq; + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); -out_queue_exit: + blk_mq_put_ctx(alloc_data.ctx); blk_queue_exit(q); - return ERR_PTR(ret); + + if (!rq) + return ERR_PTR(-EWOULDBLOCK); + + return rq; } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -- cgit v1.2.3 From 562bef4259776c19cb2423d43af1f99183910a4d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 27 Feb 2017 09:47:55 -0800 Subject: blk-mq: move update of tags->rqs to __blk_mq_alloc_request() No functional difference, it just makes a little more sense to update the tag map where we actually allocate the tag. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe Tested-by: Sagi Grimberg --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 85a7047d8b03..94593c6282d8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -234,6 +234,7 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, } rq->tag = tag; rq->internal_tag = -1; + data->hctx->tags->rqs[rq->tag] = rq; } blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); -- cgit v1.2.3 From 6bae363ee3057a14eec93440826813603559273a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 1 Mar 2017 14:22:10 -0500 Subject: blk-mq: Export blk_mq_freeze_queue_wait Drivers can start a freeze, so this provides a way to wait for frozen. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 94593c6282d8..8da2c04bb88f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -75,10 +75,11 @@ void blk_mq_freeze_queue_start(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); -static void blk_mq_freeze_queue_wait(struct request_queue *q) +void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); /* * Guarantee no request is in use, so we can change any data structure of -- cgit v1.2.3 From f91328c40a559362b6e7b7bfee01ca17fda87592 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 1 Mar 2017 14:22:11 -0500 Subject: blk-mq: Provide freeze queue timeout A driver may wish to take corrective action if queued requests do not complete within a set time. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8da2c04bb88f..a5e66a7a3506 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -81,6 +81,15 @@ void blk_mq_freeze_queue_wait(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); +int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, + unsigned long timeout) +{ + return wait_event_timeout(q->mq_freeze_wq, + percpu_ref_is_zero(&q->q_usage_counter), + timeout); +} +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); + /* * Guarantee no request is in use, so we can change any data structure of * the queue afterward. -- cgit v1.2.3 From 113285b473824922498d07d7f82459507b9792eb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Mar 2017 13:26:04 -0700 Subject: blk-mq: ensure that bd->last is always set correctly When drivers are called with a request in blk-mq, blk-mq flags the state such that the driver knows if this is the last request in this call chain or not. The driver can then use that information to defer kicking off IO until bd->last is true. However, with blk-mq and scheduling, we need to allocate a driver tag for a request before it can be issued. If we fail to allocate such a tag, we could end up in the situation where the last request issued did not have bd->last == true set. This can then cause a driver hang. This fixes a hang with virtio-blk, which uses bd->last as a hint on whether to kick the queue or not. Reported-by: Chris Mason Tested-by: Chris Mason Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index a5e66a7a3506..e797607dab89 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -876,12 +876,9 @@ done: return false; } -static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) +static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) { - if (rq->tag == -1 || rq->internal_tag == -1) - return; - blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); rq->tag = -1; @@ -891,6 +888,26 @@ static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, } } +static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + __blk_mq_put_driver_tag(hctx, rq); +} + +static void blk_mq_put_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + __blk_mq_put_driver_tag(hctx, rq); +} + /* * If we fail getting a driver tag because all the driver tags are already * assigned and on the dispatch list, BUT the first entry does not have a @@ -1000,7 +1017,19 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) bd.rq = rq; bd.list = dptr; - bd.last = list_empty(list); + + /* + * Flag last if we have no more requests, or if we have more + * but can't assign a driver tag to it. + */ + if (list_empty(list)) + bd.last = true; + else { + struct request *nxt; + + nxt = list_first_entry(list, struct request, queuelist); + bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); + } ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { @@ -1008,7 +1037,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) queued++; break; case BLK_MQ_RQ_QUEUE_BUSY: - blk_mq_put_driver_tag(hctx, rq); + blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -1038,6 +1067,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) * that is where we will continue on next queue run. */ if (!list_empty(list)) { + /* + * If we got a driver tag for the next request already, + * free it again. + */ + rq = list_first_entry(list, struct request, queuelist); + blk_mq_put_driver_tag(rq); + spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); -- cgit v1.2.3