From f441108fa08c466d986a7dca776f59dabab58456 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:17 -0700 Subject: block: Remove a superfluous cast from blkdev_report_zones() No cast is necessary when assigning a non-void pointer to a void pointer. Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 51000914e23f..c461cf63f1f4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -200,7 +200,7 @@ int blkdev_report_zones(struct block_device *bdev, /* Get header in the first page */ ofst = 0; if (!nr_rep) { - hdr = (struct blk_zone_report_hdr *) addr; + hdr = addr; nr_rep = hdr->nr_zones; ofst = sizeof(struct blk_zone_report_hdr); } -- cgit v1.2.3 From b3e7e7d2d668de0102264302a4d10dd9d4438a42 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:18 -0700 Subject: include/uapi/linux/blkzoned.h: Remove a superfluous __packed directive Using the __packed directive for a structure that does not need it is wrong because it makes gcc generate suboptimal code on some architectures. Hence remove the __packed directive from the blk_zone_report structure definition. See also http://digitalvampire.org/blog/index.php/2006/07/31/why-you-shouldnt-use-__attribute__packed/. Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index e3c70fe6bf0f..ff5a5db8906a 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -117,7 +117,7 @@ struct blk_zone_report { __u32 nr_zones; __u8 reserved[4]; struct blk_zone zones[0]; -} __packed; +}; /** * struct blk_zone_range - BLKRESETZONE ioctl request -- cgit v1.2.3 From 6b1d83d274486615cc341e410467a98fd9c27c0a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:19 -0700 Subject: block: Remove bdev_nr_zones() Remove this function since it has no callers. This function was introduced in commit 6cc77e9cb080 ("block: introduce zoned block devices zone write locking"). Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Christoph Hellwig Cc: Matias Bjorling Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 79226ca8f80f..49a400afb146 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1639,15 +1639,6 @@ static inline unsigned int bdev_zone_sectors(struct block_device *bdev) return 0; } -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return blk_queue_nr_zones(q); - return 0; -} - static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3 From 7c8542b7982264226cf94102950343185869b584 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:20 -0700 Subject: block: Inline blk_queue_nr_zones() Since the implementation of blk_queue_nr_zones() is trivial and since it only has a single caller, inline this function. 
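For reference, the helper being folded into its single caller is the trivial accessor below, exactly as it appears in the removal hunk of this patch:

	static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
	{
		return q->nr_zones;
	}

After this change the debugfs code reads q->nr_zones directly, which is just as clear and removes one layer of indirection.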
Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- include/linux/blkdev.h | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 1c4532e92938..26e1f8e425a8 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -214,7 +214,7 @@ static int queue_zone_wlock_show(void *data, struct seq_file *m) if (!q->seq_zones_wlock) return 0; - for (i = 0; i < blk_queue_nr_zones(q); i++) + for (i = 0; i < q->nr_zones; i++) if (test_bit(i, q->seq_zones_wlock)) seq_printf(m, "%u\n", i); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49a400afb146..905daa7c647e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -800,11 +800,6 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } -static inline unsigned int blk_queue_nr_zones(struct request_queue *q) -{ - return q->nr_zones; -} - static inline unsigned int blk_queue_zone_no(struct request_queue *q, sector_t sector) { -- cgit v1.2.3 From 6a5ac9846508ad7d1d23881d9d5add35f2e6ae71 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 15 Jun 2018 14:55:21 -0700 Subject: block: Make struct request_queue smaller for CONFIG_BLK_DEV_ZONED=n Exclude zoned block device members from struct request_queue for CONFIG_BLK_DEV_ZONED == n. Avoid breaking the build by only building the code that uses these struct request_queue members if CONFIG_BLK_DEV_ZONED != n. Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Cc: Matias Bjorling Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Kconfig | 4 ++++ block/Makefile | 1 + block/blk-mq-debugfs-zoned.c | 24 ++++++++++++++++++++++++ block/blk-mq-debugfs.c | 15 --------------- block/blk-mq-debugfs.h | 9 +++++++++ include/linux/blkdev.h | 6 ++++++ 6 files changed, 44 insertions(+), 15 deletions(-) create mode 100644 block/blk-mq-debugfs-zoned.c diff --git a/block/Kconfig b/block/Kconfig index eb50fd4977c2..dfe7bc770fc9 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -177,6 +177,10 @@ config BLK_DEBUG_FS Unless you are building a kernel for a tiny system, you should say Y here. +config BLK_DEBUG_FS_ZONED + bool + default BLK_DEBUG_FS && BLK_DEV_ZONED + config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" ---help--- diff --git a/block/Makefile b/block/Makefile index 6a56303b9925..a8f94cdb75c3 100644 --- a/block/Makefile +++ b/block/Makefile @@ -34,4 +34,5 @@ obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o +obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c new file mode 100644 index 000000000000..fb2c82c351e4 --- /dev/null +++ b/block/blk-mq-debugfs-zoned.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Western Digital Corporation or its affiliates. + * + * This file is released under the GPL. 
+ */ + +#include <linux/blkdev.h> +#include "blk-mq-debugfs.h" + +int queue_zone_wlock_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + unsigned int i; + + if (!q->seq_zones_wlock) + return 0; + + for (i = 0; i < q->nr_zones; i++) + if (test_bit(i, q->seq_zones_wlock)) + seq_printf(m, "%u\n", i); + + return 0; +} diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 26e1f8e425a8..7efe268e4447 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -206,21 +206,6 @@ static ssize_t queue_write_hint_store(void *data, const char __user *buf, return count; } -static int queue_zone_wlock_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - unsigned int i; - - if (!q->seq_zones_wlock) - return 0; - - for (i = 0; i < q->nr_zones; i++) - if (test_bit(i, q->seq_zones_wlock)) - seq_printf(m, "%u\n", i); - - return 0; -} - static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index b9d366e57097..a9160be12be0 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -80,4 +80,13 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc } #endif +#ifdef CONFIG_BLK_DEBUG_FS_ZONED +int queue_zone_wlock_show(void *data, struct seq_file *m); +#else +static inline int queue_zone_wlock_show(void *data, struct seq_file *m) +{ + return 0; +} +#endif + #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 905daa7c647e..ca5a8b046894 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -592,6 +592,7 @@ struct request_queue { struct queue_limits limits; +#ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information for request dispatch control. * nr_zones is the total number of zones of the device. This is always @@ -612,6 +613,7 @@ struct request_queue { unsigned int nr_zones; unsigned long *seq_zones_bitmap; unsigned long *seq_zones_wlock; +#endif /* CONFIG_BLK_DEV_ZONED */ /* * sg stuff @@ -800,6 +802,7 @@ static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) { return blk_queue_is_zoned(q) ? 
q->limits.chunk_sectors : 0; } +#ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_queue_zone_no(struct request_queue *q, sector_t sector) { @@ -815,6 +818,7 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q, return false; return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap); } +#endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) { @@ -1065,6 +1069,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } +#ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_rq_zone_no(struct request *rq) { return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); } @@ -1074,6 +1079,7 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq) { return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); } +#endif /* CONFIG_BLK_DEV_ZONED */ /* * Some commands like WRITE SAME have a payload or data transfer size which -- cgit v1.2.3 From 0471559c2fbd2c19d183fc0f51ce88aefa0a13c8 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jun 2018 21:55:34 +0200 Subject: block, bfq: add/remove entity weights correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To keep I/O throughput high as often as possible, BFQ performs I/O-dispatch plugging (aka device idling) only when beneficial exactly for throughput, or when needed for service guarantees (low latency, fairness). An important case where the latter condition holds is when the scenario is 'asymmetric' in terms of weights: i.e., when some bfq_queue or whole group of queues has a higher weight, and thus has to receive more service, than other queues or groups. Without dispatch plugging, lower-weight queues/groups may unjustly steal bandwidth from higher-weight queues/groups. To detect asymmetric scenarios, BFQ checks some sufficient conditions. One of these conditions is that active groups have different weights. BFQ controls this condition by maintaining a special set of unique weights of active groups (group_weights_tree). To this purpose, in the function bfq_active_insert/bfq_active_extract BFQ adds/removes the weight of a group to/from this set. Unfortunately, the function bfq_active_extract may happen to be invoked also for a group that is still active (to preserve the correct update of the next queue to serve, see comments in function bfq_no_longer_next_in_service() for details). In this case, removing the weight of the group makes the set group_weights_tree inconsistent. Service-guarantee violations follow. This commit addresses this issue by moving group_weights_tree insertions from their previous location (in bfq_active_insert) into the function __bfq_activate_entity, and by moving group_weights_tree extractions from bfq_active_extract to when the entity that represents a group remains thoroughly idle, i.e., with no request either enqueued or dispatched. Tested-by: Holger Hoffstätte Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 45 +++++++++++++++++++++++++++++++++++++++++---- block/bfq-iosched.h | 7 +++++-- block/bfq-wf2q.c | 24 +++++++++++++----------- 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 495b9ddb3355..3f32e88c7e9b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -742,8 +742,9 @@ inc_counter: * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. 
*/ -void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, - struct rb_root *root) +void __bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) { if (!entity->weight_counter) return; @@ -759,6 +760,43 @@ reset_entity_pointer: entity->weight_counter = NULL; } +/* + * Invoke __bfq_weights_tree_remove on bfqq and all its inactive + * parent entities. + */ +void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = bfqq->entity.parent; + + __bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->my_sched_data; + + if (sd->next_in_service || sd->in_service_entity) { + /* + * entity is still active, because either + * next_in_service or in_service_entity is not + * NULL (see the comments on the definition of + * next_in_service for details on why + * in_service_entity must be checked too). + * + * As a consequence, the weight of entity is + * not to be removed. In addition, if entity + * is active, then its parent entities are + * active as well, and thus their weights are + * not to be removed either. In the end, this + * loop must stop here. + */ + break; + } + __bfq_weights_tree_remove(bfqd, entity, + &bfqd->group_weights_tree); + } +} + /* * Return expired entry, or NULL to just start from scratch in rbtree. */ @@ -4582,8 +4620,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) */ bfqq->budget_timeout = jiffies; - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); + bfq_weights_tree_remove(bfqd, bfqq); } now_ns = ktime_get_ns(); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 0f712e03b035..a8a2e5aca4d4 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -827,8 +827,11 @@ struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity, struct rb_root *root); -void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity, - struct rb_root *root); +void __bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); +void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq); void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4498c43245e2..58cf38fcee05 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -499,9 +499,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); #ifdef CONFIG_BFQ_GROUP_IOSCHED - else /* bfq_group */ - bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); - if (bfqg != bfqd->root_group) bfqg->active_entities++; #endif @@ -601,10 +598,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, if (bfqq) list_del(&bfqq->bfqq_list); #ifdef CONFIG_BFQ_GROUP_IOSCHED - else /* bfq_group */ - bfq_weights_tree_remove(bfqd, entity, - &bfqd->group_weights_tree); - if (bfqg != bfqd->root_group) bfqg->active_entities--; #endif @@ -799,7 +792,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (prev_weight != new_weight) { root = bfqq ? 
&bfqd->queue_weights_tree : &bfqd->group_weights_tree; - bfq_weights_tree_remove(bfqd, entity, root); + __bfq_weights_tree_remove(bfqd, entity, root); } entity->weight = new_weight; /* @@ -971,7 +964,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, * one of its children receives a new request. * * Basically, this function updates the timestamps of entity and - * inserts entity into its active tree, ater possibly extracting it + * inserts entity into its active tree, after possibly extracting it * from its idle tree. */ static void __bfq_activate_entity(struct bfq_entity *entity, @@ -1015,6 +1008,16 @@ static void __bfq_activate_entity(struct bfq_entity *entity, entity->on_st = true; } +#ifdef BFQ_GROUP_IOSCHED_ENABLED + if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_weights_tree_add(bfqg->bfqd, entity, + &bfqd->group_weights_tree); + } +#endif + bfq_update_fin_time_enqueue(entity, st, backshifted); } @@ -1664,8 +1667,7 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->busy_queues--; if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); + bfq_weights_tree_remove(bfqd, bfqq); if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues--; -- cgit v1.2.3 From 4420b095cc474759f6fbdb6351648c7ff9833a54 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jun 2018 21:55:35 +0200 Subject: block, bfq: do not expire a queue that will deserve dispatch plugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some bfq_queues, BFQ plugs I/O dispatching when the queue becomes idle, and keeps the plug until a new request of the queue arrives, or a timeout fires. BFQ does so either to boost throughput or to preserve service guarantees for the queue. More precisely, for such a queue, plugging starts when the queue happens to have either no request enqueued, or no request in flight, that is, no request already dispatched but not yet completed. On the opposite end, BFQ may happen to expire a queue with no request enqueued, without doing any plugging, if the queue still has some request in flight. Unfortunately, such a premature expiration causes the queue to lose its chance to enjoy dispatch plugging a moment later, i.e., when its in-flight requests finally get completed. This breaks service guarantees for the queue. This commit prevents BFQ from expiring an empty queue if the latter still has in-flight requests. Tested-by: Holger Hoffstätte Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3f32e88c7e9b..4fd4f1996498 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3597,8 +3597,14 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + /* + * Do not expire bfqq for budget timeout if bfqq may be about + * to enjoy device idling. The reason why, in this case, we + * prevent bfqq from expiring is the same as in the comments + * on the case where bfq_bfqq_must_idle() returns true, in + * bfq_completed_request(). 
+ */ if (bfq_may_expire_for_budg_timeout(bfqq) && - !bfq_bfqq_wait_request(bfqq) && !bfq_bfqq_must_idle(bfqq)) goto expire; @@ -4674,8 +4680,32 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) * or if we want to idle in case it has no pending requests. */ if (bfqd->in_service_queue == bfqq) { - if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); + if (bfq_bfqq_must_idle(bfqq)) { + if (bfqq->dispatched == 0) + bfq_arm_slice_timer(bfqd); + /* + * If we get here, we do not expire bfqq, even + * if bfqq was in budget timeout or had no + * more requests (as controlled in the next + * conditional instructions). The reason for + * not expiring bfqq is as follows. + * + * Here bfqq->dispatched > 0 holds, but + * bfq_bfqq_must_idle() returned true. This + * implies that, even if no request arrives + * for bfqq before bfqq->dispatched reaches 0, + * bfqq will, however, not be expired on the + * completion event that causes bfqq->dispatch + * to reach zero. In contrast, on this event, + * bfqq will start enjoying device idling + * (I/O-dispatch plugging). + * + * But, if we expired bfqq here, bfqq would + * not have the chance to enjoy device idling + * when bfqq->dispatched finally reaches + * zero. This would expose bfqq to violation + * of its reserved service guarantees. + */ return; } else if (bfq_may_expire_for_budg_timeout(bfqq)) bfq_bfqq_expire(bfqd, bfqq, false, -- cgit v1.2.3 From 9fae8dd59ff3d9c19570cbddf12e87d7bb66c8a2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jun 2018 21:55:36 +0200 Subject: block, bfq: fix service being wrongly set to zero in case of preemption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If - a bfq_queue Q preempts another queue, because one request of Q arrives in time, - but, after this preemption, Q is not the queue that is set in service, then Q->entity.service is set to 0 when Q is eventually set in service. But Q should have continued receiving service with its old budget (which is why preemption has occurred) and its old service. This commit addresses this issue by resetting service on queue real expiration. Tested-by: Holger Hoffstätte Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 34 ++++++++++++++++++++++++++++------ block/bfq-wf2q.c | 6 ------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4fd4f1996498..d579cc8e0db6 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1382,18 +1382,30 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, * remain unchanged after such an expiration, and the * following statement therefore assigns to * entity->budget the remaining budget on such an - * expiration. For clarity, entity->service is not - * updated on expiration in any case, and, in normal - * operation, is reset only when bfqq is selected for - * service (see bfq_get_next_queue). + * expiration. */ entity->budget = min_t(unsigned long, bfq_bfqq_budget_left(bfqq), bfqq->max_budget); + /* + * At this point, we have used entity->service to get + * the budget left (needed for updating + * entity->budget). Thus we finally can, and have to, + * reset entity->service. The latter must be reset + * because bfqq would otherwise be charged again for + * the service it has received during its previous + * service slot(s). 
+ */ + entity->service = 0; + return true; } + /* + * We can finally complete expiration, by setting service to 0. + */ + entity->service = 0; entity->budget = max_t(unsigned long, bfqq->max_budget, bfq_serv_to_charge(bfqq->next_rq, bfqq)); bfq_clear_bfqq_non_blocking_wait_rq(bfqq); @@ -3271,11 +3283,21 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, ref = bfqq->ref; __bfq_bfqq_expire(bfqd, bfqq); + if (ref == 1) /* bfqq is gone, no more actions on it */ + return; + /* mark bfqq as waiting a request only if a bic still points to it */ - if (ref > 1 && !bfq_bfqq_busy(bfqq) && + if (!bfq_bfqq_busy(bfqq) && reason != BFQQE_BUDGET_TIMEOUT && - reason != BFQQE_BUDGET_EXHAUSTED) + reason != BFQQE_BUDGET_EXHAUSTED) { bfq_mark_bfqq_non_blocking_wait_rq(bfqq); + /* + * Not setting service to 0, because, if the next rq + * arrives in time, the queue will go on receiving + * service with this same budget (as if it never expired) + */ + } else + entity->service = 0; } /* diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 58cf38fcee05..dbc07b456059 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1544,12 +1544,6 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) entity = sd->next_in_service; sd->in_service_entity = entity; - /* - * Reset the accumulator of the amount of service that - * the entity is about to receive. - */ - entity->service = 0; - /* * If entity is no longer a candidate for next * service, then it must be extracted from its active -- cgit v1.2.3 From 277a4a9b56cde0f3d53ea8abc0e43ff636820007 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 25 Jun 2018 21:55:37 +0200 Subject: block, bfq: give a better name to bfq_bfqq_may_idle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The actual goal of the function bfq_bfqq_may_idle is to tell whether it is better to perform device idling (more precisely: I/O-dispatch plugging) for the input bfq_queue, either to boost throughput or to preserve service guarantees. This commit improves the name of the function accordingly. Tested-by: Holger Hoffstätte Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d579cc8e0db6..41d9036b1822 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -634,7 +634,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) * The following function returns true if every queue must receive the * same share of the throughput (this condition is used when deciding * whether idling may be disabled, see the comments in the function - * bfq_bfqq_may_idle()). + * bfq_better_to_idle()). * * Such a scenario occurs when: * 1) all active queues have the same weight, @@ -3355,7 +3355,7 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) * issues taken into account are not trivial. We discuss these issues * individually while introducing the variables. 
*/ -static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) +static bool bfq_better_to_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; bool rot_without_queueing = @@ -3588,19 +3588,19 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) } /* - * If the in-service queue is empty but the function bfq_bfqq_may_idle + * If the in-service queue is empty but the function bfq_better_to_idle * returns true, then: * 1) the queue must remain in service and cannot be expired, and * 2) the device must be idled to wait for the possible arrival of a new * request for the queue. - * See the comments on the function bfq_bfqq_may_idle for the reasons + * See the comments on the function bfq_better_to_idle for the reasons * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_may_idle itself + * and preserve service guarantees when bfq_better_to_idle itself * returns true. */ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { - return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); } /* @@ -3686,7 +3686,7 @@ check_queue: * may idle after their completion, then keep it anyway. */ if (bfq_bfqq_wait_request(bfqq) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { + (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { bfqq = NULL; goto keep_queue; } @@ -4734,7 +4734,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) BFQQE_BUDGET_TIMEOUT); else if (RB_EMPTY_ROOT(&bfqq->sort_list) && (bfqq->dispatched == 0 || - !bfq_bfqq_may_idle(bfqq))) + !bfq_better_to_idle(bfqq))) bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_NO_MORE_REQUESTS); } -- cgit v1.2.3 From 8ab6bb9ee8d04ba56b9eb19cc7e4f56d0a43ad1a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:45 +0800 Subject: blk-mq: cleanup blk_mq_get_driver_tag() We never pass 'wait' as true to blk_mq_get_driver_tag(), and hence we never change '**hctx' as well. The last use of these went away with the flush cleanup, commit 0c2a6fe4dc3e. So cleanup the usage and remove the two extra parameters. Cc: Bart Van Assche Cc: Christoph Hellwig Tested-by: Andrew Jones Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++------------ block/blk-mq.h | 3 +-- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 95919268564b..ae8a6b2c7c22 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -964,17 +964,14 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } -bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, - bool wait) +bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_alloc_data data = { .q = rq->q, .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), - .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, + .flags = BLK_MQ_REQ_NOWAIT, }; - might_sleep_if(wait); - if (rq->tag != -1) goto done; @@ -991,8 +988,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, } done: - if (hctx) - *hctx = data.hctx; return rq->tag != -1; } @@ -1034,7 +1029,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. 
*/ - return blk_mq_get_driver_tag(rq, hctx, false); + return blk_mq_get_driver_tag(rq); } wait = &this_hctx->dispatch_wait; @@ -1055,7 +1050,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, * allocation failure and adding the hardware queue to the wait * queue. */ - ret = blk_mq_get_driver_tag(rq, hctx, false); + ret = blk_mq_get_driver_tag(rq); if (!ret) { spin_unlock(&this_hctx->lock); return false; @@ -1105,7 +1100,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) break; - if (!blk_mq_get_driver_tag(rq, NULL, false)) { + if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The @@ -1137,7 +1132,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, bd.last = true; else { nxt = list_first_entry(list, struct request, queuelist); - bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); + bd.last = !blk_mq_get_driver_tag(nxt); } ret = q->mq_ops->queue_rq(hctx, &bd); @@ -1700,7 +1695,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (!blk_mq_get_dispatch_budget(hctx)) goto insert; - if (!blk_mq_get_driver_tag(rq, NULL, false)) { + if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); goto insert; } diff --git a/block/blk-mq.h b/block/blk-mq.h index 89231e439b2f..23659f41bf2c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -36,8 +36,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, - bool wait); +bool blk_mq_get_driver_tag(struct request *rq); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); -- cgit v1.2.3 From 2278d69f030f6cb7fdacba6281a46fb9d637d2aa Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:46 +0800 Subject: blk-mq: don't pass **hctx to blk_mq_mark_tag_wait() 'hctx' won't be changed at all, so not necessary to pass '**hctx' to blk_mq_mark_tag_wait(). Cc: Christoph Hellwig Cc: Bart Van Assche Tested-by: Andrew Jones Reviewed-by: Omar Sandoval Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index ae8a6b2c7c22..9eee896a3592 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1009,17 +1009,16 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, * restart. For both cases, take care to check the condition again after * marking us as waiting. 
*/ -static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, +static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *this_hctx = *hctx; struct sbq_wait_state *ws; wait_queue_entry_t *wait; bool ret; - if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state); + if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* * It's possible that a tag was freed in the window between the @@ -1032,17 +1031,17 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, return blk_mq_get_driver_tag(rq); } - wait = &this_hctx->dispatch_wait; + wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; - spin_lock(&this_hctx->lock); + spin_lock(&hctx->lock); if (!list_empty(&wait->entry)) { - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->lock); return false; } - ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx); + ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); add_wait_queue(&ws->wait, wait); /* @@ -1052,7 +1051,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, */ ret = blk_mq_get_driver_tag(rq); if (!ret) { - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->lock); return false; } @@ -1063,7 +1062,7 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx, spin_lock_irq(&ws->wait.lock); list_del_init(&wait->entry); spin_unlock_irq(&ws->wait.lock); - spin_unlock(&this_hctx->lock); + spin_unlock(&hctx->lock); return true; } @@ -1108,7 +1107,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * before we add this entry back on the dispatch list, * we'll re-run it below. */ - if (!blk_mq_mark_tag_wait(&hctx, rq)) { + if (!blk_mq_mark_tag_wait(hctx, rq)) { blk_mq_put_dispatch_budget(hctx); /* * For non-shared tags, the RESTART check -- cgit v1.2.3 From 5815839b3ca16bb1d45939270871169f6803a121 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:47 +0800 Subject: blk-mq: introduce new lock for protecting hctx->dispatch_wait Now hctx->lock is only acquired when adding hctx->dispatch_wait to one wait queue, but not held when removing it from the wait queue. IO hang can be observed easily if SCHED RESTART is disabled, that means now RESTART exits just for fixing the issue in blk_mq_mark_tag_wait(). This patch fixes the issue by introducing hctx->dispatch_wait_lock and holding it for removing hctx->dispatch_wait in blk_mq_dispatch_wake(), since we need to avoid acquiring hctx->lock in irq context. 
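The resulting lock nesting is illustrated below (an informal sketch of the ordering this patch establishes, not verbatim kernel code): the irq-disabling wait-queue lock is taken first, and dispatch_wait_lock nests inside it, so blk_mq_dispatch_wake(), which runs with the wait-queue lock already held and interrupts disabled, can safely take dispatch_wait_lock:

	/* blk_mq_mark_tag_wait(): process context */
	spin_lock_irq(&wq->lock);		/* outer: wait-queue lock, irqs off */
	spin_lock(&hctx->dispatch_wait_lock);	/* inner: protects hctx->dispatch_wait */
	/* ... add hctx->dispatch_wait to wq, or remove it after getting a tag ... */
	spin_unlock(&hctx->dispatch_wait_lock);
	spin_unlock_irq(&wq->lock);

	/* blk_mq_dispatch_wake(): called with wq->lock held, irqs disabled */
	spin_lock(&hctx->dispatch_wait_lock);
	list_del_init(&wait->entry);
	spin_unlock(&hctx->dispatch_wait_lock);
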
Fixes: eb619fdb2d4cb8b3d3419 ("blk-mq: fix issue with shared tag queue re-running") Cc: Christoph Hellwig Cc: Omar Sandoval Cc: Bart Van Assche Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 26 +++++++++++++++++--------- include/linux/blk-mq.h | 1 + 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 9eee896a3592..df84281f6af6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -998,7 +998,10 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); + spin_lock(&hctx->dispatch_wait_lock); list_del_init(&wait->entry); + spin_unlock(&hctx->dispatch_wait_lock); + blk_mq_run_hw_queue(hctx, true); return 1; } @@ -1012,7 +1015,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbq_wait_state *ws; + struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; @@ -1035,14 +1038,18 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, if (!list_empty_careful(&wait->entry)) return false; - spin_lock(&hctx->lock); + wq = &bt_wait_ptr(&hctx->tags->bitmap_tags, hctx)->wait; + + spin_lock_irq(&wq->lock); + spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { - spin_unlock(&hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return false; } - ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); - add_wait_queue(&ws->wait, wait); + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + __add_wait_queue(wq, wait); /* * It's possible that a tag was freed in the window between the @@ -1051,7 +1058,8 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, */ ret = blk_mq_get_driver_tag(rq); if (!ret) { - spin_unlock(&hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return false; } @@ -1059,10 +1067,9 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. */ - spin_lock_irq(&ws->wait.lock); list_del_init(&wait->entry); - spin_unlock_irq(&ws->wait.lock); - spin_unlock(&hctx->lock); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); return true; } @@ -2142,6 +2149,7 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->nr_ctx = 0; + spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e3147eb74222..ea690254dab7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -39,6 +39,7 @@ struct blk_mq_hw_ctx { struct blk_mq_ctx **ctxs; unsigned int nr_ctx; + spinlock_t dispatch_wait_lock; wait_queue_entry_t dispatch_wait; atomic_t wait_index; -- cgit v1.2.3 From 97889f9ac24f8d2fc8e703ea7f80c162bab10d4d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:48 +0800 Subject: blk-mq: remove synchronize_rcu() from blk_mq_del_queue_tag_set() We have to remove synchronize_rcu() from blk_queue_cleanup(), otherwise long delay can be caused during lun probe. For removing it, we have to avoid to iterate the set->tag_list in IO path, eg, blk_mq_sched_restart(). This patch reverts 5b79413946d (Revert "blk-mq: don't handle TAG_SHARED in restart"). 
Given we have fixed enough IO hang issue, and there isn't any reason to restart all queues in one tags any more, see the following reasons: 1) blk-mq core can deal with shared-tags case well via blk_mq_get_driver_tag(), which can wake up queues waiting for driver tag. 2) SCSI is a bit special because it may return BLK_STS_RESOURCE if queue, target or host is ready, but SCSI built-in restart can cover all these well, see scsi_end_request(), queue will be rerun after any request initiated from this host/target is completed. In my test on scsi_debug(8 luns), this patch may improve IOPS by 20% ~ 30% when running I/O on these 8 luns concurrently. Fixes: 705cda97ee3a ("blk-mq: Make it safe to use RCU to iterate over blk_mq_tag_set.tag_list") Cc: Omar Sandoval Cc: Bart Van Assche Cc: Christoph Hellwig Cc: Martin K. Petersen Cc: linux-scsi@vger.kernel.org Reported-by: Andrew Jones Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 85 +++----------------------------------------------- block/blk-mq.c | 10 ++---- include/linux/blkdev.h | 2 -- 3 files changed, 7 insertions(+), 90 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 56c493c6cd90..4e027f6108ae 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -59,29 +59,16 @@ static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; - - if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_inc(&q->shared_hctx_restart); - } else - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return false; - - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - struct request_queue *q = hctx->queue; - - if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); - } else - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + return; + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - return blk_mq_run_hw_queue(hctx, true); + blk_mq_run_hw_queue(hctx, true); } /* @@ -380,68 +367,6 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return false; } -/** - * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list - * @pos: loop cursor. - * @skip: the list element that will not be examined. Iteration starts at - * @skip->next. - * @head: head of the list to examine. This list must have at least one - * element, namely @skip. - * @member: name of the list_head structure within typeof(*pos). - */ -#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ - for ((pos) = (skip); \ - (pos = (pos)->member.next != (head) ? list_entry_rcu( \ - (pos)->member.next, typeof(*pos), member) : \ - list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ - (pos) != (skip); ) - -/* - * Called after a driver tag has been freed to check whether a hctx needs to - * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware - * queues in a round-robin fashion if the tag set of @hctx is shared with other - * hardware queues. 
- */ -void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) -{ - struct blk_mq_tags *const tags = hctx->tags; - struct blk_mq_tag_set *const set = hctx->queue->tag_set; - struct request_queue *const queue = hctx->queue, *q; - struct blk_mq_hw_ctx *hctx2; - unsigned int i, j; - - if (set->flags & BLK_MQ_F_TAG_SHARED) { - /* - * If this is 0, then we know that no hardware queues - * have RESTART marked. We're done. - */ - if (!atomic_read(&queue->shared_hctx_restart)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu_rr(q, queue, &set->tag_list, - tag_set_list) { - queue_for_each_hw_ctx(q, hctx2, i) - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - goto done; - } - j = hctx->queue_num + 1; - for (i = 0; i < queue->nr_hw_queues; i++, j++) { - if (j == queue->nr_hw_queues) - j = 0; - hctx2 = queue->queue_hw_ctx[j]; - if (hctx2->tags == tags && - blk_mq_sched_restart_hctx(hctx2)) - break; - } -done: - rcu_read_unlock(); - } else { - blk_mq_sched_restart_hctx(hctx); - } -} - void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async) { diff --git a/block/blk-mq.c b/block/blk-mq.c index df84281f6af6..7c6ff13171ef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2335,15 +2335,10 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (shared) { - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_inc(&q->shared_hctx_restart); + if (shared) hctx->flags |= BLK_MQ_F_TAG_SHARED; - } else { - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - atomic_dec(&q->shared_hctx_restart); + else hctx->flags &= ~BLK_MQ_F_TAG_SHARED; - } } } @@ -2374,7 +2369,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) blk_mq_update_tag_set_depth(set, false); } mutex_unlock(&set->tag_list_lock); - synchronize_rcu(); INIT_LIST_HEAD(&q->tag_set_list); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ca5a8b046894..9d05646d5059 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -442,8 +442,6 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ - atomic_t shared_hctx_restart; - struct blk_queue_stats *stats; struct rq_wb *rq_wb; -- cgit v1.2.3 From 1311326cf4755c7ffefd20f576144ecf46d9906b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 25 Jun 2018 19:31:49 +0800 Subject: blk-mq: avoid to synchronize rcu inside blk_cleanup_queue() SCSI probing may synchronously create and destroy a lot of request_queues for non-existent devices. Any synchronize_rcu() in queue creation or destroy path may introduce long latency during booting, see detailed description in comment of blk_register_queue(). This patch removes one synchronize_rcu() inside blk_cleanup_queue() for this case, commit c2856ae2f315d75(blk-mq: quiesce queue before freeing queue) needs synchronize_rcu() for implementing blk_mq_quiesce_queue(), but when queue isn't initialized, it isn't necessary to do that since only pass-through requests are involved, no original issue in scsi_execute() at all. Without this patch and previous one, it may take more 20+ seconds for virtio-scsi to complete disk probe. With the two patches, the time becomes less than 100ms. Fixes: c2856ae2f315d75 ("blk-mq: quiesce queue before freeing queue") Reported-by: Andrew Jones Cc: Omar Sandoval Cc: Bart Van Assche Cc: linux-scsi@vger.kernel.org Cc: "Martin K. 
Petersen" Cc: Christoph Hellwig Tested-by: Andrew Jones Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index f84a9b7b6f5a..947e7a4abd8c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -762,9 +762,13 @@ void blk_cleanup_queue(struct request_queue *q) * make sure all in-progress dispatch are completed because * blk_freeze_queue() can only complete all requests, and * dispatch may still be in-progress since we dispatch requests - * from more than one contexts + * from more than one contexts. + * + * No need to quiesce queue if it isn't initialized yet since + * blk_freeze_queue() should be enough for cases of passthrough + * request. */ - if (q->mq_ops) + if (q->mq_ops && blk_queue_init_done(q)) blk_mq_quiesce_queue(q); /* for synchronous bio-based driver finish in-flight integrity i/o */ -- cgit v1.2.3 From d05d199883b09cd34937ebb045adbed9098e9780 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 Jun 2018 15:51:00 -0700 Subject: drbd: Do not redefine __must_hold() Since __must_hold() is defined in , do not redefine it in DRBD. Compile-tested only. Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Philipp Reisner Cc: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index bc4ed2ed40a2..e35a234b0a8f 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -55,12 +55,10 @@ # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) # define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) # define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) -# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) #else # define __protected_by(x) # define __protected_read_by(x) # define __protected_write_by(x) -# define __must_hold(x) #endif /* shared module parameters, defined in drbd_main.c */ -- cgit v1.2.3 From 1954e9a998d59d08520d7d4bebeafb8f66ba0d0f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 27 Jun 2018 13:09:05 -0700 Subject: block: Document how blk_update_request() handles RQF_SPECIAL_PAYLOAD requests The payload of struct request is stored in the request.bio chain if the RQF_SPECIAL_PAYLOAD flag is not set and in request.special_vec if RQF_SPECIAL_PAYLOAD has been set. However, blk_update_request() iterates over req->bio whether or not RQF_SPECIAL_PAYLOAD has been set. Additionally, the RQF_SPECIAL_PAYLOAD flag is ignored by blk_rq_bytes() which means that the value returned by that function is incorrect if the RQF_SPECIAL_PAYLOAD flag has been set. It is not clear to me whether this is an oversight or whether this happened on purpose. Anyway, document that it is known that both functions ignore RQF_SPECIAL_PAYLOAD. See also commit f9d03f96b988 ("block: improve handling of the magic discard payload"). Reviewed-by: Christoph Hellwig Signed-off-by: Bart Van Assche Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 947e7a4abd8c..2ff8e131a892 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3056,6 +3056,10 @@ EXPORT_SYMBOL_GPL(blk_steal_bios); * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. 
* + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both + * blk_rq_bytes() and in blk_update_request(). + * * Return: * %false - this request doesn't have any more data * %true - this request has more data -- cgit v1.2.3 From e1a413245a564683697a3d02ec197b72cf009b89 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2018 09:56:08 +0800 Subject: Blktrace: bail out early if block debugfs is not configured Since @blk_debugfs_root cannot be configured dynamically, we can save a few memory allocations if it's not there. Signed-off-by: Liu Bo Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 987d9a9ae283..b951aa1fac61 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -494,6 +494,9 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts->buf_size || !buts->buf_nr) return -EINVAL; + if (!blk_debugfs_root) + return -ENOENT; + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -518,9 +521,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -ENOENT; - if (!blk_debugfs_root) - goto err; - dir = debugfs_lookup(buts->name, blk_debugfs_root); if (!dir) bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); -- cgit v1.2.3 From 43ada78781246cb36036f26158a645c17550ac54 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 29 Jun 2018 09:56:56 +0800 Subject: Block: blk-throttle: set low_valid immediately once one cgroup has io.low configured Once one cgroup has io.low configured, @low_valid becomes true and other cgroups won't switch it back whatsoever. Signed-off-by: Liu Bo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 82282e6fdcf8..63bb261811dd 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -579,8 +579,10 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) struct throtl_grp *tg = blkg_to_tg(blkg); if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || - tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) + tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { low_valid = true; + break; + } } rcu_read_unlock(); -- cgit v1.2.3 From b64a71a0130dd2a88b0fc36a3c0a4882f47813e8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Jul 2018 08:42:34 +0100 Subject: block/floppy: remove redundant variable dflags Variable dflags is being assigned but is never used hence it is redundant and can be removed. 
Cleans up clang warning: warning: variable 'dflags' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 8871b5044d9e..48f622728ce6 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -1461,7 +1461,6 @@ static void setup_rw_floppy(void) int i; int r; int flags; - int dflags; unsigned long ready_date; void (*function)(void); @@ -1485,8 +1484,6 @@ static void setup_rw_floppy(void) if (fd_wait_for_completion(ready_date, function)) return; } - dflags = DRS->flags; - if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE)) setup_DMA(); -- cgit v1.2.3 From f4354a94e2097fe87a14d47ff502754bb547029a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Jul 2018 08:47:06 +0100 Subject: loop: remove redundant pointer inode Pointer inode is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'inode' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 4cb1d1be3cfb..bae472646e4a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -690,7 +690,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, unsigned int arg) { struct file *file, *old_file; - struct inode *inode; int error; error = -ENXIO; @@ -711,7 +710,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (error) goto out_putf; - inode = file->f_mapping->host; old_file = lo->lo_backing_file; error = -EINVAL; -- cgit v1.2.3 From e84422cdf3caa9cf35e625076dc62977f0023992 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Jul 2018 08:13:59 +0100 Subject: partitions/ldm: remove redundant pointer dgrp Pointer dgrp is being assigned but is never used hence it is redundant and can be removed. Cleans up clang warning: warning: variable 'dgrp' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- block/partitions/ldm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 0417937dfe99..16766f267559 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -830,7 +830,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) { char buf[64]; int r_objid, r_name, r_id1, r_id2, len; - struct vblk_dgrp *dgrp; BUG_ON (!buffer || !vb); @@ -853,8 +852,6 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) if (len != get_unaligned_be32(buffer + 0x14)) return false; - dgrp = &vb->vblk.dgrp; - ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); return true; } -- cgit v1.2.3 From 5efac89c849849ad3a959224eb711f9c311e5bde Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Jul 2018 09:14:19 +0100 Subject: paride: remove redundant variable n Variable n is being assigned but is never used hence it is redundant and can be removed. Also put spacing between variables in declaration to clean up checkpatch warnings. 
Cleans up clang warning: warning: variable 'n' set but not used [-Wunused-but-set-variable] Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- drivers/block/paride/bpck.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c index 4f27e7392e38..f5f63ca2889d 100644 --- a/drivers/block/paride/bpck.c +++ b/drivers/block/paride/bpck.c @@ -347,7 +347,7 @@ static int bpck_test_proto( PIA *pi, char * scratch, int verbose ) static void bpck_read_eeprom ( PIA *pi, char * buf ) -{ int i,j,k,n,p,v,f, om, od; +{ int i, j, k, p, v, f, om, od; bpck_force_spp(pi); @@ -356,7 +356,6 @@ static void bpck_read_eeprom ( PIA *pi, char * buf ) bpck_connect(pi); - n = 0; WR(4,0); for (i=0;i<64;i++) { WR(6,8); -- cgit v1.2.3 From 0da73d00ca111a6175825a00d94dbeae185f6d7e Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Mon, 2 Jul 2018 23:46:43 +0900 Subject: blk-mq: code clean-up by adding an API to clear set->mq_map set->mq_map is currently cleared if something goes wrong when establishing a queue map in blk-mq-pci.c. It's also cleared before updating a queue map in blk_mq_update_queue_map(). This patch provides an API to clear set->mq_map to make it clear. Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- block/blk-mq-pci.c | 5 +++-- block/blk-mq.c | 4 +--- block/blk-mq.h | 8 ++++++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index e233996bb76f..db644ec624f5 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -17,6 +17,8 @@ #include <linux/pci.h> #include <linux/module.h> +#include "blk-mq.h" + /** * blk_mq_pci_map_queues - provide a default queue mapping for PCI device * @set: tagset to provide the mapping for @@ -48,8 +50,7 @@ int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, fallback: WARN_ON_ONCE(set->nr_hw_queues > 1); - for_each_possible_cpu(cpu) - set->mq_map[cpu] = 0; + blk_mq_clear_mq_map(set); return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 7c6ff13171ef..3cc074ae5c59 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2683,7 +2683,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { if (set->ops->map_queues) { - int cpu; /* * transport .map_queues is usually done in the following * way: @@ -2698,8 +2697,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) * killing stale mapping since one CPU may not be mapped * to any hw queue. */ - for_each_possible_cpu(cpu) - set->mq_map[cpu] = 0; + blk_mq_clear_mq_map(set); return set->ops->map_queues(set); } else diff --git a/block/blk-mq.h b/block/blk-mq.h index 23659f41bf2c..bc2b24735ed4 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -202,4 +202,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(hctx, rq); } +static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set) +{ + int cpu; + + for_each_possible_cpu(cpu) + set->mq_map[cpu] = 0; +} + #endif -- cgit v1.2.3 From c018c84fdb453ae057f3bcc87a1f1f730d41628b Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 30 Jun 2018 22:12:41 +0900 Subject: blk-mq: fix typo in a function comment Fix typo in a function blk_mq_alloc_tag_set() comment. if if it too large -> if it's too large. 
Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3cc074ae5c59..acf31ad733bf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2707,7 +2707,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the - * requested depth down, if if it too large. In that case, the set + * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) -- cgit v1.2.3 From 3f0cedc7e9a0b32e79c79d2aac0c96d2b870ae55 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 2 Jul 2018 17:35:58 +0800 Subject: blk-mq: use list_splice_tail_init() to insert requests list_splice_tail_init() is much more faster than inserting each request one by one, given all requets in 'list' belong to same sw queue and ctx->lock is required to insert requests. Cc: Laurence Oberman Cc: Omar Sandoval Cc: Bart Van Assche Tested-by: Kashyap Desai Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index acf31ad733bf..795ba859b16b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1545,19 +1545,19 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list) { + struct request *rq; + /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ - spin_lock(&ctx->lock); - while (!list_empty(list)) { - struct request *rq; - - rq = list_first_entry(list, struct request, queuelist); + list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - list_del_init(&rq->queuelist); - __blk_mq_insert_req_list(hctx, rq, false); + trace_block_rq_insert(hctx->queue, rq); } + + spin_lock(&ctx->lock);