From c3077b5d97a39223a2d4b95a21ccff660836170f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:41 +0200 Subject: blk-mq: merge blk-softirq.c into blk-mq.c __blk_complete_request is only called from the blk-mq code, and duplicates a lot of code from blk-mq.c. Move it there to prepare for better code sharing and simplifications. Signed-off-by: Christoph Hellwig Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-mq.c | 135 +++++++++++++++++++++++++++++++++++++++++++++ block/blk-softirq.c | 156 ---------------------------------------------------- 3 files changed, 136 insertions(+), 157 deletions(-) delete mode 100644 block/blk-softirq.c (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 78719169fb2a..8d841f5f986f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ + blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o diff --git a/block/blk-mq.c b/block/blk-mq.c index a9aa6d1e44cf..60febbf6f8d9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -41,6 +41,8 @@ #include "blk-mq-sched.h" #include "blk-rq-qos.h" +static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -574,6 +576,130 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } EXPORT_SYMBOL(blk_mq_end_request); +/* + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +static __latent_entropy void blk_done_softirq(struct softirq_action *h) +{ + struct list_head *cpu_list, local_list; + + local_irq_disable(); + cpu_list = this_cpu_ptr(&blk_cpu_done); + list_replace_init(cpu_list, &local_list); + local_irq_enable(); + + while (!list_empty(&local_list)) { + struct request *rq; + + rq = list_entry(local_list.next, struct request, ipi_list); + list_del_init(&rq->ipi_list); + rq->q->mq_ops->complete(rq); + } +} + +#ifdef CONFIG_SMP +static void trigger_softirq(void *data) +{ + struct request *rq = data; + struct list_head *list; + + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&rq->ipi_list, list); + + if (list->next == &rq->ipi_list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); +} + +/* + * Setup and invoke a run of 'trigger_softirq' on the given cpu. 
+ */ +static int raise_blk_irq(int cpu, struct request *rq) +{ + if (cpu_online(cpu)) { + call_single_data_t *data = &rq->csd; + + data->func = trigger_softirq; + data->info = rq; + data->flags = 0; + + smp_call_function_single_async(cpu, data); + return 0; + } + + return 1; +} +#else /* CONFIG_SMP */ +static int raise_blk_irq(int cpu, struct request *rq) +{ + return 1; +} +#endif + +static int blk_softirq_cpu_dead(unsigned int cpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + this_cpu_ptr(&blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); + + return 0; +} + +static void __blk_complete_request(struct request *req) +{ + struct request_queue *q = req->q; + int cpu, ccpu = req->mq_ctx->cpu; + unsigned long flags; + bool shared = false; + + BUG_ON(!q->mq_ops->complete); + + local_irq_save(flags); + cpu = smp_processor_id(); + + /* + * Select completion CPU + */ + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) { + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) + shared = cpus_share_cache(cpu, ccpu); + } else + ccpu = cpu; + + /* + * If current CPU and requested CPU share a cache, run the softirq on + * the current CPU. One might concern this is just like + * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is + * running in interrupt handler, and currently I/O controller doesn't + * support multiple interrupts, so current CPU is unique actually. This + * avoids IPI sending from current CPU to the first CPU of a group. + */ + if (ccpu == cpu || shared) { + struct list_head *list; +do_local: + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&req->ipi_list, list); + + /* + * if the list only contains our just added request, + * signal a raise of the softirq. If there are already + * entries there, someone already raised the irq but it + * hasn't run yet. + */ + if (list->next == &req->ipi_list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); + } else if (raise_blk_irq(ccpu, req)) + goto do_local; + + local_irq_restore(flags); +} + static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; @@ -3760,6 +3886,15 @@ EXPORT_SYMBOL(blk_mq_rq_cpu); static int __init blk_mq_init(void) { + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", diff --git a/block/blk-softirq.c b/block/blk-softirq.c deleted file mode 100644 index 6e7ec87d49fa..000000000000 --- a/block/blk-softirq.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions related to softirq rq completions - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "blk.h" - -static DEFINE_PER_CPU(struct list_head, blk_cpu_done); - -/* - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. 
- */ -static __latent_entropy void blk_done_softirq(struct softirq_action *h) -{ - struct list_head *cpu_list, local_list; - - local_irq_disable(); - cpu_list = this_cpu_ptr(&blk_cpu_done); - list_replace_init(cpu_list, &local_list); - local_irq_enable(); - - while (!list_empty(&local_list)) { - struct request *rq; - - rq = list_entry(local_list.next, struct request, ipi_list); - list_del_init(&rq->ipi_list); - rq->q->mq_ops->complete(rq); - } -} - -#ifdef CONFIG_SMP -static void trigger_softirq(void *data) -{ - struct request *rq = data; - struct list_head *list; - - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&rq->ipi_list, list); - - if (list->next == &rq->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); -} - -/* - * Setup and invoke a run of 'trigger_softirq' on the given cpu. - */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - if (cpu_online(cpu)) { - call_single_data_t *data = &rq->csd; - - data->func = trigger_softirq; - data->info = rq; - data->flags = 0; - - smp_call_function_single_async(cpu, data); - return 0; - } - - return 1; -} -#else /* CONFIG_SMP */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - return 1; -} -#endif - -static int blk_softirq_cpu_dead(unsigned int cpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - - return 0; -} - -void __blk_complete_request(struct request *req) -{ - struct request_queue *q = req->q; - int cpu, ccpu = req->mq_ctx->cpu; - unsigned long flags; - bool shared = false; - - BUG_ON(!q->mq_ops->complete); - - local_irq_save(flags); - cpu = smp_processor_id(); - - /* - * Select completion CPU - */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) { - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ccpu); - } else - ccpu = cpu; - - /* - * If current CPU and requested CPU share a cache, run the softirq on - * the current CPU. One might concern this is just like - * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is - * running in interrupt handler, and currently I/O controller doesn't - * support multiple interrupts, so current CPU is unique actually. This - * avoids IPI sending from current CPU to the first CPU of a group. - */ - if (ccpu == cpu || shared) { - struct list_head *list; -do_local: - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&req->ipi_list, list); - - /* - * if the list only contains our just added request, - * signal a raise of the softirq. If there are already - * entries there, someone already raised the irq but it - * hasn't run yet. 
- */ - if (list->next == &req->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); - } else if (raise_blk_irq(ccpu, req)) - goto do_local; - - local_irq_restore(flags); -} - -static __init int blk_softirq_init(void) -{ - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); - - open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); - cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, - "block/softirq:dead", NULL, - blk_softirq_cpu_dead); - return 0; -} -subsys_initcall(blk_softirq_init); -- cgit v1.2.3 From 115243f5534c7b3980cc946e00f79740fdc0e068 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:42 +0200 Subject: blk-mq: factor out a helper to reise the block softirq Add a helper to deduplicate the logic that raises the block softirq. Signed-off-by: Christoph Hellwig Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/blk-mq.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 60febbf6f8d9..a261e145ddfb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -598,19 +598,27 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) } } -#ifdef CONFIG_SMP -static void trigger_softirq(void *data) +static void blk_mq_trigger_softirq(struct request *rq) { - struct request *rq = data; - struct list_head *list; + struct list_head *list = this_cpu_ptr(&blk_cpu_done); - list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&rq->ipi_list, list); + /* + * If the list only contains our just added request, signal a raise of + * the softirq. If there are already entries there, someone already + * raised the irq but it hasn't run yet. + */ if (list->next == &rq->ipi_list) raise_softirq_irqoff(BLOCK_SOFTIRQ); } +#ifdef CONFIG_SMP +static void trigger_softirq(void *data) +{ + blk_mq_trigger_softirq(data); +} + /* * Setup and invoke a run of 'trigger_softirq' on the given cpu. */ @@ -681,19 +689,8 @@ static void __blk_complete_request(struct request *req) * avoids IPI sending from current CPU to the first CPU of a group. */ if (ccpu == cpu || shared) { - struct list_head *list; do_local: - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&req->ipi_list, list); - - /* - * if the list only contains our just added request, - * signal a raise of the softirq. If there are already - * entries there, someone already raised the irq but it - * hasn't run yet. - */ - if (list->next == &req->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); + blk_mq_trigger_softirq(req); } else if (raise_blk_irq(ccpu, req)) goto do_local; -- cgit v1.2.3 From dea6f3993812c82b4dd5f61acd41c55a311a445f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:43 +0200 Subject: blk-mq: remove raise_blk_irq By open coding raise_blk_irq in the only caller, and replacing the ifdef CONFIG_SMP with an IS_ENABLED check the flow in the caller can be significantly simplified. 
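For readers unfamiliar with the idiom: IS_ENABLED() expands to a compile-time constant, so the compiler still parses and type-checks the SMP-only branch but discards it as dead code on !CONFIG_SMP builds, where an #ifdef would have hidden it from the compiler entirely. A minimal sketch of the pattern (generic code with hypothetical helper names, not taken from this patch):

	/* need_ipi(), complete_remote() and complete_local() are hypothetical. */
	static void complete_rq(struct request *rq)
	{
		if (IS_ENABLED(CONFIG_SMP) && need_ipi(rq))
			complete_remote(rq);	/* eliminated by the compiler on UP builds */
		else
			complete_local(rq);
	}
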
Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a261e145ddfb..ada55521601f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -613,37 +613,11 @@ static void blk_mq_trigger_softirq(struct request *rq) raise_softirq_irqoff(BLOCK_SOFTIRQ); } -#ifdef CONFIG_SMP static void trigger_softirq(void *data) { blk_mq_trigger_softirq(data); } -/* - * Setup and invoke a run of 'trigger_softirq' on the given cpu. - */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - if (cpu_online(cpu)) { - call_single_data_t *data = &rq->csd; - - data->func = trigger_softirq; - data->info = rq; - data->flags = 0; - - smp_call_function_single_async(cpu, data); - return 0; - } - - return 1; -} -#else /* CONFIG_SMP */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - return 1; -} -#endif - static int blk_softirq_cpu_dead(unsigned int cpu) { /* @@ -688,11 +662,17 @@ static void __blk_complete_request(struct request *req) * support multiple interrupts, so current CPU is unique actually. This * avoids IPI sending from current CPU to the first CPU of a group. */ - if (ccpu == cpu || shared) { -do_local: + if (IS_ENABLED(CONFIG_SMP) && + ccpu != cpu && !shared && cpu_online(ccpu)) { + call_single_data_t *data = &req->csd; + + data->func = trigger_softirq; + data->info = req; + data->flags = 0; + smp_call_function_single_async(cpu, data); + } else { blk_mq_trigger_softirq(req); - } else if (raise_blk_irq(ccpu, req)) - goto do_local; + } local_irq_restore(flags); } -- cgit v1.2.3 From 6aab1da603e731383b342dbe612f92cd222fe56b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:44 +0200 Subject: blk-mq: complete polled requests directly Even for single queue devices there is no point in offloading a polled completion to the softirq, given that blk_mq_force_complete_rq is called from the polling thread in that case and thus there are no starvation issues. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ada55521601f..ea083f58d9da 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -703,6 +703,16 @@ void blk_mq_force_complete_rq(struct request *rq) int cpu; WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + + /* + * For a polled request, always complete locallly, it's pointless + * to redirect the completion. + */ + if (rq->cmd_flags & REQ_HIPRI) { + q->mq_ops->complete(rq); + return; + } + /* * Most of single queue controllers, there is only one irq vector * for handling IO completion, and the only irq's affinity is set @@ -717,12 +727,7 @@ void blk_mq_force_complete_rq(struct request *rq) return; } - /* - * For a polled request, always complete locallly, it's pointless - * to redirect the completion. 
- */ - if ((rq->cmd_flags & REQ_HIPRI) || - !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { + if (!test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { q->mq_ops->complete(rq); return; } -- cgit v1.2.3 From d6cc464cc58424e137eca5e0a53226291044f5d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:45 +0200 Subject: blk-mq: short cut the IPI path in blk_mq_force_complete_rq for !SMP Let the compile optimize out the entire IPI path, given that we are obviously not going to use it. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ea083f58d9da..eef2f3c7f402 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -727,7 +727,8 @@ void blk_mq_force_complete_rq(struct request *rq) return; } - if (!test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { q->mq_ops->complete(rq); return; } -- cgit v1.2.3 From d391a7a399e46315a8adc65eb8fb5d9123b91700 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:46 +0200 Subject: blk-mq: merge the softirq vs non-softirq IPI logic Both the softirq path for single queue devices and the multi-queue completion handler share the same logic to figure out if we need an IPI for the completion and eventually issue it. Merge the two versions into a single unified code path. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 85 ++++++++++++++-------------------------------------------- 1 file changed, 20 insertions(+), 65 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index eef2f3c7f402..ce772ab19188 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -600,8 +600,11 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) static void blk_mq_trigger_softirq(struct request *rq) { - struct list_head *list = this_cpu_ptr(&blk_cpu_done); + struct list_head *list; + unsigned long flags; + local_irq_save(flags); + list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&rq->ipi_list, list); /* @@ -611,11 +614,7 @@ static void blk_mq_trigger_softirq(struct request *rq) */ if (list->next == &rq->ipi_list) raise_softirq_irqoff(BLOCK_SOFTIRQ); -} - -static void trigger_softirq(void *data) -{ - blk_mq_trigger_softirq(data); + local_irq_restore(flags); } static int blk_softirq_cpu_dead(unsigned int cpu) @@ -633,56 +632,26 @@ static int blk_softirq_cpu_dead(unsigned int cpu) return 0; } -static void __blk_complete_request(struct request *req) +static void __blk_mq_complete_request(struct request *rq) { - struct request_queue *q = req->q; - int cpu, ccpu = req->mq_ctx->cpu; - unsigned long flags; - bool shared = false; - - BUG_ON(!q->mq_ops->complete); - - local_irq_save(flags); - cpu = smp_processor_id(); - - /* - * Select completion CPU - */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) { - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ccpu); - } else - ccpu = cpu; - /* - * If current CPU and requested CPU share a cache, run the softirq on - * the current CPU. One might concern this is just like - * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is - * running in interrupt handler, and currently I/O controller doesn't - * support multiple interrupts, so current CPU is unique actually. 
This - * avoids IPI sending from current CPU to the first CPU of a group. + * For most of single queue controllers, there is only one irq vector + * for handling I/O completion, and the only irq's affinity is set + * to all possible CPUs. On most of ARCHs, this affinity means the irq + * is handled on one specific CPU. + * + * So complete I/O requests in softirq context in case of single queue + * devices to avoid degrading I/O performance due to irqsoff latency. */ - if (IS_ENABLED(CONFIG_SMP) && - ccpu != cpu && !shared && cpu_online(ccpu)) { - call_single_data_t *data = &req->csd; - - data->func = trigger_softirq; - data->info = req; - data->flags = 0; - smp_call_function_single_async(cpu, data); - } else { - blk_mq_trigger_softirq(req); - } - - local_irq_restore(flags); + if (rq->q->nr_hw_queues == 1) + blk_mq_trigger_softirq(rq); + else + rq->q->mq_ops->complete(rq); } static void __blk_mq_complete_request_remote(void *data) { - struct request *rq = data; - struct request_queue *q = rq->q; - - q->mq_ops->complete(rq); + __blk_mq_complete_request(data); } /** @@ -713,23 +682,9 @@ void blk_mq_force_complete_rq(struct request *rq) return; } - /* - * Most of single queue controllers, there is only one irq vector - * for handling IO completion, and the only irq's affinity is set - * as all possible CPUs. On most of ARCHs, this affinity means the - * irq is handled on one specific CPU. - * - * So complete IO reqeust in softirq context in case of single queue - * for not degrading IO performance by irqsoff latency. - */ - if (q->nr_hw_queues == 1) { - __blk_complete_request(rq); - return; - } - if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { - q->mq_ops->complete(rq); + __blk_mq_complete_request(rq); return; } @@ -743,7 +698,7 @@ void blk_mq_force_complete_rq(struct request *rq) rq->csd.flags = 0; smp_call_function_single_async(ctx->cpu, &rq->csd); } else { - q->mq_ops->complete(rq); + __blk_mq_complete_request(rq); } put_cpu(); } -- cgit v1.2.3 From 15f73f5b3e5958f2d169fe13c420eeeeae07bbf2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:47 +0200 Subject: blk-mq: move failure injection out of blk_mq_complete_request Move the call to blk_should_fake_timeout out of blk_mq_complete_request and into the drivers, skipping call sites that are obvious error handlers, and remove the now superflous blk_mq_force_complete_rq helper. This ensures we don't keep injecting errors into completions that just terminate the Linux request after the hardware has been reset or the command has been aborted. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 34 +++++++--------------------------- block/blk-timeout.c | 6 ++---- block/blk.h | 9 --------- block/bsg-lib.c | 5 ++++- 4 files changed, 13 insertions(+), 41 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ce772ab19188..3f4f227cf830 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -655,16 +655,13 @@ static void __blk_mq_complete_request_remote(void *data) } /** - * blk_mq_force_complete_rq() - Force complete the request, bypassing any error - * injection that could drop the completion. - * @rq: Request to be force completed + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed * - * Drivers should use blk_mq_complete_request() to complete requests in their - * normal IO path. 
For timeout error recovery, drivers may call this forced - * completion routine after they've reclaimed timed out requests to bypass - * potentially subsequent fake timeouts. - */ -void blk_mq_force_complete_rq(struct request *rq) + * Description: + * Complete a request by scheduling the ->complete_rq operation. + **/ +void blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct request_queue *q = rq->q; @@ -702,7 +699,7 @@ void blk_mq_force_complete_rq(struct request *rq) } put_cpu(); } -EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq); +EXPORT_SYMBOL(blk_mq_complete_request); static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) __releases(hctx->srcu) @@ -724,23 +721,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) *srcu_idx = srcu_read_lock(hctx->srcu); } -/** - * blk_mq_complete_request - end I/O on a request - * @rq: the request being processed - * - * Description: - * Ends all I/O on a request. It does not handle partial completions. - * The actual completion happens out-of-order, through a IPI handler. - **/ -bool blk_mq_complete_request(struct request *rq) -{ - if (unlikely(blk_should_fake_timeout(rq->q))) - return false; - blk_mq_force_complete_rq(rq); - return true; -} -EXPORT_SYMBOL(blk_mq_complete_request); - /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 8aa68fae96ad..3a1ac6434758 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -20,13 +20,11 @@ static int __init setup_fail_io_timeout(char *str) } __setup("fail_io_timeout=", setup_fail_io_timeout); -int blk_should_fake_timeout(struct request_queue *q) +bool __blk_should_fake_timeout(struct request_queue *q) { - if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) - return 0; - return should_fail(&fail_io_timeout, 1); } +EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); static int __init fail_io_timeout_debugfs(void) { diff --git a/block/blk.h b/block/blk.h index b5d1f0fc6547..8ba4a5e4fe07 100644 --- a/block/blk.h +++ b/block/blk.h @@ -223,18 +223,9 @@ ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); - -#ifdef CONFIG_FAIL_IO_TIMEOUT -int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -#else -static inline int blk_should_fake_timeout(struct request_queue *q) -{ - return 0; -} -#endif void __blk_queue_split(struct request_queue *q, struct bio **bio, unsigned int *nr_segs); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 6cbb7926534c..fb7b347f8010 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -181,9 +181,12 @@ EXPORT_SYMBOL_GPL(bsg_job_get); void bsg_job_done(struct bsg_job *job, int result, unsigned int reply_payload_rcv_len) { + struct request *rq = blk_mq_rq_from_pdu(job); + job->result = result; job->reply_payload_rcv_len = reply_payload_rcv_len; - blk_mq_complete_request(blk_mq_rq_from_pdu(job)); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } EXPORT_SYMBOL_GPL(bsg_job_done); -- cgit v1.2.3 From 4c8fc19686dc761f600833fc9b8fa390eaf73dd5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:48 +0200 Subject: blk-mq: remove the get_cpu/put_cpu pair in 
blk_mq_complete_request We don't really care if we get migrated during the I/O completion. In the worth case we either perform an IPI that wasn't required, or complete the request on a CPU which we just migrated off. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3f4f227cf830..95125bfe779b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -685,7 +685,7 @@ void blk_mq_complete_request(struct request *rq) return; } - cpu = get_cpu(); + cpu = raw_smp_processor_id(); if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) shared = cpus_share_cache(cpu, ctx->cpu); @@ -697,7 +697,6 @@ void blk_mq_complete_request(struct request *rq) } else { __blk_mq_complete_request(rq); } - put_cpu(); } EXPORT_SYMBOL(blk_mq_complete_request); -- cgit v1.2.3 From 963395269c758641e1cb7208f3bdce6824ea608d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:49 +0200 Subject: blk-mq: factor out a blk_mq_complete_need_ipi helper Add a helper to decide if we can complete locally or need an IPI. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 95125bfe779b..961635b40999 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -654,6 +654,24 @@ static void __blk_mq_complete_request_remote(void *data) __blk_mq_complete_request(data); } +static inline bool blk_mq_complete_need_ipi(struct request *rq) +{ + int cpu = raw_smp_processor_id(); + + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) + return false; + + /* same CPU or cache domain? Complete locally */ + if (cpu == rq->mq_ctx->cpu || + (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && + cpus_share_cache(cpu, rq->mq_ctx->cpu))) + return false; + + /* don't try to IPI to an offline CPU */ + return cpu_online(rq->mq_ctx->cpu); +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -663,11 +681,6 @@ static void __blk_mq_complete_request_remote(void *data) **/ void blk_mq_complete_request(struct request *rq) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct request_queue *q = rq->q; - bool shared = false; - int cpu; - WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* @@ -675,25 +688,15 @@ void blk_mq_complete_request(struct request *rq) * to redirect the completion. 
*/ if (rq->cmd_flags & REQ_HIPRI) { - q->mq_ops->complete(rq); - return; - } - - if (!IS_ENABLED(CONFIG_SMP) || - !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { - __blk_mq_complete_request(rq); + rq->q->mq_ops->complete(rq); return; } - cpu = raw_smp_processor_id(); - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ctx->cpu); - - if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { + if (blk_mq_complete_need_ipi(rq)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; - smp_call_function_single_async(ctx->cpu, &rq->csd); + smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); } else { __blk_mq_complete_request(rq); } -- cgit v1.2.3 From 40d09b53bfc557af7481b9d80f060a7ac9c7d314 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:50 +0200 Subject: blk-mq: add a new blk_mq_complete_request_remote API This is a variant of blk_mq_complete_request_remote that only completes the request if it needs to be bounced to another CPU or a softirq. If the request can be completed locally the function returns false and lets the driver complete it without requring and indirect function call. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 961635b40999..b8738b3c6d06 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -632,8 +632,11 @@ static int blk_softirq_cpu_dead(unsigned int cpu) return 0; } -static void __blk_mq_complete_request(struct request *rq) + +static void __blk_mq_complete_request_remote(void *data) { + struct request *rq = data; + /* * For most of single queue controllers, there is only one irq vector * for handling I/O completion, and the only irq's affinity is set @@ -649,11 +652,6 @@ static void __blk_mq_complete_request(struct request *rq) rq->q->mq_ops->complete(rq); } -static void __blk_mq_complete_request_remote(void *data) -{ - __blk_mq_complete_request(data); -} - static inline bool blk_mq_complete_need_ipi(struct request *rq) { int cpu = raw_smp_processor_id(); @@ -672,14 +670,7 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) return cpu_online(rq->mq_ctx->cpu); } -/** - * blk_mq_complete_request - end I/O on a request - * @rq: the request being processed - * - * Description: - * Complete a request by scheduling the ->complete_rq operation. - **/ -void blk_mq_complete_request(struct request *rq) +bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); @@ -687,10 +678,8 @@ void blk_mq_complete_request(struct request *rq) * For a polled request, always complete locallly, it's pointless * to redirect the completion. 
*/ - if (rq->cmd_flags & REQ_HIPRI) { - rq->q->mq_ops->complete(rq); - return; - } + if (rq->cmd_flags & REQ_HIPRI) + return false; if (blk_mq_complete_need_ipi(rq)) { rq->csd.func = __blk_mq_complete_request_remote; @@ -698,8 +687,26 @@ void blk_mq_complete_request(struct request *rq) rq->csd.flags = 0; smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); } else { - __blk_mq_complete_request(rq); + if (rq->q->nr_hw_queues > 1) + return false; + blk_mq_trigger_softirq(rq); } + + return true; +} +EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); + +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Complete a request by scheduling the ->complete_rq operation. + **/ +void blk_mq_complete_request(struct request *rq) +{ + if (!blk_mq_complete_request_remote(rq)) + rq->q->mq_ops->complete(rq); } EXPORT_SYMBOL(blk_mq_complete_request); -- cgit v1.2.3 From b5bd357cf8b65d31e32b1668293cbeedb6c06334 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:23 +0000 Subject: block: add docs for gendisk / request_queue refcount helpers This adds documentation for the gendisk / request_queue refcount helpers. Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 13 +++++++++++++ block/genhd.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 03252af8c82c..13777c0c97f0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -321,6 +321,13 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); +/** + * blk_put_queue - decrement the request_queue refcount + * @q: the request_queue structure to decrement the refcount for + * + * Decrements the refcount of the request_queue kobject. When this reaches 0 + * we'll have blk_release_queue() called. + */ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); @@ -598,6 +605,12 @@ struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) } EXPORT_SYMBOL(blk_alloc_queue); +/** + * blk_get_queue - increment the request_queue refcount + * @q: the request_queue structure to increment the refcount for + * + * Increment the refcount of the request_queue kobject. + */ bool blk_get_queue(struct request_queue *q) { if (likely(!blk_queue_dying(q))) { diff --git a/block/genhd.c b/block/genhd.c index 1a7659327664..f741613d731f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -876,6 +876,20 @@ static void invalidate_partition(struct gendisk *disk, int partno) bdput(bdev); } +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. + * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. 
+ */ void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; @@ -1514,6 +1528,23 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return 0; } +/** + * disk_release - releases all allocated resources of the gendisk + * @dev: the device representing this disk + * + * This function releases all allocated resources of the gendisk. + * + * The struct gendisk refcount is incremented with get_gendisk() or + * get_disk_and_module(), and its refcount is decremented with + * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this + * function is called. + * + * Drivers which used __device_add_disk() have a gendisk with a request_queue + * assigned. Since the request_queue sits on top of the gendisk for these + * drivers we also call blk_put_queue() for them, and we expect the + * request_queue refcount to reach 0 at this point, and so the request_queue + * will also be freed prior to the disk. + */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); @@ -1727,6 +1758,13 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } EXPORT_SYMBOL(__alloc_disk_node); +/** + * get_disk_and_module - increments the gendisk and gendisk fops module refcount + * @disk: the struct gendisk to to increment the refcount for + * + * This increments the refcount for the struct gendisk, and the gendisk's + * fops module owner. + */ struct kobject *get_disk_and_module(struct gendisk *disk) { struct module *owner; @@ -1747,6 +1785,13 @@ struct kobject *get_disk_and_module(struct gendisk *disk) } EXPORT_SYMBOL(get_disk_and_module); +/** + * put_disk - decrements the gendisk refcount + * @disk: the struct gendisk to to decrement the refcount for + * + * This decrements the refcount for the struct gendisk. When this reaches 0 + * we'll have disk_release() called. + */ void put_disk(struct gendisk *disk) { if (disk) @@ -1754,7 +1799,10 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); -/* +/** + * put_disk_and_module - decrements the module and gendisk refcount + * @disk: the struct gendisk to to decrement the refcount for + * * This is a counterpart of get_disk_and_module() and thus also of * get_gendisk(). */ -- cgit v1.2.3 From 763b58923aeb0a06c5a5f7e5fbb4c654c644d91d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:24 +0000 Subject: block: clarify context for refcount increment helpers Let us clarify the context under which the helpers to increment the refcount for the gendisk and request_queue can be called under. We make this explicit on the places where we may sleep with might_sleep(). We don't address the decrement context yet, as that needs some extra work and fixes, but will be addressed in the next patch. Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/genhd.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 13777c0c97f0..f68398cb2ef6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -610,6 +610,8 @@ EXPORT_SYMBOL(blk_alloc_queue); * @q: the request_queue structure to increment the refcount for * * Increment the refcount of the request_queue kobject. + * + * Context: Any context. 
*/ bool blk_get_queue(struct request_queue *q) { diff --git a/block/genhd.c b/block/genhd.c index f741613d731f..1be86b1f43ec 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -985,11 +985,15 @@ static ssize_t disk_badblocks_store(struct device *dev, * * This function gets the structure containing partitioning * information for the given device @devt. + * + * Context: can sleep */ struct gendisk *get_gendisk(dev_t devt, int *partno) { struct gendisk *disk = NULL; + might_sleep(); + if (MAJOR(devt) != BLOCK_EXT_MAJOR) { struct kobject *kobj; @@ -1764,6 +1768,8 @@ EXPORT_SYMBOL(__alloc_disk_node); * * This increments the refcount for the struct gendisk, and the gendisk's * fops module owner. + * + * Context: Any context. */ struct kobject *get_disk_and_module(struct gendisk *disk) { -- cgit v1.2.3 From e8c7d14ac6c37c173ec606907d38802b00302988 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:25 +0000 Subject: block: revert back to synchronous request_queue removal Commit dc9edc44de6c ("block: Fix a blk_exit_rl() regression") merged on v4.12 moved the work behind blk_release_queue() into a workqueue after a splat floated around which indicated some work on blk_release_queue() could sleep in blk_exit_rl(). This splat would be possible when a driver called blk_put_queue() or blk_cleanup_queue() (which calls blk_put_queue() as its final call) from an atomic context. blk_put_queue() decrements the refcount for the request_queue kobject, and upon reaching 0 blk_release_queue() is called. Although blk_exit_rl() is now removed through commit db6d99523560 ("block: remove request_list code") on v5.0, we reserve the right to be able to sleep within blk_release_queue() context. The last reference for the request_queue must not be called from atomic context. *When* the last reference to the request_queue reaches 0 varies, and so let's take the opportunity to document when that is expected to happen and also document the context of the related calls as best as possible so we can avoid future issues, and with the hopes that the synchronous request_queue removal sticks. We revert back to synchronous request_queue removal because asynchronous removal creates a regression with expected userspace interaction with several drivers. An example is when removing the loopback driver, one uses ioctls from userspace to do so, but upon return and if successful, one expects the device to be removed. Likewise if one races to add another device the new one may not be added as it is still being removed. This was expected behavior before and it now fails as the device is still present and busy still. Moving to asynchronous request_queue removal could have broken many scripts which relied on the removal to have been completed if there was no error. Document this expectation as well so that this doesn't regress userspace again. Using asynchronous request_queue removal however has helped us find other bugs. In the future we can test what could break with this arrangement by enabling CONFIG_DEBUG_KOBJECT_RELEASE. While at it, update the docs with the context expectations for the request_queue / gendisk refcount decrement, and make these expectations explicit by using might_sleep(). 
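In practice the rule is that whatever call path may drop the final reference has to run in process context; a hypothetical driver teardown path (illustration only, not part of this patch) that follows the documented convention looks like:

	/* Hypothetical driver remove path, for illustration only. */
	static void mydrv_remove(struct mydrv *drv)
	{
		del_gendisk(drv->disk);		/* can sleep */
		blk_cleanup_queue(drv->queue);	/* can sleep, may drop the last queue reference */
		put_disk(drv->disk);		/* may drop the last gendisk reference */
	}

The might_sleep() annotations added below will catch a caller that reaches these final puts from atomic context.
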
Fixes: dc9edc44de6c ("block: Fix a blk_exit_rl() regression") Suggested-by: Nicolai Stange Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Cc: Bart Van Assche Cc: Omar Sandoval Cc: Hannes Reinecke Cc: Nicolai Stange Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: yu kuai Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++++ block/blk-sysfs.c | 43 ++++++++++++++++++++++--------------------- block/genhd.c | 17 +++++++++++++++++ 3 files changed, 47 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f68398cb2ef6..a99b22fac38a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -327,6 +327,9 @@ EXPORT_SYMBOL_GPL(blk_clear_pm_only); * * Decrements the refcount of the request_queue kobject. When this reaches 0 * we'll have blk_release_queue() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. */ void blk_put_queue(struct request_queue *q) { @@ -359,9 +362,14 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying); * * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and * put it. All future requests will be failed immediately with -ENODEV. + * + * Context: can sleep */ void blk_cleanup_queue(struct request_queue *q) { + /* cannot be called from atomic context */ + might_sleep(); + WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 02643e149d5e..561624d4cc4e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -873,22 +873,32 @@ static void blk_exit_queue(struct request_queue *q) bdi_put(q->backing_dev_info); } - /** - * __blk_release_queue - release a request queue - * @work: pointer to the release_work member of the request queue to be released + * blk_release_queue - releases all allocated resources of the request_queue + * @kobj: pointer to a kobject, whose container is a request_queue + * + * This function releases all allocated resources of the request queue. + * + * The struct request_queue refcount is incremented with blk_get_queue() and + * decremented with blk_put_queue(). Once the refcount reaches 0 this function + * is called. + * + * For drivers that have a request_queue on a gendisk and added with + * __device_add_disk() the refcount to request_queue will reach 0 with + * the last put_disk() called by the driver. For drivers which don't use + * __device_add_disk() this happens with blk_cleanup_queue(). * - * Description: - * This function is called when a block device is being unregistered. The - * process of releasing a request queue starts with blk_cleanup_queue, which - * set the appropriate flags and then calls blk_put_queue, that decrements - * the reference counter of the request queue. Once the reference counter - * of the request queue reaches zero, blk_release_queue is called to release - * all allocated resources of the request queue. + * Drivers exist which depend on the release of the request_queue to be + * synchronous, it should not be deferred. 
+ * + * Context: can sleep */ -static void __blk_release_queue(struct work_struct *work) +static void blk_release_queue(struct kobject *kobj) { - struct request_queue *q = container_of(work, typeof(*q), release_work); + struct request_queue *q = + container_of(kobj, struct request_queue, kobj); + + might_sleep(); if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) blk_stat_remove_callback(q, q->poll_cb); @@ -917,15 +927,6 @@ static void __blk_release_queue(struct work_struct *work) call_rcu(&q->rcu_head, blk_free_queue_rcu); } -static void blk_release_queue(struct kobject *kobj) -{ - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); - - INIT_WORK(&q->release_work, __blk_release_queue); - schedule_work(&q->release_work); -} - static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, diff --git a/block/genhd.c b/block/genhd.c index 1be86b1f43ec..60ae4e1b4d38 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -889,12 +889,19 @@ static void invalidate_partition(struct gendisk *disk, int partno) * The final removal of the struct gendisk happens when its refcount reaches 0 * with put_disk(), which should be called after del_gendisk(), if * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep */ void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; struct hd_struct *part; + might_sleep(); + blk_integrity_del(disk); disk_del_events(disk); @@ -1548,11 +1555,15 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) * drivers we also call blk_put_queue() for them, and we expect the * request_queue refcount to reach 0 at this point, and so the request_queue * will also be freed prior to the disk. + * + * Context: can sleep */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + might_sleep(); + blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); @@ -1797,6 +1808,9 @@ EXPORT_SYMBOL(get_disk_and_module); * * This decrements the refcount for the struct gendisk. When this reaches 0 * we'll have disk_release() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. */ void put_disk(struct gendisk *disk) { @@ -1811,6 +1825,9 @@ EXPORT_SYMBOL(put_disk); * * This is a counterpart of get_disk_and_module() and thus also of * get_gendisk(). + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. */ void put_disk_and_module(struct gendisk *disk) { -- cgit v1.2.3 From 85e0cbbb8a79537dbc465e9deb449a08b2b092a6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:30 +0000 Subject: block: create the request_queue debugfs_dir on registration We were only creating the request_queue debugfs_dir only for make_request block drivers (multiqueue), but never for request-based block drivers. We did this as we were only creating non-blktrace additional debugfs files on that directory for make_request drivers. However, since blktrace *always* creates that directory anyway, we special-case the use of that directory on blktrace. 
Other than this being an eye-sore, this exposes request-based block drivers to the same debugfs fragile race that used to exist with make_request block drivers where if we start adding files onto that directory we can later run a race with a double removal of dentries on the directory if we don't deal with this carefully on blktrace. Instead, just simplify things by always creating the request_queue debugfs_dir on request_queue registration. Rename the mutex also to reflect the fact that this is used outside of the blktrace context. Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 8 +------- block/blk-mq-debugfs.c | 5 ----- block/blk-sysfs.c | 9 +++++++++ block/blk.h | 2 -- 4 files changed, 10 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a99b22fac38a..a9769c1a2875 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -51,9 +51,7 @@ #include "blk-pm.h" #include "blk-rq-qos.h" -#ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -555,9 +553,7 @@ struct request_queue *__blk_alloc_queue(int node_id) kobject_init(&q->kobj, &blk_queue_ktype); -#ifdef CONFIG_BLK_DEV_IO_TRACE - mutex_init(&q->blk_trace_mutex); -#endif + mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->queue_lock); @@ -1931,9 +1927,7 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); -#ifdef CONFIG_DEBUG_FS blk_debugfs_root = debugfs_create_dir("block", NULL); -#endif return 0; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 15df3a36e9fa..a2800bc56fb4 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -824,9 +824,6 @@ void blk_mq_debugfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), - blk_debugfs_root); - debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); /* @@ -857,9 +854,7 @@ void blk_mq_debugfs_register(struct request_queue *q) void blk_mq_debugfs_unregister(struct request_queue *q) { - debugfs_remove_recursive(q->debugfs_dir); q->sched_debugfs_dir = NULL; - q->debugfs_dir = NULL; } static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 561624d4cc4e..be67952e7be2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "blk.h" #include "blk-mq.h" @@ -917,6 +918,9 @@ static void blk_release_queue(struct kobject *kobj) blk_mq_release(q); blk_trace_shutdown(q); + mutex_lock(&q->debugfs_mutex); + debugfs_remove_recursive(q->debugfs_dir); + mutex_unlock(&q->debugfs_mutex); if (queue_is_mq(q)) blk_mq_debugfs_unregister(q); @@ -989,6 +993,11 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } + mutex_lock(&q->debugfs_mutex); + q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), + blk_debugfs_root); + mutex_unlock(&q->debugfs_mutex); + if (queue_is_mq(q)) { __blk_mq_register_dev(dev, q); blk_mq_debugfs_register(q); diff --git a/block/blk.h b/block/blk.h index 8ba4a5e4fe07..3a120a070dac 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,9 +14,7 @@ /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) -#ifdef CONFIG_DEBUG_FS extern struct 
dentry *blk_debugfs_root; -#endif struct blk_flush_queue { unsigned int flush_pending_idx:1; -- cgit v1.2.3 From 1f4fe21cf45c799a2fef41ae23dd2a8a8dbb93b7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 19 Jun 2020 19:49:49 -0500 Subject: block: bio: Use struct_size() in kmalloc() Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. This code was detected with the help of Coccinelle and, audited and fixed manually. Signed-off-by: Gustavo A. R. Silva Addresses-KSPP-ID: https://github.com/KSPP/linux/issues/83 Signed-off-by: Jens Axboe --- block/bio.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index a7366c02c9b5..fb5533416fa6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -444,9 +444,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, if (nr_iovecs > UIO_MAXIOV) return NULL; - p = kmalloc(sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); + p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); front_pad = 0; inline_vecs = nr_iovecs; } else { -- cgit v1.2.3 From f61d6e259c7ebb9a134dee5cd0b32c192d726984 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 19 Jun 2020 18:08:30 -0500 Subject: blk-iocost: Use struct_size() in kzalloc_node() Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. This code was detected with the help of Coccinelle and, audited and fixed manually. Signed-off-by: Gustavo A. R. Silva Addresses-KSPP-ID: https://github.com/KSPP/linux/issues/83 Signed-off-by: Jens Axboe --- block/blk-iocost.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 8ac4aad66ebc..cea5ee9be639 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2045,8 +2045,7 @@ static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; - iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]), - gfp, q->node); + iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); if (!iocg) return NULL; -- cgit v1.2.3 From f3bdc62fd82ed93dbe4d049eacba310de7eb2a6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 17 Jun 2020 15:58:23 +0200 Subject: blktrace: Provide event for request merging Currently blk-mq does not report any event when two requests get merged in the elevator. This then results in difficult to understand sequence of events like: ... 8,0 34 1579 0.608765271 2718 I WS 215023504 + 40 [dbench] 8,0 34 1584 0.609184613 2719 A WS 215023544 + 56 <- (8,4) 2160568 8,0 34 1585 0.609184850 2719 Q WS 215023544 + 56 [dbench] 8,0 34 1586 0.609188524 2719 G WS 215023544 + 56 [dbench] 8,0 3 602 0.609684162 773 D WS 215023504 + 96 [kworker/3:1H] 8,0 34 1591 0.609843593 0 C WS 215023504 + 96 [0] and you can only guess (after quite some headscratching since the above excerpt is intermixed with a lot of other IO) that request 215023544+56 got merged to request 215023504+40. Provide proper event for request merging like we used to do in the legacy block layer. 
Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index f0b0bae075a0..9c9fb21584b6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -793,6 +793,8 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); + trace_block_rq_merge(q, next); + /* * ownership of bio passed from next to req, return 'next' for * the caller to free -- cgit v1.2.3 From 826f2f48da8c331ac51e1381998d318012d66550 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Sun, 28 Jun 2020 09:56:25 -0400 Subject: blk-rq-qos: remove redundant finish_wait to rq_qos_wait. It is no need do finish_wait twice after acquiring inflight. Signed-off-by: Guo Xuenan Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 656460636ad3..18f3eab9f768 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -273,8 +273,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { - finish_wait(&rqw->wait, &data.wq); - /* * We raced with wbt_wake_function() getting a token, * which means we now have two. Put our local token -- cgit v1.2.3 From db9819c76c1fd48c30699381c94bba5c95dd467e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:47 +0200 Subject: block: remove bio_disassociate_blkg bio_disassociate_blkg has two callers, of which one immediately assigns a new value to >bi_blkg. Just open code the function in the two callers. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index fb5533416fa6..8aef4460b32e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -234,8 +234,12 @@ fallback: void bio_uninit(struct bio *bio) { - bio_disassociate_blkg(bio); - +#ifdef CONFIG_BLK_CGROUP + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif if (bio_integrity(bio)) bio_integrity_free(bio); @@ -1625,21 +1629,6 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -/** - * bio_disassociate_blkg - puts back the blkg reference if associated - * @bio: target bio - * - * Helper to disassociate the blkg from @bio if a blkg is associated. - */ -void bio_disassociate_blkg(struct bio *bio) -{ - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } -} -EXPORT_SYMBOL_GPL(bio_disassociate_blkg); - /** * __bio_associate_blkg - associate a bio with the a blkg * @bio: target bio @@ -1656,8 +1645,8 @@ EXPORT_SYMBOL_GPL(bio_disassociate_blkg); */ static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) { - bio_disassociate_blkg(bio); - + if (bio->bi_blkg) + blkg_put(bio->bi_blkg); bio->bi_blkg = blkg_tryget_closest(blkg); } -- cgit v1.2.3 From d92c370a16cbe0276954c761b874bd024a7e4fac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:48 +0200 Subject: block: really clone the block cgroup in bio_clone_blkg_association bio_clone_blkg_association is supposed to clone the associatation, but actually ends up doing a search with a tryget. As we know we have a reference on the source cgroup just get an unconditional additional reference to it and call it a day. 
That also removes the need for a RCU critical section. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8aef4460b32e..e1d01acce807 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1737,12 +1737,12 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg); */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { - rcu_read_lock(); - - if (src->bi_blkg) - __bio_associate_blkg(dst, src->bi_blkg); - - rcu_read_unlock(); + if (src->bi_blkg) { + if (dst->bi_blkg) + blkg_put(dst->bi_blkg); + blkg_get(src->bi_blkg); + dst->bi_blkg = src->bi_blkg; + } } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); #endif /* CONFIG_BLK_CGROUP */ -- cgit v1.2.3 From 2badf06cf906c7af4bdd4bc1da62890c8e686341 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:49 +0200 Subject: block: merge __bio_associate_blkg into bio_associate_blkg_from_css Merge __bio_associate_blkg into the only caller, which allows to slightly reduce the RCU crticial section and better explain the code flow. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index e1d01acce807..bc8de2432e36 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1628,52 +1628,33 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP - -/** - * __bio_associate_blkg - associate a bio with the a blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * This tries to associate @bio with the specified @blkg. Association failure - * is handled by walking up the blkg tree. Therefore, the blkg associated can - * be anything between @blkg and the root_blkg. This situation only happens - * when a cgroup is dying and then the remaining bios will spill to the closest - * alive blkg. - * - * A reference will be taken on the @blkg and will be released when @bio is - * freed. - */ -static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - if (bio->bi_blkg) - blkg_put(bio->bi_blkg); - bio->bi_blkg = blkg_tryget_closest(blkg); -} - /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. This falls back to the queue's root_blkg if - * the association fails with the css. + * request_queue of the @bio. An association failure is handled by walking up + * the blkg tree. Therefore, the blkg associated can be anything between @blkg + * and q->root_blkg. This situation only happens when a cgroup is dying and + * then the remaining bios will spill to the closest alive blkg. + * + * A reference will be taken on the blkg and will be released when @bio is + * freed. 
*/ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg; + struct blkcg_gq *blkg = q->root_blkg; - rcu_read_lock(); + if (bio->bi_blkg) + blkg_put(bio->bi_blkg); - if (!css || !css->parent) - blkg = q->root_blkg; - else + rcu_read_lock(); + if (css && css->parent) blkg = blkg_lookup_create(css_to_blkcg(css), q); - - __bio_associate_blkg(bio, blkg); - + bio->bi_blkg = blkg_tryget_closest(blkg); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); -- cgit v1.2.3 From a18b9b1590ca64f877588700de32c9ad236f405c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:50 +0200 Subject: block: move bio_associate_blkg_from_page to mm/page_io.c bio_associate_blkg_from_page is a special purpose helper for swap bios that doesn't need access to bio internals. Move it to the swap code instead of having it in bio.c. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 26 -------------------------- 1 f