author     Jens Axboe <axboe@kernel.dk>    2013-11-08 09:08:12 -0700
committer  Jens Axboe <axboe@kernel.dk>    2013-11-08 09:08:12 -0700
commit     e37459b8e2c7db6735e39e019e448b76e5e77647 (patch)
tree       a3f0944db87a8ae0d41e5acbbbabc1e7ef534d1b /block
parent     c7d1ba417c7cb7297d14dd47a390ec90ce548d5c (diff)
parent     e7e245000110a7794de8f925b9edc06a9c852f80 (diff)
Merge branch 'blk-mq/core' into for-3.13/core
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-timeout.c
Diffstat (limited to 'block')
-rw-r--r--   block/Makefile            5
-rw-r--r--   block/blk-core.c        157
-rw-r--r--   block/blk-exec.c         14
-rw-r--r--   block/blk-flush.c       154
-rw-r--r--   block/blk-merge.c        17
-rw-r--r--   block/blk-mq-cpu.c       93
-rw-r--r--   block/blk-mq-cpumap.c   108
-rw-r--r--   block/blk-mq-sysfs.c    384
-rw-r--r--   block/blk-mq-tag.c      204
-rw-r--r--   block/blk-mq-tag.h       27
-rw-r--r--   block/blk-mq.c         1500
-rw-r--r--   block/blk-mq.h           52
-rw-r--r--   block/blk-sysfs.c        13
-rw-r--r--   block/blk-timeout.c      74
-rw-r--r--   block/blk.h              17
15 files changed, 2701 insertions, 118 deletions
diff --git a/block/Makefile b/block/Makefile index 671a83d063a5..20645e88fb57 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,8 +5,9 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ - partition-generic.o partitions/ + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ + genhd.o scsi_ioctl.o partition-generic.o partitions/ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o diff --git a/block/blk-core.c b/block/blk-core.c index 25f13479f552..8bdd0121212a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,6 +16,7 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/kernel_stat.h> @@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -static struct kmem_cache *request_cachep; +struct kmem_cache *request_cachep = NULL; /* * For queue allocation @@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; -static void drive_stat_acct(struct request *rq, int new_io) -{ - struct hd_struct *part; - int rw = rq_data_dir(rq); - int cpu; - - if (!blk_do_io_stat(rq)) - return; - - cpu = part_stat_lock(); - - if (!new_io) { - part = rq->part; - part_stat_inc(cpu, part, merges[rw]); - } else { - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!hd_struct_try_get(part)) { - /* - * The partition is already being removed, - * the request will be accounted on the disk only - * - * We take a reference on disk->part0 although that - * partition will never be deleted, so we can treat - * it as any other partition. - */ - part = &rq->rq_disk->part0; - hd_struct_get(part); - } - part_round_stats(cpu, part); - part_inc_in_flight(part, rw); - rq->part = part; - } - - part_stat_unlock(); -} - void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->cmd = rq->__cmd; rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; - rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) { int bit; - printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, + printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, rq->rq_disk ? 
rq->rq_disk->disk_name : "?", rq->cmd_type, - rq->cmd_flags); + (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), @@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; + if (percpu_counter_init(&q->mq_usage_counter, 0)) + goto fail_q; + q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_q; + goto fail_c; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -644,6 +611,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->bypass_depth = 1; __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); + init_waitqueue_head(&q->mq_freeze_wq); + if (blkcg_init_queue(q)) goto fail_bdi; @@ -653,6 +622,8 @@ fail_bdi: bdi_destroy(&q->backing_dev_info); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_c: + percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -1119,7 +1090,8 @@ retry: goto retry; } -struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +static struct request *blk_old_get_request(struct request_queue *q, int rw, + gfp_t gfp_mask) { struct request *rq; @@ -1136,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) return rq; } + +struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +{ + if (q->mq_ops) + return blk_mq_alloc_request(q, rw, gfp_mask, false); + else + return blk_old_get_request(q, rw, gfp_mask); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1221,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request); static void add_acct_request(struct request_queue *q, struct request *rq, int where) { - drive_stat_acct(rq, 1); + blk_account_io_start(rq, true); __elv_add_request(q, rq, where); } @@ -1282,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req) { if (unlikely(!q)) return; - if (unlikely(--req->ref_count)) - return; blk_pm_put_request(req); @@ -1312,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) { - unsigned long flags; struct request_queue *q = req->q; - spin_lock_irqsave(q->queue_lock, flags); - __blk_put_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); + if (q->mq_ops) + blk_mq_free_request(req); + else { + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __blk_put_request(q, req); + spin_unlock_irqrestore(q->queue_lock, flags); + } } EXPORT_SYMBOL(blk_put_request); @@ -1353,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page, } EXPORT_SYMBOL_GPL(blk_add_request_payload); -static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, - struct bio *bio) +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1371,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } -static bool bio_attempt_front_merge(struct request_queue *q, - struct request *req, struct bio *bio) +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1401,12 +1384,12 @@ static bool bio_attempt_front_merge(struct 
request_queue *q, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } /** - * attempt_plug_merge - try to merge with %current's plugged list + * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests @@ -1422,12 +1405,13 @@ static bool bio_attempt_front_merge(struct request_queue *q, * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. */ -static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count) { struct blk_plug *plug; struct request *rq; bool ret = false; + struct list_head *plug_list; if (blk_queue_nomerges(q)) goto out; @@ -1437,7 +1421,12 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, goto out; *request_count = 0; - list_for_each_entry_reverse(rq, &plug->list, queuelist) { + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; if (rq->q == q) @@ -1505,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (attempt_plug_merge(q, bio, &request_count)) + if (blk_attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); @@ -1573,7 +1562,7 @@ get_rq: } } list_add_tail(&req->queuelist, &plug->list); - drive_stat_acct(req, 1); + blk_account_io_start(req, true); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); @@ -2027,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void blk_account_io_completion(struct request *req, unsigned int bytes) +void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { const int rw = rq_data_dir(req); @@ -2041,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) } } -static void blk_account_io_done(struct request *req) +void blk_account_io_done(struct request *req) { /* * Account IO completion. flush_rq isn't accounted as a @@ -2089,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q, } #endif +void blk_account_io_start(struct request *rq, bool new_io) +{ + struct hd_struct *part; + int rw = rq_data_dir(rq); + int cpu; + + if (!blk_do_io_stat(rq)) + return; + + cpu = part_stat_lock(); + + if (!new_io) { + part = rq->part; + part_stat_inc(cpu, part, merges[rw]); + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!hd_struct_try_get(part)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. 
+ */ + part = &rq->rq_disk->part0; + hd_struct_get(part); + } + part_round_stats(cpu, part); + part_inc_in_flight(part, rw); + rq->part = part; + } + + part_stat_unlock(); +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2465,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error) if (req->cmd_flags & REQ_DONTPREP) blk_unprep_request(req); - blk_account_io_done(req); if (req->end_io) @@ -2887,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug) plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); + INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); /* @@ -2984,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(plug->magic != PLUG_MAGIC); flush_plug_callbacks(plug, from_schedule); + + if (!list_empty(&plug->mq_list)) + blk_mq_flush_plug_list(plug, from_schedule); + if (list_empty(&plug->list)) return; diff --git a/block/blk-exec.c b/block/blk-exec.c index ae4f27d7944e..c3edf9dff566 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/sched/sysctl.h> #include "blk.h" @@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error) struct completion *waiting = rq->end_io_data; rq->end_io_data = NULL; - __blk_put_request(rq->q, rq); /* * complete last, if this is a stack request the process (and thus @@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->end_io = done; + + if (q->mq_ops) { + blk_mq_insert_request(q, rq, true); + return; + } + /* * need to check this before __blk_run_queue(), because rq can * be freed before that returns. @@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, int err = 0; unsigned long hang_check; - /* - * we need an extra reference to the request, so we can look at - * it after io completion - */ - rq->ref_count++; - if (!rq->sense) { memset(sense, 0, sizeof(sense)); rq->sense = sense; diff --git a/block/blk-flush.c b/block/blk-flush.c index cc2b827a853c..331e627301ea 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,8 +69,10 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> +#include <linux/blk-mq.h> #include "blk.h" +#include "blk-mq.h" /* FLUSH/FUA sequences */ enum { @@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq) /* make @rq a normal request */ rq->cmd_flags &= ~REQ_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; + + blk_clear_rq_complete(rq); +} + +static void mq_flush_data_run(struct work_struct *work) +{ + struct request *rq; + + rq = container_of(work, struct request, mq_flush_data); + + memset(&rq->csd, 0, sizeof(rq->csd)); + blk_mq_run_request(rq, true, false); +} + +static void blk_mq_flush_data_insert(struct request *rq) +{ + INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); + kblockd_schedule_work(rq->q, &rq->mq_flush_data); } /** @@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq) * completion and trigger the next step. * * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) * * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. 
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, { struct request_queue *q = rq->q; struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; - bool queued = false; + bool queued = false, kicked; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; @@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &q->flush_data_in_flight); - list_add(&rq->queuelist, &q->queue_head); - queued = true; + if (q->mq_ops) + blk_mq_flush_data_insert(rq); + else { + list_add(&rq->queuelist, &q->queue_head); + queued = true; + } break; case REQ_FSEQ_DONE: @@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, BUG_ON(!list_empty(&rq->queuelist)); list_del_init(&rq->flush.list); blk_flush_restore_request(rq); - __blk_end_request_all(rq, error); + if (q->mq_ops) + blk_mq_end_io(rq, error); + else + __blk_end_request_all(rq, error); break; default: BUG(); } - return blk_kick_flush(q) | queued; + kicked = blk_kick_flush(q); + /* blk_mq_run_flush will run queue */ + if (q->mq_ops) + return queued; + return kicked | queued; } static void flush_end_io(struct request *flush_rq, int error) { struct request_queue *q = flush_rq->q; - struct list_head *running = &q->flush_queue[q->flush_running_idx]; + struct list_head *running; bool queued = false; struct request *rq, *n; + unsigned long flags = 0; + if (q->mq_ops) { + blk_mq_free_request(flush_rq); + spin_lock_irqsave(&q->mq_flush_lock, flags); + } + running = &q->flush_queue[q->flush_running_idx]; BUG_ON(q->flush_pending_idx == q->flush_running_idx); /* account completion of the flush request */ q->flush_running_idx ^= 1; - elv_completed_request(q, flush_rq); + + if (!q->mq_ops) + elv_completed_request(q, flush_rq); /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, flush.list) { @@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error) * directly into request_fn may confuse the driver. Always use * kblockd. */ - if (queued || q->flush_queue_delayed) - blk_run_queue_async(q); + if (queued || q->flush_queue_delayed) { + if (!q->mq_ops) + blk_run_queue_async(q); + else + /* + * This can be optimized to only run queues with requests + * queued if necessary. + */ + blk_mq_run_queues(q, true); + } q->flush_queue_delayed = 0; + if (q->mq_ops) + spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} + +static void mq_flush_work(struct work_struct *work) +{ + struct request_queue *q; + struct request *rq; + + q = container_of(work, struct request_queue, mq_flush_work); + + /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ + rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, + __GFP_WAIT|GFP_ATOMIC, true); + rq->cmd_type = REQ_TYPE_FS; + rq->end_io = flush_end_io; + + blk_mq_run_request(rq, true, false); +} + +/* + * We can't directly use q->flush_rq, because it doesn't have tag and is not in + * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, + * so offload the work to workqueue. + * + * Note: we assume a flush request finished in any hardware queue will flush + * the whole disk cache. + */ +static void mq_run_flush(struct request_queue *q) +{ + kblockd_schedule_work(q, &q->mq_flush_work); } /** @@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error) * Please read the comment at the top of this file for more info. 
* * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) * * RETURNS: * %true if flush was issued, %false otherwise. @@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q) * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ + q->flush_pending_idx ^= 1; + if (q->mq_ops) { + mq_run_flush(q); + return true; + } + blk_rq_init(q, &q->flush_rq); q->flush_rq.cmd_type = REQ_TYPE_FS; q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; q->flush_rq.rq_disk = first_rq->rq_disk; q->flush_rq.end_io = flush_end_io; - q->flush_pending_idx ^= 1; list_add_tail(&q->flush_rq.queuelist, &q->queue_head); return true; } @@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error) blk_run_queue_async(q); } +static void mq_flush_data_end_io(struct request *rq, int error) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + unsigned long flags; + + ctx = rq->mq_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). + */ + spin_lock_irqsave(&q->mq_flush_lock, flags); + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + blk_mq_run_hw_queue(hctx, true); + spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} + /** * blk_insert_flush - insert a new FLUSH/FUA request * @rq: request to insert * * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. + * or __blk_mq_run_hw_queue() to dispatch request. * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. * * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock) in !mq case */ void blk_insert_flush(struct request *rq) { @@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq) * complete the request. 
*/ if (!policy) { - __blk_end_bidi_request(rq, 0, 0, 0); + if (q->mq_ops) + blk_mq_end_io(rq, 0); + else + __blk_end_bidi_request(rq, 0, 0, 0); return; } @@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - list_add_tail(&rq->queuelist, &q->queue_head); + if (q->mq_ops) { + blk_mq_run_request(rq, false, true); + } else + list_add_tail(&rq->queuelist, &q->queue_head); return; } @@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq) INIT_LIST_HEAD(&rq->flush.list); rq->cmd_flags |= REQ_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ + if (q->mq_ops) { + rq->end_io = mq_flush_data_end_io; + + spin_lock_irq(&q->mq_flush_lock); + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&q->mq_flush_lock); + return; + } rq->end_io = flush_data_end_io; blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); @@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return ret; } EXPORT_SYMBOL(blkdev_issue_flush); + +void blk_mq_init_flush(struct request_queue *q) +{ + spin_lock_init(&q->mq_flush_lock); + INIT_WORK(&q->mq_flush_work, mq_flush_work); +} diff --git a/block/blk-merge.c b/block/blk-merge.c index 5f2448253797..1ffc58977835 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return ll_new_hw_segment(q, req, bio); } +/* + * blk-mq uses req->special to carry normal driver per-request payload, it + * does not indicate a prepared command that we cannot merge with. + */ +static bool req_no_special_merge(struct request *req) +{ + struct request_queue *q = req->q; + + return !q->mq_ops && req->special; +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, * First check if the either of the requests are re-queued * requests. Can't merge them if they are. 
*/ - if (req->special || next->special) + if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; /* @@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (rq_data_dir(req) != rq_data_dir(next) || req->rq_disk != next->rq_disk - || next->special) + || req_no_special_merge(next)) return 0; if (req->cmd_flags & REQ_WRITE_SAME && @@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device and not a special request */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) + if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c new file mode 100644 index 000000000000..f8ea39d7ae54 --- /dev/null +++ b/block/blk-mq-cpu.c @@ -0,0 +1,93 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/list.h> +#include <linux/llist.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" + +static LIST_HEAD(blk_mq_cpu_notify_list); +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); + +static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long) hcpu; + struct blk_mq_cpu_notifier *notify; + + spin_lock(&blk_mq_cpu_notify_lock); + + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) + notify->notify(notify->data, action, cpu); + + spin_unlock(&blk_mq_cpu_notify_lock); + return NOTIFY_OK; +} + +static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, + unsigned int cpu) +{ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + /* + * If the CPU goes away, ensure that we run any pending + * completions. 
+ */ + struct llist_node *node; + struct request *rq; + + local_irq_disable(); + + node = llist_del_all(&per_cpu(ipi_lists, cpu)); + while (node) { + struct llist_node *next = node->next; + + rq = llist_entry(node, struct request, ll_list); + __blk_mq_end_io(rq, rq->errors); + node = next; + } + + local_irq_enable(); + } +} + +static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = { + .notifier_call = blk_mq_main_cpu_notify, +}; + +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + BUG_ON(!notifier->notify); + + spin_lock(&blk_mq_cpu_notify_lock); + list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); + spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + spin_lock(&blk_mq_cpu_notify_lock); + list_del(¬ifier->list); + spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, + void (*fn)(void *, unsigned long, unsigned int), + void *data) +{ + notifier->notify = fn; + notifier->data = data; +} + +static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = { + .notify = blk_mq_cpu_notify, +}; + +void __init blk_mq_cpu_init(void) +{ + register_hotcpu_notifier(&blk_mq_main_cpu_notifier); + blk_mq_register_cpu_notifier(&cpu_notifier); +} diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c new file mode 100644 index 000000000000..f8721278601c --- /dev/null +++ b/block/blk-mq-cpumap.c @@ -0,0 +1,108 @@ +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" + +static void show_map(unsigned int *map, unsigned int nr) +{ + int i; + + pr_info("blk-mq: CPU -> queue map\n"); + for_each_online_cpu(i) + pr_info(" CPU%2u -> Queue %u\n", i, map[i]); +} + +static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, + const int cpu) +{ + return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); +} + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_thread_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + + return cpu; +} + +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +{ + unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; + cpumask_var_t cpus; + + if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) + return 1; + + cpumask_clear(cpus); + nr_cpus = nr_uniq_cpus = 0; + for_each_online_cpu(i) { + nr_cpus++; + first_sibling = get_first_sibling(i); + if (!cpumask_test_cpu(first_sibling, cpus)) + nr_uniq_cpus++; + cpumask_set_cpu(i, cpus); + } + + queue = 0; + for_each_possible_cpu(i) { + if (!cpu_online(i)) { + map[i] = 0; + continue; + } + + /* + * Easy case - we have equal or more hardware queues. Or + * there are no thread siblings to take into account. Do + * 1:1 if enough, or sequential mapping if less. + */ + if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { + map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); + queue++; + continue; + } + + /* + * Less then nr_cpus queues, and we have some number of + * threads per cores. Map sibling threads to the same + * queue. 
+ */ + first_sibling = get_first_sibling(i); + if (first_sibling == i) { + map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, + queue); + queue++; + } else + map[i] = map[first_sibling]; + } + + show_map(map, nr_cpus); + free_cpumask_var(cpus); + return 0; +} + +unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) +{ + unsigned int *map; + + /* If cpus are offline, map them to first hctx */ + map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, + reg->numa_node); + if (!map) + return NULL; + + if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) + return map; + + kfree(map); + return NULL; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c new file mode 100644 index 000000000000..ba6cf8e9aa0a --- /dev/null +++ b/block/blk-mq-sysfs.c @@ -0,0 +1,384 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/smp.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static void blk_mq_sysfs_release(struct kobject *kobj) +{ +} + +struct blk_mq_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_ctx *, char *); + ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); +}; + +struct blk_mq_hw_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_hw_ctx *, char *); + ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); +}; + +static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->show) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->show(ctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t re |