author     Jens Axboe <axboe@kernel.dk>    2013-11-08 09:08:12 -0700
committer  Jens Axboe <axboe@kernel.dk>    2013-11-08 09:08:12 -0700
commit     e37459b8e2c7db6735e39e019e448b76e5e77647 (patch)
tree       a3f0944db87a8ae0d41e5acbbbabc1e7ef534d1b /block
parent     c7d1ba417c7cb7297d14dd47a390ec90ce548d5c (diff)
parent     e7e245000110a7794de8f925b9edc06a9c852f80 (diff)
Merge branch 'blk-mq/core' into for-3.13/core
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Conflicts:
	block/blk-timeout.c
Diffstat (limited to 'block')
-rw-r--r--   block/Makefile            5
-rw-r--r--   block/blk-core.c        157
-rw-r--r--   block/blk-exec.c         14
-rw-r--r--   block/blk-flush.c       154
-rw-r--r--   block/blk-merge.c        17
-rw-r--r--   block/blk-mq-cpu.c       93
-rw-r--r--   block/blk-mq-cpumap.c   108
-rw-r--r--   block/blk-mq-sysfs.c    384
-rw-r--r--   block/blk-mq-tag.c      204
-rw-r--r--   block/blk-mq-tag.h       27
-rw-r--r--   block/blk-mq.c         1500
-rw-r--r--   block/blk-mq.h           52
-rw-r--r--   block/blk-sysfs.c        13
-rw-r--r--   block/blk-timeout.c      74
-rw-r--r--   block/blk.h              17
15 files changed, 2701 insertions, 118 deletions
diff --git a/block/Makefile b/block/Makefile index 671a83d063a5..20645e88fb57 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,8 +5,9 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ - partition-generic.o partitions/ + blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ + genhd.o scsi_ioctl.o partition-generic.o partitions/ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o diff --git a/block/blk-core.c b/block/blk-core.c index 25f13479f552..8bdd0121212a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,6 +16,7 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/kernel_stat.h> @@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida); /* * For the allocated request tables */ -static struct kmem_cache *request_cachep; +struct kmem_cache *request_cachep = NULL; /* * For queue allocation @@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; -static void drive_stat_acct(struct request *rq, int new_io) -{ - struct hd_struct *part; - int rw = rq_data_dir(rq); - int cpu; - - if (!blk_do_io_stat(rq)) - return; - - cpu = part_stat_lock(); - - if (!new_io) { - part = rq->part; - part_stat_inc(cpu, part, merges[rw]); - } else { - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - if (!hd_struct_try_get(part)) { - /* - * The partition is already being removed, - * the request will be accounted on the disk only - * - * We take a reference on disk->part0 although that - * partition will never be deleted, so we can treat - * it as any other partition. - */ - part = &rq->rq_disk->part0; - hd_struct_get(part); - } - part_round_stats(cpu, part); - part_inc_in_flight(part, rw); - rq->part = part; - } - - part_stat_unlock(); -} - void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -145,7 +110,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->cmd = rq->__cmd; rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; - rq->ref_count = 1; rq->start_time = jiffies; set_start_time_ns(rq); rq->part = NULL; @@ -174,9 +138,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) { int bit; - printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, + printk(KERN_INFO "%s: dev %s: type=%x, flags=%llx\n", msg, rq->rq_disk ? 
rq->rq_disk->disk_name : "?", rq->cmd_type, - rq->cmd_flags); + (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), @@ -595,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q) return NULL; + if (percpu_counter_init(&q->mq_usage_counter, 0)) + goto fail_q; + q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) - goto fail_q; + goto fail_c; q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; @@ -644,6 +611,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->bypass_depth = 1; __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); + init_waitqueue_head(&q->mq_freeze_wq); + if (blkcg_init_queue(q)) goto fail_bdi; @@ -653,6 +622,8 @@ fail_bdi: bdi_destroy(&q->backing_dev_info); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_c: + percpu_counter_destroy(&q->mq_usage_counter); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; @@ -1119,7 +1090,8 @@ retry: goto retry; } -struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +static struct request *blk_old_get_request(struct request_queue *q, int rw, + gfp_t gfp_mask) { struct request *rq; @@ -1136,6 +1108,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) return rq; } + +struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) +{ + if (q->mq_ops) + return blk_mq_alloc_request(q, rw, gfp_mask, false); + else + return blk_old_get_request(q, rw, gfp_mask); +} EXPORT_SYMBOL(blk_get_request); /** @@ -1221,7 +1201,7 @@ EXPORT_SYMBOL(blk_requeue_request); static void add_acct_request(struct request_queue *q, struct request *rq, int where) { - drive_stat_acct(rq, 1); + blk_account_io_start(rq, true); __elv_add_request(q, rq, where); } @@ -1282,8 +1262,6 @@ void __blk_put_request(struct request_queue *q, struct request *req) { if (unlikely(!q)) return; - if (unlikely(--req->ref_count)) - return; blk_pm_put_request(req); @@ -1312,12 +1290,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request); void blk_put_request(struct request *req) { - unsigned long flags; struct request_queue *q = req->q; - spin_lock_irqsave(q->queue_lock, flags); - __blk_put_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); + if (q->mq_ops) + blk_mq_free_request(req); + else { + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + __blk_put_request(q, req); + spin_unlock_irqrestore(q->queue_lock, flags); + } } EXPORT_SYMBOL(blk_put_request); @@ -1353,8 +1336,8 @@ void blk_add_request_payload(struct request *rq, struct page *page, } EXPORT_SYMBOL_GPL(blk_add_request_payload); -static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, - struct bio *bio) +bool bio_attempt_back_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1371,12 +1354,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } -static bool bio_attempt_front_merge(struct request_queue *q, - struct request *req, struct bio *bio) +bool bio_attempt_front_merge(struct request_queue *q, struct request *req, + struct bio *bio) { const int ff = bio->bi_rw & REQ_FAILFAST_MASK; @@ -1401,12 +1384,12 @@ static bool bio_attempt_front_merge(struct 
request_queue *q, req->__data_len += bio->bi_size; req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); - drive_stat_acct(req, 0); + blk_account_io_start(req, false); return true; } /** - * attempt_plug_merge - try to merge with %current's plugged list + * blk_attempt_plug_merge - try to merge with %current's plugged list * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests @@ -1422,12 +1405,13 @@ static bool bio_attempt_front_merge(struct request_queue *q, * reliable access to the elevator outside queue lock. Only check basic * merging parameters without querying the elevator. */ -static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int *request_count) { struct blk_plug *plug; struct request *rq; bool ret = false; + struct list_head *plug_list; if (blk_queue_nomerges(q)) goto out; @@ -1437,7 +1421,12 @@ static bool attempt_plug_merge(struct request_queue *q, struct bio *bio, goto out; *request_count = 0; - list_for_each_entry_reverse(rq, &plug->list, queuelist) { + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; if (rq->q == q) @@ -1505,7 +1494,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (attempt_plug_merge(q, bio, &request_count)) + if (blk_attempt_plug_merge(q, bio, &request_count)) return; spin_lock_irq(q->queue_lock); @@ -1573,7 +1562,7 @@ get_rq: } } list_add_tail(&req->queuelist, &plug->list); - drive_stat_acct(req, 1); + blk_account_io_start(req, true); } else { spin_lock_irq(q->queue_lock); add_acct_request(q, req, where); @@ -2027,7 +2016,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void blk_account_io_completion(struct request *req, unsigned int bytes) +void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { const int rw = rq_data_dir(req); @@ -2041,7 +2030,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) } } -static void blk_account_io_done(struct request *req) +void blk_account_io_done(struct request *req) { /* * Account IO completion. flush_rq isn't accounted as a @@ -2089,6 +2078,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q, } #endif +void blk_account_io_start(struct request *rq, bool new_io) +{ + struct hd_struct *part; + int rw = rq_data_dir(rq); + int cpu; + + if (!blk_do_io_stat(rq)) + return; + + cpu = part_stat_lock(); + + if (!new_io) { + part = rq->part; + part_stat_inc(cpu, part, merges[rw]); + } else { + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + if (!hd_struct_try_get(part)) { + /* + * The partition is already being removed, + * the request will be accounted on the disk only + * + * We take a reference on disk->part0 although that + * partition will never be deleted, so we can treat + * it as any other partition. 
+ */ + part = &rq->rq_disk->part0; + hd_struct_get(part); + } + part_round_stats(cpu, part); + part_inc_in_flight(part, rw); + rq->part = part; + } + + part_stat_unlock(); +} + /** * blk_peek_request - peek at the top of a request queue * @q: request queue to peek at @@ -2465,7 +2490,6 @@ static void blk_finish_request(struct request *req, int error) if (req->cmd_flags & REQ_DONTPREP) blk_unprep_request(req); - blk_account_io_done(req); if (req->end_io) @@ -2887,6 +2911,7 @@ void blk_start_plug(struct blk_plug *plug) plug->magic = PLUG_MAGIC; INIT_LIST_HEAD(&plug->list); + INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); /* @@ -2984,6 +3009,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(plug->magic != PLUG_MAGIC); flush_plug_callbacks(plug, from_schedule); + + if (!list_empty(&plug->mq_list)) + blk_mq_flush_plug_list(plug, from_schedule); + if (list_empty(&plug->list)) return; diff --git a/block/blk-exec.c b/block/blk-exec.c index ae4f27d7944e..c3edf9dff566 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/sched/sysctl.h> #include "blk.h" @@ -24,7 +25,6 @@ static void blk_end_sync_rq(struct request *rq, int error) struct completion *waiting = rq->end_io_data; rq->end_io_data = NULL; - __blk_put_request(rq->q, rq); /* * complete last, if this is a stack request the process (and thus @@ -59,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq->rq_disk = bd_disk; rq->end_io = done; + + if (q->mq_ops) { + blk_mq_insert_request(q, rq, true); + return; + } + /* * need to check this before __blk_run_queue(), because rq can * be freed before that returns. @@ -103,12 +109,6 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, int err = 0; unsigned long hang_check; - /* - * we need an extra reference to the request, so we can look at - * it after io completion - */ - rq->ref_count++; - if (!rq->sense) { memset(sense, 0, sizeof(sense)); rq->sense = sense; diff --git a/block/blk-flush.c b/block/blk-flush.c index cc2b827a853c..331e627301ea 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,8 +69,10 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> +#include <linux/blk-mq.h> #include "blk.h" +#include "blk-mq.h" /* FLUSH/FUA sequences */ enum { @@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq) /* make @rq a normal request */ rq->cmd_flags &= ~REQ_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; + + blk_clear_rq_complete(rq); +} + +static void mq_flush_data_run(struct work_struct *work) +{ + struct request *rq; + + rq = container_of(work, struct request, mq_flush_data); + + memset(&rq->csd, 0, sizeof(rq->csd)); + blk_mq_run_request(rq, true, false); +} + +static void blk_mq_flush_data_insert(struct request *rq) +{ + INIT_WORK(&rq->mq_flush_data, mq_flush_data_run); + kblockd_schedule_work(rq->q, &rq->mq_flush_data); } /** @@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq) * completion and trigger the next step. * * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) * * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. 
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, { struct request_queue *q = rq->q; struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; - bool queued = false; + bool queued = false, kicked; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; @@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &q->flush_data_in_flight); - list_add(&rq->queuelist, &q->queue_head); - queued = true; + if (q->mq_ops) + blk_mq_flush_data_insert(rq); + else { + list_add(&rq->queuelist, &q->queue_head); + queued = true; + } break; case REQ_FSEQ_DONE: @@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, BUG_ON(!list_empty(&rq->queuelist)); list_del_init(&rq->flush.list); blk_flush_restore_request(rq); - __blk_end_request_all(rq, error); + if (q->mq_ops) + blk_mq_end_io(rq, error); + else + __blk_end_request_all(rq, error); break; default: BUG(); } - return blk_kick_flush(q) | queued; + kicked = blk_kick_flush(q); + /* blk_mq_run_flush will run queue */ + if (q->mq_ops) + return queued; + return kicked | queued; } static void flush_end_io(struct request *flush_rq, int error) { struct request_queue *q = flush_rq->q; - struct list_head *running = &q->flush_queue[q->flush_running_idx]; + struct list_head *running; bool queued = false; struct request *rq, *n; + unsigned long flags = 0; + if (q->mq_ops) { + blk_mq_free_request(flush_rq); + spin_lock_irqsave(&q->mq_flush_lock, flags); + } + running = &q->flush_queue[q->flush_running_idx]; BUG_ON(q->flush_pending_idx == q->flush_running_idx); /* account completion of the flush request */ q->flush_running_idx ^= 1; - elv_completed_request(q, flush_rq); + + if (!q->mq_ops) + elv_completed_request(q, flush_rq); /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, flush.list) { @@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error) * directly into request_fn may confuse the driver. Always use * kblockd. */ - if (queued || q->flush_queue_delayed) - blk_run_queue_async(q); + if (queued || q->flush_queue_delayed) { + if (!q->mq_ops) + blk_run_queue_async(q); + else + /* + * This can be optimized to only run queues with requests + * queued if necessary. + */ + blk_mq_run_queues(q, true); + } q->flush_queue_delayed = 0; + if (q->mq_ops) + spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} + +static void mq_flush_work(struct work_struct *work) +{ + struct request_queue *q; + struct request *rq; + + q = container_of(work, struct request_queue, mq_flush_work); + + /* We don't need set REQ_FLUSH_SEQ, it's for consistency */ + rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, + __GFP_WAIT|GFP_ATOMIC, true); + rq->cmd_type = REQ_TYPE_FS; + rq->end_io = flush_end_io; + + blk_mq_run_request(rq, true, false); +} + +/* + * We can't directly use q->flush_rq, because it doesn't have tag and is not in + * hctx->rqs[]. so we must allocate a new request, since we can't sleep here, + * so offload the work to workqueue. + * + * Note: we assume a flush request finished in any hardware queue will flush + * the whole disk cache. + */ +static void mq_run_flush(struct request_queue *q) +{ + kblockd_schedule_work(q, &q->mq_flush_work); } /** @@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error) * Please read the comment at the top of this file for more info. 
* * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock or q->mq_flush_lock) * * RETURNS: * %true if flush was issued, %false otherwise. @@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q) * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ + q->flush_pending_idx ^= 1; + if (q->mq_ops) { + mq_run_flush(q); + return true; + } + blk_rq_init(q, &q->flush_rq); q->flush_rq.cmd_type = REQ_TYPE_FS; q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; q->flush_rq.rq_disk = first_rq->rq_disk; q->flush_rq.end_io = flush_end_io; - q->flush_pending_idx ^= 1; list_add_tail(&q->flush_rq.queuelist, &q->queue_head); return true; } @@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error) blk_run_queue_async(q); } +static void mq_flush_data_end_io(struct request *rq, int error) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + unsigned long flags; + + ctx = rq->mq_ctx; + hctx = q->mq_ops->map_queue(q, ctx->cpu); + + /* + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). + */ + spin_lock_irqsave(&q->mq_flush_lock, flags); + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error)) + blk_mq_run_hw_queue(hctx, true); + spin_unlock_irqrestore(&q->mq_flush_lock, flags); +} + /** * blk_insert_flush - insert a new FLUSH/FUA request * @rq: request to insert * * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. + * or __blk_mq_run_hw_queue() to dispatch request. * @rq is being submitted. Analyze what needs to be done and put it on the * right queue. * * CONTEXT: - * spin_lock_irq(q->queue_lock) + * spin_lock_irq(q->queue_lock) in !mq case */ void blk_insert_flush(struct request *rq) { @@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq) * complete the request. 
*/ if (!policy) { - __blk_end_bidi_request(rq, 0, 0, 0); + if (q->mq_ops) + blk_mq_end_io(rq, 0); + else + __blk_end_bidi_request(rq, 0, 0, 0); return; } @@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - list_add_tail(&rq->queuelist, &q->queue_head); + if (q->mq_ops) { + blk_mq_run_request(rq, false, true); + } else + list_add_tail(&rq->queuelist, &q->queue_head); return; } @@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq) INIT_LIST_HEAD(&rq->flush.list); rq->cmd_flags |= REQ_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ + if (q->mq_ops) { + rq->end_io = mq_flush_data_end_io; + + spin_lock_irq(&q->mq_flush_lock); + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&q->mq_flush_lock); + return; + } rq->end_io = flush_data_end_io; blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); @@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, return ret; } EXPORT_SYMBOL(blkdev_issue_flush); + +void blk_mq_init_flush(struct request_queue *q) +{ + spin_lock_init(&q->mq_flush_lock); + INIT_WORK(&q->mq_flush_work, mq_flush_work); +} diff --git a/block/blk-merge.c b/block/blk-merge.c index 5f2448253797..1ffc58977835 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -308,6 +308,17 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return ll_new_hw_segment(q, req, bio); } +/* + * blk-mq uses req->special to carry normal driver per-request payload, it + * does not indicate a prepared command that we cannot merge with. + */ +static bool req_no_special_merge(struct request *req) +{ + struct request_queue *q = req->q; + + return !q->mq_ops && req->special; +} + static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -319,7 +330,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, * First check if the either of the requests are re-queued * requests. Can't merge them if they are. 
*/ - if (req->special || next->special) + if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; /* @@ -416,7 +427,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (rq_data_dir(req) != rq_data_dir(next) || req->rq_disk != next->rq_disk - || next->special) + || req_no_special_merge(next)) return 0; if (req->cmd_flags & REQ_WRITE_SAME && @@ -515,7 +526,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device and not a special request */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) + if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c new file mode 100644 index 000000000000..f8ea39d7ae54 --- /dev/null +++ b/block/blk-mq-cpu.c @@ -0,0 +1,93 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/list.h> +#include <linux/llist.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" + +static LIST_HEAD(blk_mq_cpu_notify_list); +static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock); + +static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long) hcpu; + struct blk_mq_cpu_notifier *notify; + + spin_lock(&blk_mq_cpu_notify_lock); + + list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) + notify->notify(notify->data, action, cpu); + + spin_unlock(&blk_mq_cpu_notify_lock); + return NOTIFY_OK; +} + +static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action, + unsigned int cpu) +{ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + /* + * If the CPU goes away, ensure that we run any pending + * completions. 
+ */ + struct llist_node *node; + struct request *rq; + + local_irq_disable(); + + node = llist_del_all(&per_cpu(ipi_lists, cpu)); + while (node) { + struct llist_node *next = node->next; + + rq = llist_entry(node, struct request, ll_list); + __blk_mq_end_io(rq, rq->errors); + node = next; + } + + local_irq_enable(); + } +} + +static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = { + .notifier_call = blk_mq_main_cpu_notify, +}; + +void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + BUG_ON(!notifier->notify); + + spin_lock(&blk_mq_cpu_notify_lock); + list_add_tail(¬ifier->list, &blk_mq_cpu_notify_list); + spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier) +{ + spin_lock(&blk_mq_cpu_notify_lock); + list_del(¬ifier->list); + spin_unlock(&blk_mq_cpu_notify_lock); +} + +void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, + void (*fn)(void *, unsigned long, unsigned int), + void *data) +{ + notifier->notify = fn; + notifier->data = data; +} + +static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = { + .notify = blk_mq_cpu_notify, +}; + +void __init blk_mq_cpu_init(void) +{ + register_hotcpu_notifier(&blk_mq_main_cpu_notifier); + blk_mq_register_cpu_notifier(&cpu_notifier); +} diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c new file mode 100644 index 000000000000..f8721278601c --- /dev/null +++ b/block/blk-mq-cpumap.c @@ -0,0 +1,108 @@ +#include <linux/kernel.h> +#include <linux/threads.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <linux/blk-mq.h> +#include "blk.h" +#include "blk-mq.h" + +static void show_map(unsigned int *map, unsigned int nr) +{ + int i; + + pr_info("blk-mq: CPU -> queue map\n"); + for_each_online_cpu(i) + pr_info(" CPU%2u -> Queue %u\n", i, map[i]); +} + +static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, + const int cpu) +{ + return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); +} + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_thread_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + + return cpu; +} + +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +{ + unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; + cpumask_var_t cpus; + + if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) + return 1; + + cpumask_clear(cpus); + nr_cpus = nr_uniq_cpus = 0; + for_each_online_cpu(i) { + nr_cpus++; + first_sibling = get_first_sibling(i); + if (!cpumask_test_cpu(first_sibling, cpus)) + nr_uniq_cpus++; + cpumask_set_cpu(i, cpus); + } + + queue = 0; + for_each_possible_cpu(i) { + if (!cpu_online(i)) { + map[i] = 0; + continue; + } + + /* + * Easy case - we have equal or more hardware queues. Or + * there are no thread siblings to take into account. Do + * 1:1 if enough, or sequential mapping if less. + */ + if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) { + map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue); + queue++; + continue; + } + + /* + * Less then nr_cpus queues, and we have some number of + * threads per cores. Map sibling threads to the same + * queue. 
+ */ + first_sibling = get_first_sibling(i); + if (first_sibling == i) { + map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues, + queue); + queue++; + } else + map[i] = map[first_sibling]; + } + + show_map(map, nr_cpus); + free_cpumask_var(cpus); + return 0; +} + +unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) +{ + unsigned int *map; + + /* If cpus are offline, map them to first hctx */ + map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, + reg->numa_node); + if (!map) + return NULL; + + if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) + return map; + + kfree(map); + return NULL; +} diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c new file mode 100644 index 000000000000..ba6cf8e9aa0a --- /dev/null +++ b/block/blk-mq-sysfs.c @@ -0,0 +1,384 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <linux/smp.h> + +#include <linux/blk-mq.h> +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static void blk_mq_sysfs_release(struct kobject *kobj) +{ +} + +struct blk_mq_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_ctx *, char *); + ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); +}; + +struct blk_mq_hw_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_hw_ctx *, char *); + ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); +}; + +static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->show) + return -EIO; + + res = -ENOENT; + mutex_lock(&q->sysfs_lock); + if (!blk_queue_dying(q)) + res = entry->show(ctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t re |