From e795421e40b39bf874c5ed9434365fb4a4b6c5d1 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 12 Jan 2016 16:24:15 +0100
Subject: cfq-iosched: Don't group_idle if cfqq has big thinktime

There is no point in idling on a cfq group if the only cfq queue it
contains has too big a thinktime.

Signed-off-by: Jan Kara
Acked-by: Tejun Heo
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1f9093e901da..0a6a70a9bca8 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2897,6 +2897,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
+	struct cfq_rb_root *st = cfqq->service_tree;
 	struct cfq_io_cq *cic;
 	unsigned long sl, group_idle = 0;
 
@@ -2947,8 +2948,13 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 		return;
 	}
 
-	/* There are other queues in the group, don't do group idle */
-	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
+	/*
+	 * There are other queues in the group or this is the only queue and
+	 * it has too big thinktime, don't do group idle.
+	 */
+	if (group_idle &&
+	    (cfqq->cfqg->nr_cfqq > 1 ||
+	     cfq_io_thinktime_big(cfqd, &st->ttime, true)))
 		return;
 
 	cfq_mark_cfqq_wait_request(cfqq);
--
cgit v1.2.3


From 6c80731c75bd3289d35b89a21a4e317cbcb306e3 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 12 Jan 2016 16:24:16 +0100
Subject: cfq-iosched: Reorder checks in cfq_should_preempt()

Move the check for preemption by the rt class up. There is no
functional change, but it makes reasoning about the conditions simpler
since we can then be sure both cfq queues are from the same ioprio
class.

Acked-by: Tejun Heo
Signed-off-by: Jan Kara
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 0a6a70a9bca8..2544c219c00c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3959,6 +3959,13 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	if (cfq_slice_used(cfqq))
 		return true;
 
+	/*
+	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
+	 */
+	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
+		return true;
+
+	WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
 	/* Allow preemption only if we are idling on sync-noidle tree */
 	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
 	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
@@ -3973,12 +3980,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
 		return true;
 
-	/*
-	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
-	 */
-	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
-		return true;
-
 	/* An idle queue should not be idle now for some reason */
 	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
 		return true;
--
cgit v1.2.3


From a257ae3e482e1cbd5fc573efeef5a73bffe09757 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 12 Jan 2016 16:24:17 +0100
Subject: cfq-iosched: Allow sync noidle workloads to preempt each other

The original idea with preemption of sync noidle queues (introduced in
commit 718eee0579b8 "cfq-iosched: fairness for sync no-idle queues")
was that we service all sync noidle queues together, we don't idle on
any of the queues individually, and we idle only if there is no sync
noidle queue to be served. This intention also matches the original
test:

	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
	    new_cfqq->service_tree == cfqq->service_tree)
		return true;

However, since at that time cfqq->service_tree was not set for idling
queues, this test was unreliable and was replaced in commit
e4a229196a7c "cfq-iosched: fix no-idle preemption logic" by:

	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
	    new_cfqq->service_tree->count == 1)
		return true;

That was a reliable test but it actually did something different - now
we preempt a sync noidle queue only if the new queue is the only one
busy in the service tree. These days a cfq queue is kept in the
service tree even if it is idling, so the original check would be safe
again. But since we already check that the cfq queues are in the same
cgroup, of the same priority class, and of workload type sync noidle,
we know that new_cfqq is fine to preempt cfqq. So just remove the
service tree check.

Acked-by: Tejun Heo
Signed-off-by: Jan Kara
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 2544c219c00c..98569c2f373e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3969,7 +3969,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	/* Allow preemption only if we are idling on sync-noidle tree */
 	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
 	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
-	    new_cfqq->service_tree->count == 2 &&
 	    RB_EMPTY_ROOT(&cfqq->sort_list))
 		return true;
--
cgit v1.2.3


From 3984aa55204e2c3f423a70b013c44c64261788df Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 12 Jan 2016 16:24:19 +0100
Subject: cfq-iosched: Allow parent cgroup to preempt its child

Currently we don't allow a sync workload of one cgroup to preempt a
sync workload of any other cgroup. This is because we want to achieve
service separation between cgroups. However, in cases where the
preempting cgroup is an ancestor of the current cgroup, there is no
need for separation and idling introduces unnecessary overhead. This
hurts, for example, the case when a workload is isolated within a
cgroup but journalling threads are in the root cgroup.

A simple way to demonstrate the issue is using:

	dbench4 -c /usr/share/dbench4/client.txt -t 10 -D /mnt 1

on an ext4 filesystem on a plain SATA drive (mounted with barrier=0 to
make the difference more visible). When all processes are in the root
cgroup, the reported throughput is 153.132 MB/sec. When the dbench
process gets its own blkio cgroup, the reported throughput drops to
26.1006 MB/sec.

Fix the problem by making the check in cfq_should_preempt() more
benevolent and allowing preemption by an ancestor cgroup.
This improves the throughput reported by dbench4 to 48.9106 MB/sec.

Acked-by: Tejun Heo
Signed-off-by: Jan Kara
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 98569c2f373e..e3c591dd8f19 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -632,6 +632,13 @@ static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
 	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
 }
 
+static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
+				      struct cfq_group *ancestor)
+{
+	return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
+				    cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
+}
+
 static inline void cfqg_get(struct cfq_group *cfqg)
 {
 	return blkg_get(cfqg_to_blkg(cfqg));
@@ -758,6 +765,11 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
 #else /* CONFIG_CFQ_GROUP_IOSCHED */
 
 static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
+static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
+				      struct cfq_group *ancestor)
+{
+	return true;
+}
 static inline void cfqg_get(struct cfq_group *cfqg) { }
 static inline void cfqg_put(struct cfq_group *cfqg) { }
 
@@ -3953,7 +3965,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
 		return true;
 
-	if (new_cfqq->cfqg != cfqq->cfqg)
+	/*
+	 * Treat ancestors of current cgroup the same way as current cgroup.
+	 * For anybody else we disallow preemption to guarantee service
+	 * fairness among cgroups.
+	 */
+	if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
 		return false;
 
 	if (cfq_slice_used(cfqq))
--
cgit v1.2.3


From 868f2f0b72068a097508b6e8870a8950fd8eb7ef Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Thu, 17 Dec 2015 17:08:14 -0700
Subject: blk-mq: dynamic h/w context count

The hardware's provided queue count may change at runtime with
resource provisioning. This patch allows a block driver to alter the
number of h/w queues available when its resource count changes.

The main part is a new blk-mq API to request a new number of h/w
queues for a given live tag set. The new API freezes all queues using
that set, then adjusts the allocated count prior to remapping these to
CPUs.

The bulk of the rest just shifts where h/w contexts and all their
artifacts are allocated and freed.

The maximum number of h/w contexts is capped at the number of possible
CPUs since there is no use for more than that. As such, all
pre-allocated memory for pointers needs to account for the maximum
possible count rather than the initial number of queues.

A side effect of this is that blk-mq will now proceed successfully as
long as it can allocate at least one h/w context. Previously it would
fail request queue initialization if less than the requested number
was allocated.
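As a rough sketch of how a driver might consume the new interface (the
wrapper and its trigger are hypothetical; only
blk_mq_update_nr_hw_queues() itself is added by this patch):

	#include <linux/blk-mq.h>

	/*
	 * Hypothetical driver hook, invoked after the controller
	 * re-negotiates its queue resources.  The call freezes all request
	 * queues sharing the tag set, reallocates h/w contexts and remaps
	 * the sw queues before unfreezing; values above nr_cpu_ids are
	 * clamped inside blk-mq.
	 */
	static void example_adjust_queue_count(struct blk_mq_tag_set *set,
					       int new_nr_hw_queues)
	{
		blk_mq_update_nr_hw_queues(set, new_nr_hw_queues);
	}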
Signed-off-by: Keith Busch
Reviewed-by: Christoph Hellwig
Tested-by: Jon Derrick
Signed-off-by: Jens Axboe
---
 block/blk-mq-sysfs.c |   9 +--
 block/blk-mq.c       | 173 +++++++++++++++++++++++++++++++--------------------
 block/blk-mq.h       |   1 +
 3 files changed, 110 insertions(+), 73 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 1cf18784c5cf..431fdda21737 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -408,17 +408,18 @@ void blk_mq_unregister_disk(struct gendisk *disk)
 	blk_mq_enable_hotplug();
 }
 
+void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
+{
+	kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
+}
+
 static void blk_mq_sysfs_init(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	int i;
 
 	kobject_init(&q->mq_kobj, &blk_mq_ktype);
 
-	queue_for_each_hw_ctx(q, hctx, i)
-		kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
-
 	queue_for_each_ctx(q, ctx, i)
 		kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c0622fae413..645eb9e716d0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1742,31 +1742,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	return -1;
 }
 
-static int blk_mq_init_hw_queues(struct request_queue *q,
-		struct blk_mq_tag_set *set)
-{
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-
-	/*
-	 * Initialize hardware queues
-	 */
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (blk_mq_init_hctx(q, set, hctx, i))
-			break;
-	}
-
-	if (i == q->nr_hw_queues)
-		return 0;
-
-	/*
-	 * Init failed
-	 */
-	blk_mq_exit_hw_queues(q, set, i);
-
-	return 1;
-}
-
 static void blk_mq_init_cpu_queues(struct request_queue *q,
 				   unsigned int nr_hw_queues)
 {
@@ -1824,6 +1799,7 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 			continue;
 
 		hctx = q->mq_ops->map_queue(q, i);
+		cpumask_set_cpu(i, hctx->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 
@@ -1972,54 +1948,89 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_init_queue);
 
-struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
-						  struct request_queue *q)
+static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
+						struct request_queue *q)
 {
-	struct blk_mq_hw_ctx **hctxs;
-	struct blk_mq_ctx __percpu *ctx;
-	unsigned int *map;
-	int i;
-
-	ctx = alloc_percpu(struct blk_mq_ctx);
-	if (!ctx)
-		return ERR_PTR(-ENOMEM);
-
-	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
-			set->numa_node);
-
-	if (!hctxs)
-		goto err_percpu;
-
-	map = blk_mq_make_queue_map(set);
-	if (!map)
-		goto err_map;
+	int i, j;
+	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
 
+	blk_mq_sysfs_unregister(q);
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		int node = blk_mq_hw_queue_to_node(map, i);
+		int node;
 
+		if (hctxs[i])
+			continue;
+
+		node = blk_mq_hw_queue_to_node(q->mq_map, i);
 		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
 					GFP_KERNEL, node);
 		if (!hctxs[i])
-			goto err_hctxs;
+			break;
 
 		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
-						node))
-			goto err_hctxs;
+						node)) {
+			kfree(hctxs[i]);
+			hctxs[i] = NULL;
+			break;
+		}
 
 		atomic_set(&hctxs[i]->nr_active, 0);
 		hctxs[i]->numa_node = node;
 		hctxs[i]->queue_num = i;
+
+		if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
+			free_cpumask_var(hctxs[i]->cpumask);
+			kfree(hctxs[i]);
+			hctxs[i] = NULL;
+			break;
+		}
+		blk_mq_hctx_kobj_init(hctxs[i]);
 	}
+	for (j = i; j < q->nr_hw_queues; j++) {
+		struct blk_mq_hw_ctx *hctx = hctxs[j];
+
+		if (hctx) {
+			if (hctx->tags) {
+				blk_mq_free_rq_map(set, hctx->tags, j);
+				set->tags[j] = NULL;
+			}
+			blk_mq_exit_hctx(q, set, hctx, j);
+			free_cpumask_var(hctx->cpumask);
+			kobject_put(&hctx->kobj);
+			kfree(hctx->ctxs);
+			kfree(hctx);
+			hctxs[j] = NULL;
+
+		}
+	}
+	q->nr_hw_queues = i;
+	blk_mq_sysfs_register(q);
+}
+
+struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+						  struct request_queue *q)
+{
+	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
+	if (!q->queue_ctx)
+		return ERR_PTR(-ENOMEM);
+
+	q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
+						GFP_KERNEL, set->numa_node);
+	if (!q->queue_hw_ctx)
+		goto err_percpu;
+
+	q->mq_map = blk_mq_make_queue_map(set);
+	if (!q->mq_map)
+		goto err_map;
+
+	blk_mq_realloc_hw_ctxs(set, q);
+	if (!q->nr_hw_queues)
+		goto err_hctxs;
 
 	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
 	q->nr_queues = nr_cpu_ids;
-	q->nr_hw_queues = set->nr_hw_queues;
-	q->mq_map = map;
-
-	q->queue_ctx = ctx;
-	q->queue_hw_ctx = hctxs;
 
 	q->mq_ops = set->ops;
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
@@ -2048,9 +2059,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
-	if (blk_mq_init_hw_queues(q, set))
-		goto err_hctxs;
-
 	get_online_cpus();
 	mutex_lock(&all_q_mutex);
 
@@ -2064,17 +2072,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	return q;
 
 err_hctxs:
-	kfree(map);
-	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (!hctxs[i])
-			break;
-		free_cpumask_var(hctxs[i]->cpumask);
-		kfree(hctxs[i]);
-	}
+	kfree(q->mq_map);
 err_map:
-	kfree(hctxs);
+	kfree(q->queue_hw_ctx);
 err_percpu:
-	free_percpu(ctx);
+	free_percpu(q->queue_ctx);
 	return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
@@ -2282,9 +2284,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 		set->nr_hw_queues = 1;
 		set->queue_depth = min(64U, set->queue_depth);
 	}
+	/*
+	 * There is no use for more h/w queues than cpus.
+	 */
+	if (set->nr_hw_queues > nr_cpu_ids)
+		set->nr_hw_queues = nr_cpu_ids;
 
-	set->tags = kmalloc_node(set->nr_hw_queues *
-				 sizeof(struct blk_mq_tags *),
+	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
 				 GFP_KERNEL, set->numa_node);
 	if (!set->tags)
 		return -ENOMEM;
@@ -2307,7 +2313,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
 	int i;
 
-	for (i = 0; i < set->nr_hw_queues; i++) {
+	for (i = 0; i < nr_cpu_ids; i++) {
 		if (set->tags[i])
 			blk_mq_free_rq_map(set, set->tags[i], i);
 	}
@@ -2339,6 +2345,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	return ret;
 }
 
+void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
+{
+	struct request_queue *q;
+
+	if (nr_hw_queues > nr_cpu_ids)
+		nr_hw_queues = nr_cpu_ids;
+	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
+		return;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		blk_mq_freeze_queue(q);
+
+	set->nr_hw_queues = nr_hw_queues;
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_realloc_hw_ctxs(set, q);
+
+		if (q->nr_hw_queues > 1)
+			blk_queue_make_request(q, blk_mq_make_request);
+		else
+			blk_queue_make_request(q, blk_sq_make_request);
+
+		blk_mq_queue_reinit(q, cpu_online_mask);
+	}
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		blk_mq_unfreeze_queue(q);
+}
+EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+
 void blk_mq_disable_hotplug(void)
 {
 	mutex_lock(&all_q_mutex);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index eaede8e45c9c..9087b11037b7 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -57,6 +57,7 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
  */
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
+extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
 
 extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
--
cgit v1.2.3


From d5df731ab804e0d917f44099bfeb88a5f1488a3d Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Wed, 10 Feb 2016 16:52:47 -0700
Subject: block: Initialize max_dev_sectors to 0

The new queue limit is not used by the majority of block drivers, and
should be initialized to 0 for the driver's requested settings to be
used.

Signed-off-by: Keith Busch
Acked-by: Martin K. Petersen
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/blk-settings.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index dd4973583978..c7bb666aafd1 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -91,8 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	lim->virt_boundary_mask = 0;
 	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
-	lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors =
-		BLK_SAFE_MAX_SECTORS;
+	lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+	lim->max_dev_sectors = 0;
 	lim->chunk_sectors = 0;
 	lim->max_write_same_sectors = 0;
 	lim->max_discard_sectors = 0;
--
cgit v1.2.3


From 66841672161efb9e3be4a1dbd9755020bb1d86b7 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 12 Feb 2016 15:27:00 +0800
Subject: blk-mq: mark request queue as mq asap

Currently q->mq_ops is used widely to decide if the queue is mq or
not, so we should set the flag asap so that both block core and
drivers can get the correct mq info.
For example, commit 868f2f0b720 ("blk-mq: dynamic h/w context count")
moves the hctx's initialization before the setting of q->mq_ops in
blk_mq_init_allocated_queue(), which causes blk_alloc_flush_queue() to
think the queue is non-mq and not allocate the command size for the
per-hctx flush rq.

This patch should fix the problem reported by Sasha.

Cc: Keith Busch
Reported-by: Sasha Levin
Signed-off-by: Ming Lei
Fixes: 868f2f0b720 ("blk-mq: dynamic h/w context count")
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 645eb9e716d0..f539a53d16c3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2010,6 +2010,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 						  struct request_queue *q)
 {
+	/* mark the queue as mq asap */
+	q->mq_ops = set->ops;
+
 	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
 	if (!q->queue_ctx)
 		return ERR_PTR(-ENOMEM);
@@ -2032,7 +2035,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 	q->nr_queues = nr_cpu_ids;
 
-	q->mq_ops = set->ops;
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
--
cgit v1.2.3


From e9137d4b93078b6a9965acfb18a2a2ad91cf8405 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Thu, 18 Feb 2016 14:56:35 -0700
Subject: blk-mq: Fix NULL pointer updating nr_requests

A h/w context's tags are freed if it was not assigned a CPU. Check if
the context has tags before updating the depth.

Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f539a53d16c3..5667f59c277c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2336,6 +2336,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 
 	ret = 0;
 	queue_for_each_hw_ctx(q, hctx, i) {
+		if (!hctx->tags)
+			continue;
 		ret = blk_mq_tag_update_depth(hctx->tags, nr);
 		if (ret)
 			break;
--
cgit v1.2.3


From af3e3a5259e35d7056fbe568a0ffcbd1420e1742 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 11 Mar 2016 17:34:50 +0100
Subject: block: don't unnecessarily clobber bi_error for chained bios

Only overwrite the parent's bi_error if it was zero. That way a
successful bio completion doesn't clear an error reported earlier.

Reported-by: Brian Foster
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/bio.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index dbabd48b1934..282ca2e5aaf2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -300,7 +300,8 @@ static void bio_chain_endio(struct bio *bio)
 {
 	struct bio *parent = bio->bi_private;
 
-	parent->bi_error = bio->bi_error;
+	if (!parent->bi_error)
+		parent->bi_error = bio->bi_error;
 	bio_endio(parent);
 	bio_put(bio);
 }
@@ -1753,7 +1754,9 @@ void bio_endio(struct bio *bio)
 		 */
 		if (bio->bi_end_io == bio_chain_endio) {
 			struct bio *parent = bio->bi_private;
-			parent->bi_error = bio->bi_error;
+
+			if (!parent->bi_error)
+				parent->bi_error = bio->bi_error;
 			bio_put(bio);
 			bio = parent;
 		} else {
--
cgit v1.2.3


From 38f8baae890561203ba6093f76b14576ce9b271b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 11 Mar 2016 17:34:51 +0100
Subject: block: factor out chained bio completion

Factor the code common to bio_chain_endio and bio_endio into a shared
helper.
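To make the chaining pattern this helper serves concrete, here is a
hedged sketch of a driver-style split-and-chain sequence; the function
and its arguments are illustrative, not from the patch, and it assumes
the 4.5-era bio_split(), bio_chain() and generic_make_request() APIs:

	#include <linux/bio.h>

	/*
	 * Illustrative only: split 'bio', chain the front half to it, and
	 * submit both.  The parent's bi_end_io runs once the remainder and
	 * the chained child have both completed -- via bio_chain_endio(),
	 * which is exactly the completion path these patches touch.
	 */
	static void example_split_and_chain(struct bio *bio, int mid_sectors,
					    struct bio_set *bs)
	{
		struct bio *split = bio_split(bio, mid_sectors, GFP_NOIO, bs);

		bio_chain(split, bio);	/* 'bio' completes after 'split' */
		generic_make_request(split);
		generic_make_request(bio);
	}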
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/bio.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index 282ca2e5aaf2..67e51ace1b77 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -296,14 +296,19 @@ void bio_reset(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_reset);
 
-static void bio_chain_endio(struct bio *bio)
+static struct bio *__bio_chain_endio(struct bio *bio)
 {
 	struct bio *parent = bio->bi_private;
 
 	if (!parent->bi_error)
 		parent->bi_error = bio->bi_error;
-	bio_endio(parent);
 	bio_put(bio);
+	return parent;
+}
+
+static void bio_chain_endio(struct bio *bio)
+{
+	bio_endio(__bio_chain_endio(bio));
 }
 
 /*
@@ -1753,12 +1758,7 @@ void bio_endio(struct bio *bio)
 		 * pointers also disables gcc's sibling call optimization.
 		 */
 		if (bio->bi_end_io == bio_chain_endio) {
-			struct bio *parent = bio->bi_private;
-
-			if (!parent->bi_error)
-				parent->bi_error = bio->bi_error;
-			bio_put(bio);
-			bio = parent;
+			bio = __bio_chain_endio(bio);
 		} else {
 			if (bio->bi_end_io)
 				bio->bi_end_io(bio);
--
cgit v1.2.3


From ba8c6967b7391aab8fa562611fe637a57850b4aa Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 11 Mar 2016 17:34:52 +0100
Subject: block: cleanup bio_endio

Replace the while loop that unnecessarily checks for a NULL bio in the
fast path with a simple goto loop.

Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/bio.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index 67e51ace1b77..e4682ec11fcd 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1745,26 +1745,25 @@ static inline bool bio_remaining_done(struct bio *bio)
  **/
 void bio_endio(struct bio *bio)
 {
-	while (bio) {
-		if (unlikely(!bio_remaining_done(bio)))
-			break;
+again:
+	if (unlikely(!bio_remaining_done(bio)))
+		return;
 
-		/*
-		 * Need to have a real endio function for chained bios,
-		 * otherwise various corner cases will break (like stacking
-		 * block devices that save/restore bi_end_io) - however, we want
-		 * to avoid unbounded recursion and blowing the stack. Tail call
-		 * optimization would handle this, but compiling with frame
-		 * pointers also disables gcc's sibling call optimization.
-		 */
-		if (bio->bi_end_io == bio_chain_endio) {
-			bio = __bio_chain_endio(bio);
-		} else {
-			if (bio->bi_end_io)
-				bio->bi_end_io(bio);
-			bio = NULL;
-		}
+	/*
+	 * Need to have a real endio function for chained bios, otherwise
+	 * various corner cases will break (like stacking block devices that
+	 * save/restore bi_end_io) - however, we want to avoid unbounded
+	 * recursion and blowing the stack. Tail call optimization would
+	 * handle this, but compiling with frame pointers also disables
+	 * gcc's sibling call optimization.
+	 */
+	if (bio->bi_end_io == bio_chain_endio) {
+		bio = __bio_chain_endio(bio);
+		goto again;
 	}
+
+	if (bio->bi_end_io)
+		bio->bi_end_io(bio);
 }
 EXPORT_SYMBOL(bio_endio);
--
cgit v1.2.3


From 2b885517110cbe8724fef30363778b6284d0a428 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 11 Mar 2016 17:34:53 +0100
Subject: block: bio_remaining_done() isn't unlikely

We use bio chaining during most I/Os these days due to the delayed
bio splitting. Additionally, XFS will start using it, and there is a
pending direct I/O rewrite also making heavy use of it. Don't pretend
it's always unlikely, and let the branch predictor do its job instead.
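For readers unfamiliar with the annotation being dropped, unlikely()
is only a static layout hint to the compiler; the kernel's
non-profiling definition reduces to __builtin_expect:

	/* From include/linux/compiler.h (non-profiling variant): */
	#define unlikely(x)	__builtin_expect(!!(x), 0)

With chaining now common, !bio_remaining_done(bio) is frequently true,
so the static hint pessimizes the common path and the hardware branch
predictor does better on its own.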
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index e4682ec11fcd..0fde6e0e81f2 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1746,7 +1746,7 @@ static inline bool bio_remaining_done(struct bio *bio)
 void bio_endio(struct bio *bio)
 {
 again:
-	if (unlikely(!bio_remaining_done(bio)))
+	if (!bio_remaining_done(bio))
 		return;
 
 	/*
--
cgit v1.2.3


From 4ee86babe09f0682a60b1c56be99819bbe4ba62c Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Tue, 15 Mar 2016 12:03:28 -0700
Subject: blk-mq: add bounds check on tag-to-rq conversion

We need to check for a valid index before accessing the array element
to avoid accessing invalid memory regions.

Reviewed-by: Christoph Hellwig
Reviewed-by: Jeff Moyer

Modified by Jens to drop the unlikely(), and to make the fall-through
path the one with a valid tag.

Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5667f59c277c..261b6feddae6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -544,7 +544,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list);
 
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
-	return tags->rqs[tag];
+	if (tag < tags->nr_tags)
+		return tags->rqs[tag];
+
+	return NULL;
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
--
cgit v1.2.3
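A driver completion path might rely on the new NULL return roughly as
follows. This is a hedged sketch: the function, its arguments and the
error message are illustrative, and blk_mq_complete_request() is used
in its 4.5-era two-argument form.

	#include <linux/blk-mq.h>

	/*
	 * Illustrative only: map a raw completion tag from hardware to a
	 * request.  With the bounds check above, a corrupt tag now yields
	 * NULL instead of an out-of-bounds read of tags->rqs[].
	 */
	static void example_complete_tag(struct blk_mq_tags *tags, u16 hw_tag)
	{
		struct request *rq = blk_mq_tag_to_rq(tags, hw_tag);

		if (!rq) {
			pr_err("invalid completion tag %u\n", hw_tag);
			return;
		}
		blk_mq_complete_request(rq, 0);
	}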