summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-14 15:32:19 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-14 15:32:19 -0800
commite2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch)
treeb97a90170c45211bcc437761653aa8016c34afcd /block
parentabc36be236358162202e86ad88616ff95a755101 (diff)
parenta04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff)
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe: "This is the main pull request for block storage for 4.15-rc1. Nothing out of the ordinary in here, and no API changes or anything like that. Just various new features for drivers, core changes, etc. In particular, this pull request contains: - A patch series from Bart, closing the whole on blk/scsi-mq queue quescing. - A series from Christoph, building towards hidden gendisks (for multipath) and ability to move bio chains around. - NVMe - Support for native multipath for NVMe (Christoph). - Userspace notifications for AENs (Keith). - Command side-effects support (Keith). - SGL support (Chaitanya Kulkarni) - FC fixes and improvements (James Smart) - Lots of fixes and tweaks (Various) - bcache - New maintainer (Michael Lyle) - Writeback control improvements (Michael) - Various fixes (Coly, Elena, Eric, Liang, et al) - lightnvm updates, mostly centered around the pblk interface (Javier, Hans, and Rakesh). - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph) - Writeback series that fix the much discussed hundreds of millions of sync-all units. This goes all the way, as discussed previously (me). - Fix for missing wakeup on writeback timer adjustments (Yafang Shao). - Fix laptop mode on blk-mq (me). - {mq,name} tupple lookup for IO schedulers, allowing us to have alias names. This means you can use 'deadline' on both !mq and on mq (where it's called mq-deadline). (me). - blktrace race fix, oopsing on sg load (me). - blk-mq optimizations (me). - Obscure waitqueue race fix for kyber (Omar). - NBD fixes (Josef). - Disable writeback throttling by default on bfq, like we do on cfq (Luca Miccio). - Series from Ming that enable us to treat flush requests on blk-mq like any other request. This is a really nice cleanup. - Series from Ming that improves merging on blk-mq with schedulers, getting us closer to flipping the switch on scsi-mq again. - BFQ updates (Paolo). - blk-mq atomic flags memory ordering fixes (Peter Z). - Loop cgroup support (Shaohua). - Lots of minor fixes from lots of different folks, both for core and driver code" * 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits) nvme: fix visibility of "uuid" ns attribute blk-mq: fixup some comment typos and lengths ide: ide-atapi: fix compile error with defining macro DEBUG blk-mq: improve tag waiting setup for non-shared tags brd: remove unused brd_mutex blk-mq: only run the hardware queue if IO is pending block: avoid null pointer dereference on null disk fs: guard_bio_eod() needs to consider partitions xtensa/simdisk: fix compile error nvme: expose subsys attribute to sysfs nvme: create 'slaves' and 'holders' entries for hidden controllers block: create 'slaves' and 'holders' entries for hidden gendisks nvme: also expose the namespace identification sysfs files for mpath nodes nvme: implement multipath access to nvme subsystems nvme: track shared namespaces nvme: introduce a nvme_ns_ids structure nvme: track subsystems block, nvme: Introduce blk_mq_req_flags_t block, scsi: Make SCSI quiesce and resume work reliably block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag ...
Diffstat (limited to 'block')
-rw-r--r--block/bfq-iosched.c225
-rw-r--r--block/bio-integrity.c7
-rw-r--r--block/bio.c40
-rw-r--r--block/blk-cgroup.c9
-rw-r--r--block/blk-core.c274
-rw-r--r--block/blk-flush.c37
-rw-r--r--block/blk-lib.c108
-rw-r--r--block/blk-mq-debugfs.c3
-rw-r--r--block/blk-mq-sched.c203
-rw-r--r--block/blk-mq-tag.c11
-rw-r--r--block/blk-mq-tag.h7
-rw-r--r--block/blk-mq.c422
-rw-r--r--block/blk-mq.h60
-rw-r--r--block/blk-settings.c2
-rw-r--r--block/blk-stat.c45
-rw-r--r--block/blk-throttle.c12
-rw-r--r--block/blk-timeout.c5
-rw-r--r--block/blk-wbt.c2
-rw-r--r--block/blk.h46
-rw-r--r--block/bsg.c18
-rw-r--r--block/elevator.c67
-rw-r--r--block/genhd.c70
-rw-r--r--block/ioctl.c19
-rw-r--r--block/kyber-iosched.c12
-rw-r--r--block/mq-deadline.c1
-rw-r--r--block/scsi_ioctl.c8
26 files changed, 1097 insertions, 616 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index a4783da90ba8..889a8549d97f 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -108,6 +108,7 @@
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include "bfq-iosched.h"
+#include "blk-wbt.h"
#define BFQ_BFQQ_FNS(name) \
void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
@@ -724,6 +725,44 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
}
}
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+ u64 dur;
+
+ if (bfqd->bfq_wr_max_time > 0)
+ return bfqd->bfq_wr_max_time;
+
+ dur = bfqd->RT_prod;
+ do_div(dur, bfqd->peak_rate);
+
+ /*
+ * Limit duration between 3 and 13 seconds. Tests show that
+ * higher values than 13 seconds often yield the opposite of
+ * the desired result, i.e., worsen responsiveness by letting
+ * non-interactive and non-soft-real-time applications
+ * preserve weight raising for a too long time interval.
+ *
+ * On the other end, lower values than 3 seconds make it
+ * difficult for most interactive tasks to complete their jobs
+ * before weight-raising finishes.
+ */
+ if (dur > msecs_to_jiffies(13000))
+ dur = msecs_to_jiffies(13000);
+ else if (dur < msecs_to_jiffies(3000))
+ dur = msecs_to_jiffies(3000);
+
+ return dur;
+}
+
+/* switch back from soft real-time to interactive weight raising */
+static void switch_back_to_interactive_wr(struct bfq_queue *bfqq,
+ struct bfq_data *bfqd)
+{
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt;
+}
+
static void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
struct bfq_io_cq *bic, bool bfq_already_existing)
@@ -750,10 +789,16 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
time_is_before_jiffies(bfqq->last_wr_start_finish +
bfqq->wr_cur_max_time))) {
- bfq_log_bfqq(bfqq->bfqd, bfqq,
- "resume state: switching off wr");
-
- bfqq->wr_coeff = 1;
+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+ !bfq_bfqq_in_large_burst(bfqq) &&
+ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt +
+ bfq_wr_duration(bfqd))) {
+ switch_back_to_interactive_wr(bfqq, bfqd);
+ } else {
+ bfqq->wr_coeff = 1;
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "resume state: switching off wr");
+ }
}
/* make sure weight will be updated, however we got here */
@@ -1173,33 +1218,22 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
return wr_or_deserves_wr;
}
-static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+/*
+ * Return the farthest future time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_greatest_from_now(void)
{
- u64 dur;
-
- if (bfqd->bfq_wr_max_time > 0)
- return bfqd->bfq_wr_max_time;
-
- dur = bfqd->RT_prod;
- do_div(dur, bfqd->peak_rate);
-
- /*
- * Limit duration between 3 and 13 seconds. Tests show that
- * higher values than 13 seconds often yield the opposite of
- * the desired result, i.e., worsen responsiveness by letting
- * non-interactive and non-soft-real-time applications
- * preserve weight raising for a too long time interval.
- *
- * On the other end, lower values than 3 seconds make it
- * difficult for most interactive tasks to complete their jobs
- * before weight-raising finishes.
- */
- if (dur > msecs_to_jiffies(13000))
- dur = msecs_to_jiffies(13000);
- else if (dur < msecs_to_jiffies(3000))
- dur = msecs_to_jiffies(3000);
+ return jiffies + MAX_JIFFY_OFFSET;
+}
- return dur;
+/*
+ * Return the farthest past time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_smallest_from_now(void)
+{
+ return jiffies - MAX_JIFFY_OFFSET;
}
static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
@@ -1216,7 +1250,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
bfqq->wr_coeff = bfqd->bfq_wr_coeff;
bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
} else {
- bfqq->wr_start_at_switch_to_srt = jiffies;
+ /*
+ * No interactive weight raising in progress
+ * here: assign minus infinity to
+ * wr_start_at_switch_to_srt, to make sure
+ * that, at the end of the soft-real-time
+ * weight raising periods that is starting
+ * now, no interactive weight-raising period
+ * may be wrongly considered as still in
+ * progress (and thus actually started by
+ * mistake).
+ */
+ bfqq->wr_start_at_switch_to_srt =
+ bfq_smallest_from_now();
bfqq->wr_coeff = bfqd->bfq_wr_coeff *
BFQ_SOFTRT_WEIGHT_FACTOR;
bfqq->wr_cur_max_time =
@@ -2016,10 +2062,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
- bic->saved_wr_coeff = bfqq->wr_coeff;
- bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+ if (unlikely(bfq_bfqq_just_created(bfqq) &&
+ !bfq_bfqq_in_large_burst(bfqq))) {
+ /*
+ * bfqq being merged right after being created: bfqq
+ * would have deserved interactive weight raising, but
+ * did not make it to be set in a weight-raised state,
+ * because of this early merge. Store directly the
+ * weight-raising state that would have been assigned
+ * to bfqq, so that to avoid that bfqq unjustly fails
+ * to enjoy weight raising if split soon.
+ */
+ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
+ bic->saved_last_wr_start_finish = jiffies;
+ } else {
+ bic->saved_wr_coeff = bfqq->wr_coeff;
+ bic->saved_wr_start_at_switch_to_srt =
+ bfqq->wr_start_at_switch_to_srt;
+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+ }
}
static void
@@ -2897,24 +2960,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
}
-/*
- * Return the farthest future time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_greatest_from_now(void)
-{
- return jiffies + MAX_JIFFY_OFFSET;
-}
-
-/*
- * Return the farthest past time instant according to jiffies
- * macros.
- */
-static unsigned long bfq_smallest_from_now(void)
-{
- return jiffies - MAX_JIFFY_OFFSET;
-}
-
/**
* bfq_bfqq_expire - expire a queue.
* @bfqd: device owning the queue.
@@ -3489,11 +3534,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_wr_duration(bfqd)))
bfq_bfqq_end_wr(bfqq);
else {
- /* switch back to interactive wr */
- bfqq->wr_coeff = bfqd->bfq_wr_coeff;
- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
- bfqq->last_wr_start_finish =
- bfqq->wr_start_at_switch_to_srt;
+ switch_back_to_interactive_wr(bfqq, bfqd);
bfqq->entity.prio_changed = 1;
}
}
@@ -3685,16 +3726,37 @@ void bfq_put_queue(struct bfq_queue *bfqq)
if (bfqq->ref)
return;
- if (bfq_bfqq_sync(bfqq))
+ if (!hlist_unhashed(&bfqq->burst_list_node)) {
+ hlist_del_init(&bfqq->burst_list_node);
/*
- * The fact that this queue is being destroyed does not
- * invalidate the fact that this queue may have been
- * activated during the current burst. As a consequence,
- * although the queue does not exist anymore, and hence
- * needs to be removed from the burst list if there,
- * the burst size has not to be decremented.
+ * Decrement also burst size after the removal, if the
+ * process associated with bfqq is exiting, and thus
+ * does not contribute to the burst any longer. This
+ * decrement helps filter out false positives of large
+ * bursts, when some short-lived process (often due to
+ * the execution of commands by some service) happens
+ * to start and exit while a complex application is
+ * starting, and thus spawning several processes that
+ * do I/O (and that *must not* be treated as a large
+ * burst, see comments on bfq_handle_burst).
+ *
+ * In particular, the decrement is performed only if:
+ * 1) bfqq is not a merged queue, because, if it is,
+ * then this free of bfqq is not triggered by the exit
+ * of the process bfqq is associated with, but exactly
+ * by the fact that bfqq has just been merged.
+ * 2) burst_size is greater than 0, to handle
+ * unbalanced decrements. Unbalanced decrements may
+ * happen in te following case: bfqq is inserted into
+ * the current burst list--without incrementing
+ * bust_size--because of a split, but the current
+ * burst list is not the burst list bfqq belonged to
+ * (see comments on the case of a split in
+ * bfq_set_request).
*/
- hlist_del_init(&bfqq->burst_list_node);
+ if (bfqq->bic && bfqq->bfqd->burst_size > 0)
+ bfqq->bfqd->burst_size--;
+ }
kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -4127,7 +4189,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
new_bfqq->allocated++;
bfqq->allocated--;
new_bfqq->ref++;
- bfq_clear_bfqq_just_created(bfqq);
/*
* If the bic associated with the process
* issuing this request still points to bfqq
@@ -4139,6 +4200,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
bfqq, new_bfqq);
+
+ bfq_clear_bfqq_just_created(bfqq);
/*
* rq is about to be enqueued into new_bfqq,
* release rq reference on bfqq
@@ -4424,6 +4487,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
else {
bfq_clear_bfqq_in_large_burst(bfqq);
if (bic->was_in_burst_list)
+ /*
+ * If bfqq was in the current
+ * burst list before being
+ * merged, then we have to add
+ * it back. And we do not need
+ * to increase burst_size, as
+ * we did not decrement
+ * burst_size when we removed
+ * bfqq from the burst list as
+ * a consequence of a merge
+ * (see comments in
+ * bfq_put_queue). In this
+ * respect, it would be rather
+ * costly to know whether the
+ * current burst list is still
+ * the same burst list from
+ * which bfqq was removed on
+ * the merge. To avoid this
+ * cost, if bfqq was in a
+ * burst list, then we add
+ * bfqq to the current burst
+ * list without any further
+ * check. This can cause
+ * inappropriate insertions,
+ * but rarely enough to not
+ * harm the detection of large
+ * bursts significantly.
+ */
hlist_add_head(&bfqq->burst_list_node,
&bfqd->burst_list);
}
@@ -4775,7 +4866,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfq_init_root_group(bfqd->root_group, bfqd);
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
-
+ wbt_disable_default(q);
return 0;
out_free:
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 5df32907ff3b..23b42e8aa03e 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -485,11 +485,8 @@ EXPORT_SYMBOL(bioset_integrity_create);
void bioset_integrity_free(struct bio_set *bs)
{
- if (bs->bio_integrity_pool)
- mempool_destroy(bs->bio_integrity_pool);
-
- if (bs->bvec_integrity_pool)
- mempool_destroy(bs->bvec_integrity_pool);
+ mempool_destroy(bs->bio_integrity_pool);
+ mempool_destroy(bs->bvec_integrity_pool);
}
EXPORT_SYMBOL(bioset_integrity_free);
diff --git a/block/bio.c b/block/bio.c
index cc60213e56d8..b94a802f8ba3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -400,7 +400,7 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
/**
* bio_alloc_bioset - allocate a bio for I/O
- * @gfp_mask: the GFP_ mask given to the slab allocator
+ * @gfp_mask: the GFP_* mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
@@ -1931,11 +1931,8 @@ void bioset_free(struct bio_set *bs)
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
- if (bs->bio_pool)
- mempool_destroy(bs->bio_pool);
-
- if (bs->bvec_pool)
- mempool_destroy(bs->bvec_pool);
+ mempool_destroy(bs->bio_pool);
+ mempool_destroy(bs->bvec_pool);
bioset_integrity_free(bs);
bio_put_slab(bs);
@@ -2036,37 +2033,6 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
EXPORT_SYMBOL_GPL(bio_associate_blkcg);
/**
- * bio_associate_current - associate a bio with %current
- * @bio: target bio
- *
- * Associate @bio with %current if it hasn't been associated yet. Block
- * layer will treat @bio as if it were issued by %current no matter which
- * task actually issues it.
- *
- * This function takes an extra reference of @task's io_context and blkcg
- * which will be put when @bio is released. The caller must own @bio,
- * ensure %current->io_context exists, and is responsible for synchronizing
- * calls to this function.
- */
-int bio_associate_current(struct bio *bio)
-{
- struct io_context *ioc;
-
- if (bio->bi_css)
- return -EBUSY;
-
- ioc = current->io_context;
- if (!ioc)
- return -ENOENT;
-
- get_io_context_active(ioc);
- bio->bi_ioc = ioc;
- bio->bi_css = task_get_css(current, io_cgrp_id);
- return 0;
-}
-EXPORT_SYMBOL_GPL(bio_associate_current);
-
-/**
* bio_disassociate_task - undo bio_associate_current()
* @bio: target bio
*/
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d3f56baee936..4117524ca45b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1419,6 +1419,11 @@ int blkcg_policy_register(struct blkcg_policy *pol)
if (i >= BLKCG_MAX_POLS)
goto err_unlock;
+ /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
+ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
+ (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+ goto err_unlock;
+
/* register @pol */
pol->plid = i;
blkcg_policy[pol->plid] = pol;
@@ -1452,7 +1457,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
return 0;
err_free_cpds:
- if (pol->cpd_alloc_fn) {
+ if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
@@ -1492,7 +1497,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* remove cpds and unregister */
mutex_lock(&blkcg_pol_mutex);
- if (pol->cpd_alloc_fn) {
+ if (pol->cpd_free_fn) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
if (blkcg->cpd[pol->plid]) {
pol->cpd_free_fn(blkcg->cpd[pol->plid]);
diff --git a/block/blk-core.c b/block/blk-core.c
index 048be4aa6024..7c54c195e79e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -333,11 +333,13 @@ EXPORT_SYMBOL(blk_stop_queue);
void blk_sync_queue(struct request_queue *q)
{
del_timer_sync(&q->timeout);
+ cancel_work_sync(&q->timeout_work);
if (q->mq_ops) {
struct blk_mq_hw_ctx *hctx;
int i;
+ cancel_delayed_work_sync(&q->requeue_work);
queue_for_each_hw_ctx(q, hctx, i)
cancel_delayed_work_sync(&hctx->run_work);
} else {
@@ -347,6 +349,37 @@ void blk_sync_queue(struct request_queue *q)
EXPORT_SYMBOL(blk_sync_queue);
/**
+ * blk_set_preempt_only - set QUEUE_FLAG_PREEMPT_ONLY
+ * @q: request queue pointer
+ *
+ * Returns the previous value of the PREEMPT_ONLY flag - 0 if the flag was not
+ * set and 1 if the flag was already set.
+ */
+int blk_set_preempt_only(struct request_queue *q)
+{
+ unsigned long flags;
+ int res;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ res = queue_flag_test_and_set(QUEUE_FLAG_PREEMPT_ONLY, q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(blk_set_preempt_only);
+
+void blk_clear_preempt_only(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ queue_flag_clear(QUEUE_FLAG_PREEMPT_ONLY, q);
+ wake_up_all(&q->mq_freeze_wq);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blk_clear_preempt_only);
+
+/**
* __blk_run_queue_uncond - run a queue whether or not it has been stopped
* @q: The queue to run
*
@@ -610,6 +643,9 @@ void blk_set_queue_dying(struct request_queue *q)
}
spin_unlock_irq(q->queue_lock);
}
+
+ /* Make blk_queue_enter() reexamine the DYING flag. */
+ wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_set_queue_dying);
@@ -718,7 +754,7 @@ static void free_request_size(void *element, void *data)
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask)
{
- if (unlikely(rl->rq_pool))
+ if (unlikely(rl->rq_pool) || q->mq_ops)
return 0;
rl->q = q;
@@ -760,15 +796,38 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
}
EXPORT_SYMBOL(blk_alloc_queue);
-int blk_queue_enter(struct request_queue *q, bool nowait)
+/**
+ * blk_queue_enter() - try to increase q->q_usage_counter
+ * @q: request queue pointer
+ * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
+ */
+int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
+ const bool preempt = flags & BLK_MQ_REQ_PREEMPT;
+
while (true) {
+ bool success = false;
int ret;
- if (percpu_ref_tryget_live(&q->q_usage_counter))
+ rcu_read_lock_sched();
+ if (percpu_ref_tryget_live(&q->q_usage_counter)) {
+ /*
+ * The code that sets the PREEMPT_ONLY flag is
+ * responsible for ensuring that that flag is globally
+ * visible before the queue is unfrozen.
+ */
+ if (preempt || !blk_queue_preempt_only(q)) {
+ success = true;
+ } else {
+ percpu_ref_put(&q->q_usage_counter);
+ }
+ }
+ rcu_read_unlock_sched();
+
+ if (success)
return 0;
- if (nowait)
+ if (flags & BLK_MQ_REQ_NOWAIT)
return -EBUSY;
/*
@@ -781,7 +840,8 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
smp_rmb();
ret = wait_event_interruptible(q->mq_freeze_wq,
- !atomic_read(&q->mq_freeze_depth) ||
+ (atomic_read(&q->mq_freeze_depth) == 0 &&
+ (preempt || !blk_queue_preempt_only(q))) ||
blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
@@ -844,6 +904,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
setup_timer(&q->backing_dev_info->laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
+ INIT_WORK(&q->timeout_work, NULL);
INIT_LIST_HEAD(&q->queue_head);
INIT_LIST_HEAD(&q->timeout_list);
INIT_LIST_HEAD(&q->icq_list);
@@ -1154,7 +1215,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
* @rl: request list to allocate from
* @op: operation and flags
* @bio: bio to allocate request for (can be %NULL)
- * @gfp_mask: allocation mask
+ * @flags: BLQ_MQ_REQ_* flags
*
* Get a free request from @q. This function may fail under memory
* pressure or if @q is dead.
@@ -1164,7 +1225,7 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
* Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *__get_request(struct request_list *rl, unsigned int op,
- struct bio *bio, gfp_t gfp_mask)
+ struct bio *bio, blk_mq_req_flags_t flags)
{
struct request_queue *q = rl->q;
struct request *rq;
@@ -1173,6 +1234,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
struct io_cq *icq = NULL;
const bool is_sync = op_is_sync(op);
int may_queue;
+ gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
+ __GFP_DIRECT_RECLAIM;
req_flags_t rq_flags = RQF_ALLOCED;
lockdep_assert_held(q->queue_lock);
@@ -1255,6 +1318,8 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
blk_rq_set_rl(rq, rl);
rq->cmd_flags = op;
rq->rq_flags = rq_flags;
+ if (flags & BLK_MQ_REQ_PREEMPT)
+ rq->rq_flags |= RQF_PREEMPT;
/* init elvpriv */
if (rq_flags & RQF_ELVPRIV) {
@@ -1333,7 +1398,7 @@ rq_starved:
* @q: request_queue to allocate request from
* @op: operation and flags
* @bio: bio to allocate request for (can be %NULL)
- * @gfp_mask: allocation mask
+ * @flags: BLK_MQ_REQ_* flags.
*
* Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask,
* this function keeps retrying under memory pressure and fails iff @q is dead.
@@ -1343,7 +1408,7 @@ rq_starved:
* Returns request pointer on success, with @q->queue_lock *not held*.
*/
static struct request *get_request(struct request_queue *q, unsigned int op,
- struct bio *bio, gfp_t gfp_mask)
+ struct bio *bio, blk_mq_req_flags_t flags)
{
const bool is_sync = op_is_sync(op);
DEFINE_WAIT(wait);
@@ -1355,7 +1420,7 @@ static struct request *get_request(struct request_queue *q, unsigned int op,
rl = blk_get_rl(q, bio); /* transferred to @rq on success */
retry:
- rq = __get_request(rl, op, bio, gfp_mask);
+ rq = __get_request(rl, op, bio, flags);
if (!IS_ERR(rq))
return rq;
@@ -1364,7 +1429,7 @@ retry:
return ERR_PTR(-EAGAIN);
}
- if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) {
+ if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
blk_put_rl(rl);
return rq;
}
@@ -1391,20 +1456,28 @@ retry:
goto retry;
}
+/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
static struct request *blk_old_get_request(struct request_queue *q,
- unsigned int op, gfp_t gfp_mask)
+ unsigned int op, blk_mq_req_flags_t flags)
{
struct request *rq;
+ gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC :
+ __GFP_DIRECT_RECLAIM;
+ int ret = 0;
WARN_ON_ONCE(q->mq_ops);
/* create ioc upfront */
create_io_context(gfp_mask, q->node);
+ ret = blk_queue_enter(q, flags);
+ if (ret)
+ return ERR_PTR(ret);
spin_lock_irq(q->queue_lock);
- rq = get_request(q, op, NULL, gfp_mask);
+ rq = get_request(q, op, NULL, flags);
if (IS_ERR(rq)) {
spin_unlock_irq(q->queue_lock);
+ blk_queue_exit(q);
return rq;
}
@@ -1415,25 +1488,40 @@ static struct request *blk_old_get_request(struct request_queue *q,
return rq;
}
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
- gfp_t gfp_mask)
+/**
+ * blk_get_request_flags - allocate a request
+ * @q: request queue to allocate a request for
+ * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
+ * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
+ */
+struct request *blk_get_request_flags(struct request_queue *q, unsigned int op,
+ blk_mq_req_flags_t flags)
{
struct request *req;
+ WARN_ON_ONCE(op & REQ_NOWAIT);
+ WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
+
if (q->mq_ops) {
- req = blk_mq_alloc_request(q, op,
- (gfp_mask & __GFP_DIRECT_RECLAIM) ?
- 0 : BLK_MQ_REQ_NOWAIT);
+ req = blk_mq_alloc_request(q, op, flags);
if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
q->mq_ops->initialize_rq_fn(req);
} else {
- req = blk_old_get_request(q, op, gfp_mask);
+ req = blk_old_get_request(q, op, flags);
if (!IS_ERR(req) && q->initialize_rq_fn)
q->initialize_rq_fn(req);
}
return req;
}
+EXPORT_SYMBOL(blk_get_request_flags);
+
+struct request *blk_get_request(struct request_queue *q, unsigned int op,
+ gfp_t gfp_mask)
+{
+ return blk_get_request_flags(q, op, gfp_mask & __GFP_DIRECT_RECLAIM ?
+ 0 : BLK_MQ_REQ_NOWAIT);
+}
EXPORT_SYMBOL(blk_get_request);
/**
@@ -1576,6 +1664,7 @@ void __blk_put_request(struct request_queue *q, struct request *req)
blk_free_request(rl, req);
freed_request(rl, sync, rq_flags);
blk_put_rl(rl);
+ blk_queue_exit(q);
}
}
EXPORT_SYMBOL_GPL(__blk_put_request);
@@ -1857,8 +1946,10 @@ get_rq:
* Grab a free request. This is might sleep but can not fail.
* Returns with the queue unlocked.
*/
- req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
+ blk_queue_enter_live(q);
+ req = get_request(q, bio->bi_opf, bio, 0);
if (IS_ERR(req)) {
+ blk_queue_exit(q);
__wbt_done(q->rq_wb, wb_acct);
if (PTR_ERR(req) == -ENOMEM)
bio->bi_status = BLK_STS_RESOURCE;
@@ -2200,8 +2291,10 @@ blk_qc_t generic_make_request(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
+ blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
+ BLK_MQ_REQ_NOWAIT : 0;
- if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
+ if (likely(blk_queue_enter(q, flags) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
@@ -2242,6 +2335,40 @@ out:
EXPORT_SYMBOL(generic_make_request);
/**
+ * direct_make_request - hand a buffer directly to its device driver for I/O
+ * @bio: The bio describing the location in memory and on the device.
+ *
+ * This function behaves like generic_make_request(), but does not protect
+ * against recursion. Must only be used if the called driver is known
+ * to not call generic_make_request (or direct_make_request) again from
+ * its make_request function. (Calling direct_make_request again from
+ * a workqueue is perfectly fine as that doesn't recurse).
+ */
+blk_qc_t direct_make_request(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
+ blk_qc_t ret;
+
+ if (!generic_make_request_checks(bio))
+ return BLK_QC_T_NONE;
+
+ if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
+ if (nowait && !blk_queue_dying(q))
+ bio->bi_status = BLK_STS_AGAIN;
+ else
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);