Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig.iosched      7
-rw-r--r--  block/bfq-cgroup.c       212
-rw-r--r--  block/bfq-iosched.c      967
-rw-r--r--  block/bfq-iosched.h       48
-rw-r--r--  block/bio.c               86
-rw-r--r--  block/blk-cgroup.c       139
-rw-r--r--  block/blk-core.c         111
-rw-r--r--  block/blk-iolatency.c      8
-rw-r--r--  block/blk-map.c           10
-rw-r--r--  block/blk-merge.c        110
-rw-r--r--  block/blk-mq-debugfs.c    42
-rw-r--r--  block/blk-mq-sched.c      26
-rw-r--r--  block/blk-mq-sched.h      10
-rw-r--r--  block/blk-mq.c            28
-rw-r--r--  block/blk.h               36
-rw-r--r--  block/genhd.c              5
-rw-r--r--  block/kyber-iosched.c      5
-rw-r--r--  block/mq-deadline.c        5
-rw-r--r--  block/opal_proto.h        16
-rw-r--r--  block/sed-opal.c         197
20 files changed, 1324 insertions, 744 deletions
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 4626b88b2d5a..7a6b2f29a582 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -36,6 +36,13 @@ config BFQ_GROUP_IOSCHED
Enable hierarchical scheduling in BFQ, using the blkio
(cgroups-v1) or io (cgroups-v2) controller.
+config BFQ_CGROUP_DEBUG
+ bool "BFQ IO controller debugging"
+ depends on BFQ_GROUP_IOSCHED
+ ---help---
+ Enable some debugging help. Currently it exports additional stat
+ files in a cgroup which can be useful for debugging.
+
endmenu
endif
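
For context, the new option can only be selected once hierarchical BFQ scheduling is enabled. A kernel .config that turns the extra statistics on would contain, for example (assuming a built-in BFQ; BFQ may also be built as a module):

    CONFIG_IOSCHED_BFQ=y
    CONFIG_BFQ_GROUP_IOSCHED=y
    CONFIG_BFQ_CGROUP_DEBUG=y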
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b3796a40a61a..0f6cd688924f 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -15,7 +15,83 @@
#include "bfq-iosched.h"
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp)
+{
+ int ret;
+
+ ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+ if (ret)
+ return ret;
+
+ atomic64_set(&stat->aux_cnt, 0);
+ return 0;
+}
+
+static void bfq_stat_exit(struct bfq_stat *stat)
+{
+ percpu_counter_destroy(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_add - add a value to a bfq_stat
+ * @stat: target bfq_stat
+ * @val: value to add
+ *
+ * Add @val to @stat. The caller must ensure that IRQs on the same CPU
+ * don't re-enter this function for the same counter.
+ */
+static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val)
+{
+ percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
+}
+
+/**
+ * bfq_stat_read - read the current value of a bfq_stat
+ * @stat: bfq_stat to read
+ */
+static inline uint64_t bfq_stat_read(struct bfq_stat *stat)
+{
+ return percpu_counter_sum_positive(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_reset - reset a bfq_stat
+ * @stat: bfq_stat to reset
+ */
+static inline void bfq_stat_reset(struct bfq_stat *stat)
+{
+ percpu_counter_set(&stat->cpu_cnt, 0);
+ atomic64_set(&stat->aux_cnt, 0);
+}
+
+/**
+ * bfq_stat_add_aux - add a bfq_stat into another's aux count
+ * @to: the destination bfq_stat
+ * @from: the source
+ *
+ * Add @from's count including the aux one to @to's aux count.
+ */
+static inline void bfq_stat_add_aux(struct bfq_stat *to,
+ struct bfq_stat *from)
+{
+ atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt),
+ &to->aux_cnt);
+}
+
+/**
+ * blkg_prfill_stat - prfill callback for bfq_stat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the bfq_stat in @pd
+ *
+ * prfill callback for printing a bfq_stat.
+ */
+static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off));
+}
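
Taken together, the helpers above form a small self-contained counter API: a cheap per-CPU hot path (bfq_stat_add) and slower aggregation paths (bfq_stat_read, bfq_stat_add_aux). A minimal sketch of how the pieces fit together, with a hypothetical caller that is not part of the patch:

    static int bfq_stat_example(void)
    {
        struct bfq_stat st;
        int ret = bfq_stat_init(&st, GFP_KERNEL); /* allocates the percpu counter */

        if (ret)
            return ret;

        bfq_stat_add(&st, 1);                     /* per-cpu, batched increment */
        pr_info("value: %llu\n", bfq_stat_read(&st)); /* sums across all CPUs */
        bfq_stat_reset(&st);                      /* zeroes counter and aux count */
        bfq_stat_exit(&st);                       /* frees the percpu counter */
        return 0;
    }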
/* bfqg stats flags */
enum bfqg_stats_flags {
@@ -53,7 +129,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
now = ktime_get_ns();
if (now > stats->start_group_wait_time)
- blkg_stat_add(&stats->group_wait_time,
+ bfq_stat_add(&stats->group_wait_time,
now - stats->start_group_wait_time);
bfqg_stats_clear_waiting(stats);
}
@@ -82,14 +158,14 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
now = ktime_get_ns();
if (now > stats->start_empty_time)
- blkg_stat_add(&stats->empty_time,
+ bfq_stat_add(&stats->empty_time,
now - stats->start_empty_time);
bfqg_stats_clear_empty(stats);
}
void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
{
- blkg_stat_add(&bfqg->stats.dequeue, 1);
+ bfq_stat_add(&bfqg->stats.dequeue, 1);
}
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
@@ -119,7 +195,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
u64 now = ktime_get_ns();
if (now > stats->start_idle_time)
- blkg_stat_add(&stats->idle_time,
+ bfq_stat_add(&stats->idle_time,
now - stats->start_idle_time);
bfqg_stats_clear_idling(stats);
}
@@ -137,9 +213,9 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
{
struct bfqg_stats *stats = &bfqg->stats;
- blkg_stat_add(&stats->avg_queue_size_sum,
+ bfq_stat_add(&stats->avg_queue_size_sum,
blkg_rwstat_total(&stats->queued));
- blkg_stat_add(&stats->avg_queue_size_samples, 1);
+ bfq_stat_add(&stats->avg_queue_size_samples, 1);
bfqg_stats_update_group_wait_time(stats);
}
@@ -176,7 +252,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
io_start_time_ns - start_time_ns);
}
-#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#else /* CONFIG_BFQ_CGROUP_DEBUG */
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
unsigned int op) { }
@@ -190,7 +266,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
-#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -274,18 +350,18 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg)
/* @stats = 0 */
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* queued stats shouldn't be cleared */
blkg_rwstat_reset(&stats->merged);
blkg_rwstat_reset(&stats->service_time);
blkg_rwstat_reset(&stats->wait_time);
- blkg_stat_reset(&stats->time);
- blkg_stat_reset(&stats->avg_queue_size_sum);
- blkg_stat_reset(&stats->avg_queue_size_samples);
- blkg_stat_reset(&stats->dequeue);
- blkg_stat_reset(&stats->group_wait_time);
- blkg_stat_reset(&stats->idle_time);
- blkg_stat_reset(&stats->empty_time);
+ bfq_stat_reset(&stats->time);
+ bfq_stat_reset(&stats->avg_queue_size_sum);
+ bfq_stat_reset(&stats->avg_queue_size_samples);
+ bfq_stat_reset(&stats->dequeue);
+ bfq_stat_reset(&stats->group_wait_time);
+ bfq_stat_reset(&stats->idle_time);
+ bfq_stat_reset(&stats->empty_time);
#endif
}
@@ -295,19 +371,19 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
if (!to || !from)
return;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* queued stats shouldn't be cleared */
blkg_rwstat_add_aux(&to->merged, &from->merged);
blkg_rwstat_add_aux(&to->service_time, &from->service_time);
blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
- blkg_stat_add_aux(&from->time, &from->time);
- blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
- blkg_stat_add_aux(&to->avg_queue_size_samples,
+ bfq_stat_add_aux(&to->time, &from->time);
+ bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+ bfq_stat_add_aux(&to->avg_queue_size_samples,
&from->avg_queue_size_samples);
- blkg_stat_add_aux(&to->dequeue, &from->dequeue);
- blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
- blkg_stat_add_aux(&to->idle_time, &from->idle_time);
- blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+ bfq_stat_add_aux(&to->dequeue, &from->dequeue);
+ bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+ bfq_stat_add_aux(&to->idle_time, &from->idle_time);
+ bfq_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}
@@ -355,35 +431,35 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
blkg_rwstat_exit(&stats->merged);
blkg_rwstat_exit(&stats->service_time);
blkg_rwstat_exit(&stats->wait_time);
blkg_rwstat_exit(&stats->queued);
- blkg_stat_exit(&stats->time);
- blkg_stat_exit(&stats->avg_queue_size_sum);
- blkg_stat_exit(&stats->avg_queue_size_samples);
- blkg_stat_exit(&stats->dequeue);
- blkg_stat_exit(&stats->group_wait_time);
- blkg_stat_exit(&stats->idle_time);
- blkg_stat_exit(&stats->empty_time);
+ bfq_stat_exit(&stats->time);
+ bfq_stat_exit(&stats->avg_queue_size_sum);
+ bfq_stat_exit(&stats->avg_queue_size_samples);
+ bfq_stat_exit(&stats->dequeue);
+ bfq_stat_exit(&stats->group_wait_time);
+ bfq_stat_exit(&stats->idle_time);
+ bfq_stat_exit(&stats->empty_time);
#endif
}
static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) ||
blkg_rwstat_init(&stats->service_time, gfp) ||
blkg_rwstat_init(&stats->wait_time, gfp) ||
blkg_rwstat_init(&stats->queued, gfp) ||
- blkg_stat_init(&stats->time, gfp) ||
- blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
- blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
- blkg_stat_init(&stats->dequeue, gfp) ||
- blkg_stat_init(&stats->group_wait_time, gfp) ||
- blkg_stat_init(&stats->idle_time, gfp) ||
- blkg_stat_init(&stats->empty_time, gfp)) {
+ bfq_stat_init(&stats->time, gfp) ||
+ bfq_stat_init(&stats->avg_queue_size_sum, gfp) ||
+ bfq_stat_init(&stats->avg_queue_size_samples, gfp) ||
+ bfq_stat_init(&stats->dequeue, gfp) ||
+ bfq_stat_init(&stats->group_wait_time, gfp) ||
+ bfq_stat_init(&stats->idle_time, gfp) ||
+ bfq_stat_init(&stats->empty_time, gfp)) {
bfqg_stats_exit(stats);
return -ENOMEM;
}
@@ -909,7 +985,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
return ret ?: nbytes;
}
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
static int bfqg_print_stat(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
@@ -927,17 +1003,34 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v)
static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
- &blkcg_policy_bfq, off);
+ struct blkcg_gq *blkg = pd_to_blkg(pd);
+ struct blkcg_gq *pos_blkg;
+ struct cgroup_subsys_state *pos_css;
+ u64 sum = 0;
+
+ lockdep_assert_held(&blkg->q->queue_lock);
+
+ rcu_read_lock();
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ struct bfq_stat *stat;
+
+ if (!pos_blkg->online)
+ continue;
+
+ stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off;
+ sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt);
+ }
+ rcu_read_unlock();
+
return __blkg_prfill_u64(sf, pd, sum);
}
static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
- &blkcg_policy_bfq,
- off);
+ struct blkg_rwstat_sample sum;
+
+ blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
return __blkg_prfill_rwstat(sf, pd, &sum);
}
@@ -975,12 +1068,13 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
- offsetof(struct blkcg_gq, stat_bytes));
- u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
- atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+ struct blkg_rwstat_sample tmp;
- return __blkg_prfill_u64(sf, pd, sum >> 9);
+ blkg_rwstat_recursive_sum(pd->blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes), &tmp);
+
+ return __blkg_prfill_u64(sf, pd,
+ (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9);
}
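
(The closing shift converts the summed byte counters to 512-byte sectors: for example, 1048576 bytes >> 9 = 2048 sectors.)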
static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
@@ -995,11 +1089,11 @@ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
struct bfq_group *bfqg = pd_to_bfqg(pd);
- u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+ u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples);
u64 v = 0;
if (samples) {
- v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+ v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum);
v = div64_u64(v, samples);
}
__blkg_prfill_u64(sf, pd, v);
@@ -1014,7 +1108,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
0, false);
return 0;
}
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
@@ -1062,7 +1156,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show = blkg_print_stat_ios,
},
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
.name = "bfq.time",
.private = offsetof(struct bfq_group, stats.time),
@@ -1092,7 +1186,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = offsetof(struct bfq_group, stats.queued),
.seq_show = bfqg_print_rwstat,
},
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
/* the same statistics which cover the bfqg and its descendants */
{
@@ -1105,7 +1199,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show = blkg_print_stat_ios_recursive,
},
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
.name = "bfq.time_recursive",
.private = offsetof(struct bfq_group, stats.time),
@@ -1159,7 +1253,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = offsetof(struct bfq_group, stats.dequeue),
.seq_show = bfqg_print_stat,
},
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
{ } /* terminate */
};
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index f8d430f88d25..06c9b00507b6 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -157,6 +157,7 @@ BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
#undef BFQ_BFQQ_FNS \
/* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -1427,17 +1428,19 @@ static int bfq_min_budget(struct bfq_data *bfqd)
* mechanism may be re-designed in such a way to make it possible to
* know whether preemption is needed without needing to update service
* trees). In addition, queue preemptions almost always cause random
- * I/O, and thus loss of throughput. Because of these facts, the next
- * function adopts the following simple scheme to avoid both costly
- * operations and too frequent preemptions: it requests the expiration
- * of the in-service queue (unconditionally) only for queues that need
- * to recover a hole, or that either are weight-raised or deserve to
- * be weight-raised.
+ * I/O, which may in turn cause loss of throughput. Finally, there may
+ * even be no in-service queue when the next function is invoked (so,
+ * no queue to compare timestamps with). Because of these facts, the
+ * next function adopts the following simple scheme to avoid costly
+ * operations, too frequent preemptions and too many dependencies on
+ * the state of the scheduler: it requests the expiration of the
+ * in-service queue (unconditionally) only for queues that need to
+ * recover a hole. Then it delegates to other parts of the code the
+ * responsibility of handling the above case 2.
*/
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
- bool arrived_in_time,
- bool wr_or_deserves_wr)
+ bool arrived_in_time)
{
struct bfq_entity *entity = &bfqq->entity;
@@ -1492,7 +1495,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
entity->budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(bfqq->next_rq, bfqq));
bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
- return wr_or_deserves_wr;
+ return false;
}
/*
@@ -1610,6 +1613,36 @@ static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
bfqd->bfq_wr_min_idle_time);
}
+
+/*
+ * Return true if bfqq is in a higher priority class, or has a higher
+ * weight than the in-service queue.
+ */
+static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
+ struct bfq_queue *in_serv_bfqq)
+{
+ int bfqq_weight, in_serv_weight;
+
+ if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
+ return true;
+
+ if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
+ bfqq_weight = bfqq->entity.weight;
+ in_serv_weight = in_serv_bfqq->entity.weight;
+ } else {
+ if (bfqq->entity.parent)
+ bfqq_weight = bfqq->entity.parent->weight;
+ else
+ bfqq_weight = bfqq->entity.weight;
+ if (in_serv_bfqq->entity.parent)
+ in_serv_weight = in_serv_bfqq->entity.parent->weight;
+ else
+ in_serv_weight = in_serv_bfqq->entity.weight;
+ }
+
+ return bfqq_weight > in_serv_weight;
+}
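
To make the cross-group case concrete (hypothetical weights, not taken from the patch): if bfqq belongs to a group whose entity weight is 200 while the in-service queue belongs to a different group with weight 100, the two parent weights are compared and the function returns true regardless of the per-queue weights inside each group; only when both queues share the same parent are their own weights compared directly.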
+
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
int old_wr_coeff,
@@ -1654,8 +1687,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
*/
bfqq_wants_to_preempt =
bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
- arrived_in_time,
- wr_or_deserves_wr);
+ arrived_in_time);
/*
* If bfqq happened to be activated in a burst, but has been
@@ -1720,21 +1752,111 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
/*
* Expire in-service queue only if preemption may be needed
- * for guarantees. In this respect, the function
- * next_queue_may_preempt just checks a simple, necessary
- * condition, and not a sufficient condition based on
- * timestamps. In fact, for the latter condition to be
- * evaluated, timestamps would need first to be updated, and
- * this operation is quite costly (see the comments on the
- * function bfq_bfqq_update_budg_for_activation).
+ * for guarantees. In particular, we care only about two
+ * cases. The first is that bfqq has to recover a service
+ * hole, as explained in the comments on
+ * bfq_bfqq_update_budg_for_activation(), i.e., that
+ * bfqq_wants_to_preempt is true. However, if bfqq does not
+ * carry time-critical I/O, then bfqq's bandwidth is less
+ * important than that of queues that carry time-critical I/O.
+ * So, as a further constraint, we consider this case only if
+ * bfqq is at least as weight-raised, i.e., at least as time
+ * critical, as the in-service queue.
+ *
+ * The second case is that bfqq is in a higher priority class,
+ * or has a higher weight than the in-service queue. If this
+ * condition does not hold, we don't care because, even if
+ * bfqq does not start to be served immediately, the resulting
+ * delay for bfqq's I/O is however lower or much lower than
+ * the ideal completion time to be guaranteed to bfqq's I/O.
+ *
+ * In both cases, preemption is needed only if, according to
+ * the timestamps of both bfqq and of the in-service queue,
+ * bfqq actually is the next queue to serve. So, to reduce
+ * useless preemptions, the return value of
+ * next_queue_may_preempt() is considered in the next compound
+ * condition too. Yet next_queue_may_preempt() just checks a
+ * simple, necessary condition for bfqq to be the next queue
+ * to serve. In fact, to evaluate a sufficient condition, the
+ * timestamps of the in-service queue would need to be
+ * updated, and this operation is quite costly (see the
+ * comments on bfq_bfqq_update_budg_for_activation()).
*/
- if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
- bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+ if (bfqd->in_service_queue &&
+ ((bfqq_wants_to_preempt &&
+ bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
+ bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
next_queue_may_preempt(bfqd))
bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
false, BFQQE_PREEMPTED);
}
+static void bfq_reset_inject_limit(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ /* invalidate baseline total service time */
+ bfqq->last_serv_time_ns = 0;
+
+ /*
+ * Reset pointer in case we are waiting for
+ * some request completion.
+ */
+ bfqd->waited_rq = NULL;
+
+ /*
+ * If bfqq has a short think time, then start by setting the
+ * inject limit to 0 prudentially, because the service time of
+ * an injected I/O request may be higher than the think time
+ * of bfqq, and therefore, if one request was injected when
+ * bfqq remains empty, this injected request might delay the
+ * service of the next I/O request for bfqq significantly. In
+ * case bfqq can actually tolerate some injection, then the
+ * adaptive update will however raise the limit soon. This
+ * lucky circumstance holds exactly because bfqq has a short
+ * think time, and thus, after remaining empty, is likely to
+ * get new I/O enqueued---and then completed---before being
+ * expired. This is the very pattern that gives the
+ * limit-update algorithm the chance to measure the effect of
+ * injection on request service times, and then to update the
+ * limit accordingly.
+ *
+ * However, in the following special case, the inject limit is
+ * left to 1 even if the think time is short: bfqq's I/O is
+ * synchronized with that of some other queue, i.e., bfqq may
+ * receive new I/O only after the I/O of the other queue is
+ * completed. Keeping the inject limit to 1 allows the
+ * blocking I/O to be served while bfqq is in service. And
+ * this is very convenient both for bfqq and for overall
+ * throughput, as explained in detail in the comments in
+ * bfq_update_has_short_ttime().
+ *
+ * On the opposite end, if bfqq has a long think time, then
+ * start directly with 1, because:
+ * a) on the bright side, keeping at most one request in
+ * service in the drive is unlikely to cause any harm to the
+ * latency of bfqq's requests, as the service time of a single
+ * request is likely to be lower than the think time of bfqq;
+ * b) on the downside, after becoming empty, bfqq is likely to
+ * expire before getting its next request. With this request
+ * arrival pattern, it is very hard to sample total service
+ * times and update the inject limit accordingly (see comments
+ * on bfq_update_inject_limit()). So the limit is likely to be
+ * never, or at least seldom, updated. As a consequence, by
+ * setting the limit to 1, we avoid the case in which no injection
+ * ever occurs with bfqq. On the downside, this proactive step
+ * further reduces chances to actually compute the baseline
+ * total service time. Thus it reduces chances to execute the
+ * limit-update algorithm and possibly raise the limit to more
+ * than 1.
+ */
+ if (bfq_bfqq_has_short_ttime(bfqq))
+ bfqq->inject_limit = 0;
+ else
+ bfqq->inject_limit = 1;
+
+ bfqq->decrease_time_jif = jiffies;
+}
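
As a rough illustration of the two starting points (hypothetical timings): a queue whose next request typically arrives ~50 us after the previous completion has a short think time, so even a single injected request with a ~200 us service time could visibly delay it; starting at 0 and letting the adaptive update raise the limit is the prudent choice. A queue that thinks for ~10 ms, by contrast, is barely affected by one in-flight injected request, so starting at 1 avoids a limit that would otherwise almost never be sampled and raised.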
+
static void bfq_add_request(struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -1749,77 +1871,119 @@ static void bfq_add_request(struct request *rq)
if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
/*
+ * Detect whether bfqq's I/O seems synchronized with
+ * that of some other queue, i.e., whether bfqq, after
+ * remaining empty, happens to receive new I/O only
+ * right after some I/O request of the other queue has
+ * been completed. We call waker queue the other
+ * queue, and we assume, for simplicity, that bfqq may
+ * have at most one waker queue.
+ *
+ * A remarkable throughput boost can be reached by
+ * unconditionally injecting the I/O of the waker
+ * queue, every time a new bfq_dispatch_request
+ * happens to be invoked while I/O is being plugged
+ * for bfqq. In addition to boosting throughput, this
+ * unblocks bfqq's I/O, thereby improving bandwidth
+ * and latency for bfqq. Note that these same results
+ * may be achieved with the general injection
+ * mechanism, but less effectively. For details on
+ * this aspect, see the comments on the choice of the
+ * queue for injection in bfq_select_queue().
+ *
+ * Turning back to the detection of a waker queue, a
+ * queue Q is deemed as a waker queue for bfqq if, for
+ * two consecutive times, bfqq happens to become non-empty
+ * right after a request of Q has been
+ * completed. In particular, on the first time, Q is
+ * tentatively set as a candidate waker queue, while
+ * on the second time, the flag
+ * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
+ * is a waker queue for bfqq. These detection steps
+ * are performed only if bfqq has a long think time,
+ * so as to make it more likely that bfqq's I/O is
+ * actually being blocked by a synchronization. This
+ * last filter, plus the above two-times requirement,
+ * make false positives less likely.
+ *
+ * NOTE
+ *
+ * The sooner a waker queue is detected, the sooner
+ * throughput can be boosted by injecting I/O from the
+ * waker queue. Fortunately, detection is likely to be
+ * actually fast, for the following reasons. While
+ * blocked by synchronization, bfqq has a long think
+ * time. This implies that bfqq's inject limit is at
+ * least equal to 1 (see the comments in
+ * bfq_update_inject_limit()). So, thanks to
+ * injection, the waker queue is likely to be served
+ * during the very first I/O-plugging time interval
+ * for bfqq. This triggers the first step of the
+ * detection mechanism. Thanks again to injection, the
+ * candidate waker queue is then likely to be
+ * confirmed no later than during the next
+ * I/O-plugging interval for bfqq.
+ */
+ if (!bfq_bfqq_has_short_ttime(bfqq) &&
+ ktime_get_ns() - bfqd->last_completion <
+ 200 * NSEC_PER_USEC) {
+ if (bfqd->last_completed_rq_bfqq != bfqq &&
+ bfqd->last_completed_rq_bfqq !=
+ bfqq->waker_bfqq) {
+ /*
+ * First synchronization detected with
+ * a candidate waker queue, or with a
+ * different candidate waker queue
+ * from the current one.
+ */
+ bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+
+ /*
+ * If the waker queue disappears, then
+ * bfqq->waker_bfqq must be reset. To
+ * this goal, we maintain in each
+ * waker queue a list, woken_list, of
+ * all the queues that reference the
+ * waker queue through their
+ * waker_bfqq pointer. When the waker
+ * queue exits, the waker_bfqq pointer
+ * of all the queues in the woken_list
+ * is reset.
+ *
+ * In addition, if bfqq is already in
+ * the woken_list of a waker queue,
+ * then, before being inserted into
+ * the woken_list of a new waker
+ * queue, bfqq must be removed from
+ * the woken_list of the old waker
+ * queue.
+ */
+ if (!hlist_unhashed(&bfqq->woken_list_node))
+ hlist_del_init(&bfqq->woken_list_node);
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqd->last_completed_rq_bfqq->woken_list);
+
+ bfq_clear_bfqq_has_waker(bfqq);
+ } else if (bfqd->last_completed_rq_bfqq ==
+ bfqq->waker_bfqq &&
+ !bfq_bfqq_has_waker(bfqq)) {
+ /*
+ * synchronization with waker_bfqq
+ * seen for the second time
+ */
+ bfq_mark_bfqq_has_waker(bfqq);
+ }
+ }
+
+ /*
* Periodically reset inject limit, to make sure that
* the latter eventually drops in case workload
* changes, see step (3) in the comments on
* bfq_update_inject_limit().
*/
if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
- msecs_to_jiffies(1000))) {
- /* invalidate baseline total service time */
- bfqq->last_serv_time_ns = 0;
-
- /*
- * Reset pointer in case we are waiting for
- * some request completion.
- */
- bfqd->waited_rq = NULL;
-
- /*
- * If bfqq has a short think time, then start
- * by setting the inject limit to 0
- * prudentially, because the service time of
- * an injected I/O request may be higher than
- * the think time of bfqq, and therefore, if
- * one request was injected when bfqq remains
- * empty, this injected request might delay
- * the service of the next I/O request for
- * bfqq significantly. In case bfqq can
- * actually tolerate some injection, then the
- * adaptive update will however raise the
- * limit soon. This lucky circumstance holds
- * exactly because bfqq has a short think
- * time, and thus, after remaining empty, is
- * likely to get new I/O enqueued---and then
- * completed---before being expired. This is
- * the very pattern that gives the
- * limit-update algorithm the chance to
- * measure the effect of injection on request
- * service times, and then to update the limit
- * accordingly.
- *
- * On the opposite end, if bfqq has a long
- * think time, then start directly by 1,
- * because:
- * a) on the bright side, keeping at most one
- * request in service in the drive is unlikely
- * to cause any harm to the latency of bfqq's
- * requests, as the service time of a single
- * request is likely to be lower than the
- * think time of bfqq;
- * b) on the downside, after becoming empty,
- * bfqq is likely to expire before getting its
- * next request. With this request arrival
- * pattern, it is very hard to sample total
- * service times and update the inject limit
- * accordingly (see comments on
- * bfq_update_inject_limit()). So the limit is
- * likely to be never, or at least seldom,
- * updated. As a consequence, by setting the
- * limit to 1, we avoid that no injection ever
- * occurs with bfqq. On the downside, this
- * proactive step further reduces chances to
- * actually compute the baseline total service
- * time. Thus it reduces chances to execute the
- * limit-update algorithm and possibly raise the
- * limit to more than 1.
- */
- if (bfq_bfqq_has_short_ttime(bfqq))
- bfqq->inject_limit = 0;
- else
- bfqq->inject_limit = 1;
- bfqq->decrease_time_jif = jiffies;
- }
+ msecs_to_jiffies(1000)))
+ bfq_reset_inject_limit(bfqd, bfqq);
/*
* The following conditions must hold to setup a new
@@ -2027,7 +2191,8 @@ static void bfq_remove_request(struct request_queue *q,
}
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+ unsigned int nr_segs)
{
struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
@@ -2050,7 +2215,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
bfqd->bio_bfqq = NULL;
bfqd->bio_bic = bic;
- ret = blk_mq_sched_try_merge(q, bio, &free);
+ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
if (free)
blk_mq_free_request(free);
@@ -2513,6 +2678,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
* to enjoy weight raising if split soon.
*/
bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;