summaryrefslogtreecommitdiffstats
path: root/block/bfq-iosched.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--block/bfq-iosched.c5047
1 files changed, 5047 insertions, 0 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644
index 000000000000..bd8499ef157c
--- /dev/null
+++ b/block/bfq-iosched.c
@@ -0,0 +1,5047 @@
+/*
+ * Budget Fair Queueing (BFQ) I/O scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ * Arianna Avanzini <avanzini@google.com>
+ *
+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BFQ is a proportional-share I/O scheduler, with some extra
+ * low-latency capabilities. BFQ also supports full hierarchical
+ * scheduling through cgroups. Next paragraphs provide an introduction
+ * on BFQ inner workings. Details on BFQ benefits, usage and
+ * limitations can be found in Documentation/block/bfq-iosched.txt.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based
+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns
+ * budgets, measured in number of sectors, to processes instead of
+ * time slices. The device is not granted to the in-service process
+ * for a given time slice, but until it has exhausted its assigned
+ * budget. This change from the time to the service domain enables BFQ
+ * to distribute the device throughput among processes as desired,
+ * without any distortion due to throughput fluctuations, or to device
+ * internal queueing. BFQ uses an ad hoc internal scheduler, called
+ * B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated with processes. Each
+ * process/queue is assigned a user-configurable weight, and B-WF2Q+
+ * guarantees that each queue receives a fraction of the throughput
+ * proportional to its weight. Thanks to the accurate policy of
+ * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
+ * processes issuing sequential requests (to boost the throughput),
+ * and yet guarantee a low latency to interactive and soft real-time
+ * applications.
+ *
+ * In particular, to provide these low-latency guarantees, BFQ
+ * explicitly privileges the I/O of two classes of time-sensitive
+ * applications: interactive and soft real-time. This feature enables
+ * BFQ to provide applications in these classes with a very low
+ * latency. Finally, BFQ also features additional heuristics for
+ * preserving both a low latency and a high throughput on NCQ-capable,
+ * rotational or flash-based devices, and for getting the job done
+ * quickly for applications consisting of many I/O-bound processes.
+ *
+ * BFQ is described in [1], where also a reference to the initial, more
+ * theoretical paper on BFQ can be found. The interested reader can find
+ * in the latter paper full details on the main algorithm, as well as
+ * formulas of the guarantees and formal proofs of all the properties.
+ * With respect to the version of BFQ presented in these papers, this
+ * implementation adds a few more heuristics, such as the one that
+ * guarantees a low latency to soft real-time applications, and a
+ * hierarchical extension based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
+ * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
+ * with O(log N) complexity derives from the one introduced with EEVDF
+ * in [3].
+ *
+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ * Scheduler", Proceedings of the First Workshop on Mobile System
+ * Technologies (MST-2015), May 2015.
+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
+ * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
+ * Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
+ * First: A Flexible and Accurate Mechanism for Proportional Share
+ * Resource Allocation", technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/sbitmap.h>
+#include <linux/delay.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+#include "bfq-iosched.h"
+
+/*
+ * Generate, for the queue flag BFQQF_<name>, three helpers operating on
+ * bfqq->flags:
+ *   bfq_mark_bfqq_<name>()  - set the flag, through __set_bit();
+ *   bfq_clear_bfqq_<name>() - clear the flag, through __clear_bit();
+ *   bfq_bfqq_<name>()       - return the result of test_bit() on the flag.
+ *
+ * The macro is only needed for the instantiations below, and is
+ * undefined right after them.
+ */
+#define BFQ_BFQQ_FNS(name)						\
+void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)			\
+{									\
+	__set_bit(BFQQF_##name, &(bfqq)->flags);			\
+}									\
+void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)			\
+{									\
+	__clear_bit(BFQQF_##name, &(bfqq)->flags);			\
+}									\
+int bfq_bfqq_##name(const struct bfq_queue *bfqq)			\
+{									\
+	return test_bit(BFQQF_##name, &(bfqq)->flags);			\
+}
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+/* No line continuation here: it would splice the next line into #undef. */
+#undef BFQ_BFQQ_FNS
+
+/*
+ * Expiration time of sync (0) and async (1) requests, in ns.
+ * The two entries amount to 250 ms (index 0) and 125 ms (index 1).
+ * NOTE(review): confirm the index <-> sync/async mapping against the
+ * code that indexes this array, which is not visible here.
+ */
+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
+
+/*
+ * Maximum backwards seek (magic number lifted from CFQ), in KiB.
+ * 16 * 1024 KiB, i.e., 16 MiB.
+ */
+static const int bfq_back_max = 16 * 1024;
+
+/*
+ * Penalty of a backwards seek, in number of sectors: multiplier
+ * applied to the distance of a request lying behind the head.
+ */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in ns (NSEC_PER_SEC / 125 is 8 ms). */
+static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
+
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
+
+/*
+ * Default timeout values, in jiffies, approximating CFQ defaults.
+ * HZ / 8 jiffies correspond to one eighth of a second.
+ */
+const int bfq_timeout = HZ / 8;
+
+/* Slab cache used by BFQ; presumably for bfq_queue objects -- confirm. */
+static struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ns), we consider thinktime immediate. */
+#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD	4
+#define BFQ_HW_QUEUE_SAMPLES	32
+
+/*
+ * Seek-related thresholds, expressed as sector_t values. Each expansion
+ * is fully parenthesized, so that it behaves as a single operand in any
+ * surrounding expression.
+ */
+#define BFQQ_SEEK_THR		((sector_t)(8 * 100))
+#define BFQQ_SECT_THR_NONROT	((sector_t)(2 * 32))
+#define BFQQ_CLOSE_THR		((sector_t)(8 * 1024))
+/*
+ * A queue is deemed seeky if more than 32/8 = 4 bits are set in its
+ * seek_history. The argument is parenthesized, so that any pointer
+ * expression can be passed safely.
+ */
+#define BFQQ_SEEKY(bfqq)	(hweight32((bfqq)->seek_history) > 32/8)
+
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES	32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT		16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ * SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
+ * rotational device, whereas R_slow[1]/R_fast[1] and
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
+ * non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes. The reference
+ * rates are not the actual peak rates of the devices used as a
+ * reference, but slightly lower values. The reason for using these
+ * slightly lower values is that the peak-rate estimator tends to
+ * yield slightly lower values than the actual peak rate (it can yield
+ * the actual peak rate only if there is only one process doing I/O,
+ * and the process does sequential I/O).
+ *
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+/*
+ * Reference peak rates: index 0 is for rotational devices, index 1 for
+ * non-rotational ones (see the comment above).
+ */
+static int R_slow[2] = {1000, 10700};
+static int R_fast[2] = {14000, 33000};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
+/*
+ * Accessors for the two elevator-private slots of a request: slot 0 is
+ * read as a pointer to the bfq_io_cq, slot 1 is returned as is (no cast)
+ * and is presumably the bfq_queue the request belongs to -- confirm
+ * against the code that fills elv.priv[].
+ */
+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
+
+/*
+ * Return the queue stored in @bic for the given kind of I/O: slot
+ * bfqq[1] if @is_sync is true, slot bfqq[0] otherwise.
+ */
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+	struct bfq_queue *queue = bic->bfqq[is_sync];
+
+	return queue;
+}
+
+/*
+ * Store @bfqq in @bic as the queue for the given kind of I/O: slot
+ * bfqq[1] if @is_sync is true, slot bfqq[0] otherwise.
+ */
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
+{
+	struct bfq_queue **slot = &bic->bfqq[is_sync];
+
+	*slot = bfqq;
+}
+
+/*
+ * Return the scheduler data reachable from @bic, i.e., the elevator_data
+ * of the elevator attached to the request queue of @bic's icq.
+ */
+struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+{
+	struct request_queue *q = bic->icq.q;
+
+	return q->elevator->elevator_data;
+}
+
+/**
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
+ * @icq: the iocontext queue, possibly %NULL.
+ *
+ * Return: the bfq_io_cq embedding @icq, or %NULL if @icq is %NULL.
+ *
+ * The %NULL case is handled explicitly, so that the conversion does not
+ * silently depend on icq being the first member of struct bfq_io_cq.
+ */
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+{
+	if (!icq)
+		return NULL;
+
+	return container_of(icq, struct bfq_io_cq, icq);
+}
+
+/**
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
+ * @bfqd: the lookup key.
+ * @ioc: the io_context of the process doing I/O, possibly %NULL.
+ * @q: the request queue.
+ *
+ * The lookup is performed with @q->queue_lock held and interrupts saved.
+ *
+ * Return: the result of converting the looked-up icq with icq_to_bic(),
+ * or %NULL if @ioc is %NULL.
+ */
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+					struct io_context *ioc,
+					struct request_queue *q)
+{
+	struct bfq_io_cq *bic;
+	unsigned long irq_flags;
+
+	if (!ioc)
+		return NULL;
+
+	spin_lock_irqsave(q->queue_lock, irq_flags);
+	bic = icq_to_bic(ioc_lookup_icq(ioc, q));
+	spin_unlock_irqrestore(q->queue_lock, irq_flags);
+
+	return bic;
+}
+
+/*
+ * Schedule a run of the hardware queues, but only if there are queued
+ * requests (i.e., if there are requests pending and no one in the
+ * driver that will restart queueing).
+ */
+void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+	/* Nothing queued: there is nothing to dispatch. */
+	if (bfqd->queued == 0)
+		return;
+
+	bfq_log(bfqd, "schedule dispatch");
+	blk_mq_run_hw_queues(bfqd->queue, true);
+}
+
+/* True if the I/O priority class of @bfqq is, respectively, idle or RT. */
+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+/* A statistic is deemed valid only if backed by more than 80 samples. */
+#define bfq_sample_valid(samples) ((samples) > 80)
+
+/*
+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
+ * We choose the request that is closest to the head right now. Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ *
+ * @last is the current head position. Either request may be NULL, in
+ * which case the other one is returned. A sync request is preferred to
+ * an async one and then, at the same sync status, a REQ_META request to
+ * a non-REQ_META one; positions are compared only if these criteria do
+ * not settle the choice.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+				      struct request *rq1,
+				      struct request *rq2,
+				      sector_t last)
+{
+	sector_t s1, s2, d1 = 0, d2 = 0;
+	unsigned long back_max;
+#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
+	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
+	bool sync1, sync2, meta1, meta2;
+
+	if (!rq1 || rq1 == rq2)
+		return rq2;
+	if (!rq2)
+		return rq1;
+
+	/* Sync requests win over async ones. */
+	sync1 = rq_is_sync(rq1);
+	sync2 = rq_is_sync(rq2);
+	if (sync1 != sync2)
+		return sync1 ? rq1 : rq2;
+
+	/* Metadata requests win over non-metadata ones. */
+	meta1 = !!(rq1->cmd_flags & REQ_META);
+	meta2 = !!(rq2->cmd_flags & REQ_META);
+	if (meta1 != meta2)
+		return meta1 ? rq1 : rq2;
+
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
+
+	/* bfq_back_max is in KiB and, by definition, 1KiB is 2 sectors. */
+	back_max = bfqd->bfq_back_max * 2;
+
+	/*
+	 * Strict one way elevator _except_ in the case where we allow
+	 * short backward seeks, whose distance is multiplied by
+	 * bfq_back_penalty with respect to a similar forward seek.
+	 * A request too far behind the head is flagged as wrapped.
+	 */
+	if (s1 >= last)
+		d1 = s1 - last;
+	else if (s1 + back_max >= last)
+		d1 = (last - s1) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ1_WRAP;
+
+	if (s2 >= last)
+		d2 = s2 - last;
+	else if (s2 + back_max >= last)
+		d2 = (last - s2) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ2_WRAP;
+
+	/* Only one request wrapped: serve the other one. */
+	if (wrap == BFQ_RQ2_WRAP)
+		return rq1;
+	if (wrap == BFQ_RQ1_WRAP)
+		return rq2;
+
+	if (wrap == 0) {
+		/*
+		 * Common case, no request wrapped: smallest distance
+		 * wins; at equal distance, the highest sector wins.
+		 */
+		if (d1 != d2)
+			return d1 < d2 ? rq1 : rq2;
+
+		return s1 >= s2 ? rq1 : rq2;
+	}
+
+	/*
+	 * Since both rqs are wrapped, start with the one that's further
+	 * behind head (--> only *one* back seek required), since back
+	 * seek takes more time than forward.
+	 */
+	return s1 <= s2 ? rq1 : rq2;
+}
+
+/*
+ * Look up, in the position tree @root, a queue whose next request starts
+ * exactly at @sector. The tree is sorted strictly by the sector of the
+ * next request of each queue: smallest to the left, largest to the right.
+ *
+ * On return, *@ret_parent holds the last node visited (the matching node
+ * itself on a hit, NULL if the tree is empty) and, if @rb_link is not
+ * NULL, *@rb_link holds the link slot at which the descent stopped.
+ *
+ * Return the matching queue, or NULL if there is none.
+ */
+static struct bfq_queue *
+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
+		       sector_t sector, struct rb_node **ret_parent,
+		       struct rb_node ***rb_link)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct bfq_queue *found = NULL;
+
+	while (*link) {
+		struct bfq_queue *cur;
+		sector_t cur_pos;
+
+		parent = *link;
+		cur = rb_entry(parent, struct bfq_queue, pos_node);
+		cur_pos = blk_rq_pos(cur->next_rq);
+
+		if (sector == cur_pos) {
+			found = cur;
+			break;
+		}
+
+		if (sector > cur_pos)
+			link = &parent->rb_right;
+		else
+			link = &parent->rb_left;
+	}
+
+	*ret_parent = parent;
+	if (rb_link)
+		*rb_link = link;
+
+	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
+		(unsigned long long)sector,
+		found ? found->pid : 0);
+
+	return found;
+}
+
+/*
+ * (Re)position @bfqq in the position tree of its group, according to the
+ * sector of its next request.
+ *
+ * The queue is first removed from the tree it currently belongs to, if
+ * any. It is then not reinserted if it is in the idle class, if it has
+ * no next request, or if the tree already contains a queue whose next
+ * request starts at the same sector; in all these cases bfqq->pos_root
+ * is left set to NULL.
+ */
+void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct rb_node **link, *parent;
+	struct bfq_queue *same_pos;
+
+	if (bfqq->pos_root) {
+		rb_erase(&bfqq->pos_node, bfqq->pos_root);
+		bfqq->pos_root = NULL;
+	}
+
+	if (bfq_class_idle(bfqq) || !bfqq->next_rq)
+		return;
+
+	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+	same_pos = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
+					  blk_rq_pos(bfqq->next_rq),
+					  &parent, &link);
+	if (same_pos) {
+		/* Position already taken: stay out of the tree. */
+		bfqq->pos_root = NULL;
+		return;
+	}
+
+	rb_link_node(&bfqq->pos_node, parent, link);
+	rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
+}
+
+/*
+ * Tell whether a weight-counter tree contains at least two nodes, i.e.,
+ * whether it is non-empty and its root has at least one child.
+ */
+static bool bfq_tree_has_several_nodes(struct rb_root *tree)
+{
+	if (RB_EMPTY_ROOT(tree))
+		return false;
+
+	return tree->rb_node->rb_left || tree->rb_node->rb_right;
+}
+
+/*
+ * Tell whether there are active queues or groups with differentiated weights.
+ *
+ * For weights to differ, at least one of the weight-counter trees must
+ * contain at least two nodes. The tree of the groups is taken into
+ * account only if CONFIG_BFQ_GROUP_IOSCHED is defined.
+ */
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+	if (bfq_tree_has_several_nodes(&bfqd->queue_weights_tree))
+		return true;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	if (bfq_tree_has_several_nodes(&bfqd->group_weights_tree))
+		return true;
+#endif
+
+	return false;
+}
+
+/*
+ * Return true if every queue must receive the same share of the
+ * throughput. This condition is used when deciding whether idling may
+ * be disabled (see the comments in the function bfq_bfqq_may_idle()).
+ *
+ * Strictly speaking, such a symmetric scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ *    number of children.
+ *
+ * Keeping the state needed to evaluate exactly these conditions would
+ * however be quite complex and time-consuming. So the following stronger
+ * sub-conditions, for which the needed state is much easier to
+ * maintain, are evaluated instead:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * The last two sub-conditions are always true if hierarchical support
+ * and the cgroups interface are not enabled, thus no state needs to be
+ * maintained in that case.
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+	bool asymmetric = bfq_differentiated_weights(bfqd);
+
+	return !asymmetric;
+}
+
+/*
+ * Account for the weight of @entity in the weight-counter tree @root: if
+ * the tree holds no counter for that weight, then allocate and insert
+ * one; in any case, increment the counter and associate it with @entity.
+ *
+ * Weight-counter trees contain few nodes in mostly symmetric scenarios.
+ * For example, if all queues have the same weight, then the tree for the
+ * queues may contain at most one node. This holds even if low_latency is
+ * on, because weight-raised queues are not inserted in the tree. In most
+ * scenarios, the rate at which nodes are created/destroyed should be low
+ * too.
+ */
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+			  struct rb_root *root)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct bfq_weight_counter *counter;
+
+	/*
+	 * Nothing to do if the entity is already associated with a
+	 * counter. This happens if the entity is associated with a
+	 * queue, and a request arrival has caused the queue to become
+	 * both non-weight-raised (hence to change its weight) and
+	 * backlogged: each of the two events causes an invocation of
+	 * this function, and the second invocation is useless.
+	 */
+	if (entity->weight_counter)
+		return;
+
+	/* Search for an existing counter for the weight of the entity. */
+	while (*link) {
+		counter = container_of(*link, struct bfq_weight_counter,
+				       weights_node);
+		parent = *link;
+
+		if (entity->weight == counter->weight) {
+			entity->weight_counter = counter;
+			counter->num_active++;
+			return;
+		}
+
+		if (entity->weight < counter->weight)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	counter = kzalloc(sizeof(*counter), GFP_ATOMIC);
+	entity->weight_counter = counter;
+
+	/*
+	 * In the unlucky event of an allocation failure, just exit. The
+	 * weight of entity is then not considered in
+	 * bfq_differentiated_weights, which may cause the scenario to be
+	 * wrongly deemed symmetric, if entity's weight would have been
+	 * the only one making it asymmetric. No unbalance occurs however
+	 * when entity becomes inactive again, because
+	 * bfq_weights_tree_remove does nothing if !entity->weight_counter.
+	 */
+	if (unlikely(!counter))
+		return;
+
+	counter->weight = entity->weight;
+	rb_link_node(&counter->weights_node, parent, link);
+	rb_insert_color(&counter->weights_node, root);
+	counter->num_active++;
+}
+
+/*
+ * Decrement the weight counter associated with @entity and, if the
+ * counter is no longer positive, remove it from the tree @root and free
+ * it. In both cases @entity is detached from the counter. Nothing is
+ * done if @entity has no counter.
+ * See the comments to the function bfq_weights_tree_add() for
+ * considerations about overhead.
+ */
+void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
+			     struct rb_root *root)
+{
+	struct bfq_weight_counter *counter = entity->weight_counter;
+
+	if (!counter)
+		return;
+
+	counter->num_active--;
+	if (!(counter->num_active > 0)) {
+		/* Last entity with this weight: drop the counter. */
+		rb_erase(&counter->weights_node, root);
+		kfree(counter);
+	}
+
+	entity->weight_counter = NULL;
+}
+
+/*
+ * Return the first request in the fifo of @bfqq if it has expired and
+ * differs from @last, or NULL to just start from scratch in rbtree.
+ *
+ * The check is skipped, and NULL is returned, if the fifo_expire flag of
+ * @bfqq is already set; otherwise the flag is set before checking.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
+				      struct request *last)
+{
+	struct request *oldest;
+
+	if (bfq_bfqq_fifo_expire(bfqq))
+		return NULL;
+
+	bfq_mark_bfqq_fifo_expire(bfqq);
+
+	oldest = rq_entry_fifo(bfqq->fifo.next);
+	if (oldest == last)
+		return NULL;
+
+	/* Not expired yet. */
+	if (ktime_get_ns() < oldest->fifo_time)
+		return NULL;
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", oldest);
+	return oldest;
+}
+
+/*
+ * Choose the request of @bfqq to serve after @last.
+ *
+ * An expired fifo request, if any, takes precedence. Otherwise the
+ * choice is made by bfq_choose_req() between the successor and the
+ * predecessor of @last in the sort tree of @bfqq; if @last has no
+ * successor, the first request of the tree is used in its place, unless
+ * that request is @last itself.
+ */
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq,
+					struct request *last)
+{
+	struct request *fwd, *back = NULL;
+	struct rb_node *node;
+
+	/* Follow expired path, else get first next available. */
+	fwd = bfq_check_fifo(bfqq, last);
+	if (fwd)
+		return fwd;
+
+	node = rb_prev(&last->rb_node);
+	if (node)
+		back = rb_entry_rq(node);
+
+	node = rb_next(&last->rb_node);
+	if (!node) {
+		/* No successor: wrap around to the first request. */
+		node = rb_first(&bfqq->sort_list);
+		if (node == &last->rb_node)
+			node = NULL;
+	}
+	if (node)
+		fwd = rb_entry_rq(node);
+
+	return bfq_choose_req(bfqd, fwd, back, blk_rq_pos(last));
+}
+
+/*
+ * Return the amount of service to charge for @rq. See the definition of
+ * bfq_async_charge_factor for details.
+ *
+ * Sync or weight-raised queues are charged just the sectors of the
+ * request. Any other queue is charged the sectors of the request
+ * amplified by the async charge factor or, if there are busy
+ * weight-raised queues, by twice that factor, to further reduce latency
+ * for weight-raised queues.
+ */
+static unsigned long bfq_serv_to_charge(struct request *rq,
+					struct bfq_queue *bfqq)
+{
+	if (!bfq_bfqq_sync(bfqq) && bfqq->wr_coeff <= 1) {
+		if (bfqq->bfqd->wr_busy_queues != 0)
+			return blk_rq_sectors(rq) * 2 *
+				bfq_async_charge_factor;
+
+		return blk_rq_sectors(rq) * bfq_async_charge_factor;
+	}
+
+	return blk_rq_sectors(rq);
+}
+
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update.
+ *
+ * If the first request of a queue changes we make sure that the queue
+ * has enough budget to serve at least its first request (if the
+ * request has grown). We do this because if the queue has not enough
+ * budget for its first request, it has to go through two dispatch
+ * rounds to actually get it dispatched.
+ *
+ * Nothing is done if the queue has no next request or is the queue in
+ * service: in order not to break guarantees, budgets cannot be changed
+ * after an entity has been selected.
+ */
+static void bfq_updated_next_req(struct bfq_data *bfqd,
+				 struct bfq_queue *bfqq)
+{
+	struct request *first_rq = bfqq->next_rq;
+	struct bfq_entity *ent = &bfqq->entity;
+	unsigned long budget;
+
+	if (!first_rq || bfqq == bfqd->in_service_queue)
+		return;
+
+	budget = max_t(unsigned long, bfqq->max_budget,
+		       bfq_serv_to_charge(first_rq, bfqq));
+	if (ent->budget == budget)
+		return;
+
+	ent->budget = budget;
+	bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+		     budget);
+	bfq_requeue_bfqq(bfqd, bfqq);
+}
+
+/*
+ * Restore into @bfqq the state saved in @bic: the idle_window and
+ * IO_bound flags, the think-time data and the weight-raising
+ * parameters. Weight raising is then switched off if @bfqq is in a
+ * large burst or if the restored weight-raising period has already
+ * elapsed.
+ */
+static void
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+	if (!bic->saved_idle_window)
+		bfq_clear_bfqq_idle_window(bfqq);
+	else
+		bfq_mark_bfqq_idle_window(bfqq);
+
+	if (!bic->saved_IO_bound)
+		bfq_clear_bfqq_IO_bound(bfqq);
+	else
+		bfq_mark_bfqq_IO_bound(bfqq);
+
+	bfqq->ttime = bic->saved_ttime;
+	bfqq->wr_coeff = bic->saved_wr_coeff;
+	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+
+	if (bfqq->wr_coeff > 1) {
+		if (bfq_bfqq_in_large_burst(bfqq) ||
+		    time_is_before_jiffies(bfqq->last_wr_start_finish +
+					   bfqq->wr_cur_max_time)) {
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				     "resume state: switching off wr");
+
+			bfqq->wr_coeff = 1;
+		}
+	}
+
+	/* make sure weight will be updated, however we got here */
+	bfqq->entity.prio_changed = 1;
+}
+
+/*
+ * Return the reference counter of @bfqq, minus the value of its
+ * allocated field and of the on_st field of its entity.
+ * NOTE(review): judging by the name, the result is presumably the
+ * number of references held by processes -- confirm against the code
+ * that takes and drops the references.
+ */
+static int bfqq_process_refs(struct bfq_queue *bfqq)
+{
+ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st;
+}
+
+/*
+ * Start a new burst made of just @bfqq: empty the burst list, add @bfqq
+ * to it, set the burst size to 1 and record the parent entity of @bfqq
+ * as the parent entity of the burst (see comments on bfq_handle_burst).
+ */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_queue *cur;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry_safe(cur, tmp, &bfqd->burst_list,
+				  burst_list_node) {
+		hlist_del_init(&cur->burst_list_node);
+	}
+
+	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+	bfqd->burst_size = 1;
+	bfqd->burst_parent_entity = bfqq->entity.parent;
+}
+
+/*
+ * Take @bfqq into account in the current burst (see bfq_handle_burst).
+ *
+ * The burst size is incremented. If it does not reach the large-burst
+ * threshold, then @bfqq is just added to the burst list. Otherwise the
+ * burst becomes large: all the queues in the burst list, as well as
+ * @bfqq, are marked as belonging to a large burst, and the burst list
+ * is emptied.
+ */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_queue *cur;
+	struct hlist_node *tmp;
+
+	/* Increment burst size to take into account also bfqq */
+	bfqd->burst_size++;
+
+	if (bfqd->burst_size != bfqd->bfq_large_burst_thresh) {
+		/*
+		 * Burst not yet large: add bfqq to the burst list. Do
+		 * not increment the ref counter for bfqq, because bfqq
+		 * is removed from the burst list before freeing bfqq
+		 * in put_queue.
+		 */
+		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+		return;
+	}
+
+	/*
+	 * Enough queues have been activated shortly after each other
+	 * to consider this burst as large.
+	 */
+	bfqd->large_burst = true;
+
+	/*
+	 * We can now mark all queues in the burst list, plus bfqq, as
+	 * belonging to a large burst.
+	 */
+	hlist_for_each_entry(cur, &bfqd->burst_list, burst_list_node) {
+		bfq_mark_bfqq_in_large_burst(cur);
+	}
+	bfq_mark_bfqq_in_large_burst(bfqq);
+
+	/*
+	 * From now on, and until the current burst finishes, any new
+	 * queue being activated shortly after the last queue was
+	 * inserted in the burst can be immediately marked as belonging
+	 * to a large burst. So the burst list is not needed any more.
+	 * Remove it.
+	 */
+	hlist_for_each_entry_safe(cur, tmp, &bfqd->burst_list,
+				  burst_list_node) {
+		hlist_del_init(&cur->burst_list_node);
+	}
+}
+
+/*
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues have typically a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
+ *
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
+ *
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
+ *
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start-up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we call just 'large' these bursts. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
+ *
+ * . when the very first queue is created, the queue is inserted into the
+ * list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ * not yet belong to the burst is activated shortly after the last time
+ * at which a new queue entered the burst list, then the function appends
+ * Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ * the large-burst threshold, then
+ *
+ * . all the queues in the burst list are marked as belonging to a
+ * large burst
+ *
+ * . the burst list is deleted; in fact, the burst list already served
+ * its purpose (keeping temporarily track of the queues in a burst,
+ * so as to be able to mark them as belonging to a large burst in the
+ * previous sub-step), and now is not needed any more
+ *
+ * . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is created while
+ * the device is in large-burst mode and shortly after the last time
+ * at which a queue either entered the burst list or was marked as
+ * belonging to the current large burst, then Q is immediately marked
+ * as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is created a while
+ * after, i.e., not shortly after, the last time at which a queue
+ * either entered the burst list or was marked as belonging to the
+ * current large burst, then the current burst is deemed finished and:
+ *
+ * . the large-burst mode is reset if set
+ *
+ * . the burst list is emptied
+ *
+ * . Q is inserted in the burst list, as Q may be the first queue
+ * in a possible new burst (then the burst list contains just Q
+ * after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/* Nothing to do if bfqq already sits in the burst list ... */
+	if (!hlist_unhashed(&bfqq->burst_list_node))
+		return;
+
+	/* ... or has already been marked as part of a large burst ... */
+	if (bfq_bfqq_in_large_burst(bfqq))
+		return;
+
+	/* ... or has just been split (within the last 10 ms). */
+	if (time_is_after_eq_jiffies(bfqq->split_time +
+				     msecs_to_jiffies(10)))
+		return;
+
+	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+				   bfqd->bfq_burst_interval) ||
+	    bfqq->entity.parent != bfqd->burst_parent_entity) {
+		/*
+		 * bfqq shows up too long after the last insertion in
+		 * the burst, or under a parent entity other than the
+		 * one of the burst: the current burst is over. Leave
+		 * large-burst mode and reset the burst list, so that
+		 * bfqq becomes the first queue of a possible new
+		 * burst.
+		 *
+		 * Special case: bfqq is the very first queue created
+		 * after BFQ is selected for this device. Then
+		 * last_ins_in_burst and burst_parent_entity are not
+		 * yet significant here. Yet, whichever way the
+		 * condition above evaluates, bfqq ends up being the
+		 * only queue in the burst list, which is exactly what
+		 * is wanted, as bfqq may be the first queue of the
+		 * first burst.
+		 */
+		bfqd->large_burst = false;
+		bfq_reset_burst_list(bfqd, bfqq);
+	} else if (bfqd->large_burst) {
+		/*
+		 * bfqq is being activated shortly after the last
+		 * queue, and the current burst is already large: mark
+		 * bfqq as belonging to the large burst right away.
+		 */
+		bfq_mark_bfqq_in_large_burst(bfqq);
+	} else {
+		/*
+		 * bfqq is being activated shortly after the last
+		 * queue, but the burst has not become large yet: just
+		 * add bfqq to the burst.
+		 */
+		bfq_add_to_burst(bfqd, bfqq);
+	}
+
+	/*
+	 * Whichever branch was taken, bfqq has either joined the
+	 * current burst or terminated it and become the first queue
+	 * of a possible new one. In both cases the time of the last
+	 * insertion in the burst must be moved forward.
+	 */
+	bfqd->last_ins_in_burst = jiffies;
+}
+
+/*
+ * Return how much budget bfqq has left, computed as the difference
+ * between the budget and the service fields of bfqq's entity.
+ */
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+	return bfqq->entity.budget - bfqq->entity.service;
+}
+
+/*
+ * Return the max budget currently in force. Until at least
+ * bfq_stats_min_budgets budgets have been assigned, not enough
+ * samples are available and the default max budget is returned;
+ * afterwards, return the max budget stored in bfqd, which is
+ * dynamically updated according to the estimated disk peak rate.
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+	return bfqd->budgets_assigned >= bfq_stats_min_budgets ?
+		bfqd->bfq_max_budget : bfq_default_max_budget;
+}
+
+/*
+ * Return the min budget, i.e., 1/32 (tentative value) of the max
+ * budget stored in bfqd once at least bfq_stats_min_budgets budgets
+ * have been assigned, and 1/32 of the default max budget before
+ * then.
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned >= bfq_stats_min_budgets)
+		return bfqd->bfq_max_budget / 32;
+
+	return bfq_default_max_budget / 32;
+}
+
+/*
+ * The next function, invoked after the input queue bfqq switches from
+ * idle to busy, updates the budget of bfqq. The function also tells
+ * whether the in-service queue should be expired, by returning
+ * true. The purpose of expiring the in-service queue is to give bfqq
+ * the chance to possibly preempt the in-service queue, and the reason
+ * for preempting the in-service queue is to achieve one of the two
+ * goals below.
+ *
+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
+ * expired because it has remained idle. In particular, bfqq may have
+ * expired for one of the following two reasons:
+ *
+ * - BFQQE_NO_MORE_REQUESTS bf