/*
* The Kyber I/O scheduler. Controls latency by throttling queue depths using
* scalable techniques.
*
* Copyright (C) 2017 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
/* Scheduling domains. */
enum {
KYBER_READ,
KYBER_SYNC_WRITE,
KYBER_OTHER, /* Async writes, discard, etc. */
KYBER_NUM_DOMAINS,
};
enum {
KYBER_MIN_DEPTH = 256,
/*
* In order to prevent starvation of synchronous requests by a flood of
* asynchronous requests, we reserve 25% of requests for synchronous
* operations.
*/
KYBER_ASYNC_PERCENT = 75,
};
/*
* Initial device-wide depths for each scheduling domain.
*
* Even for fast devices with lots of tags like NVMe, you can saturate
* the device with only a fraction of the maximum possible queue depth.
* So, we cap these to a reasonable value.
*/
static const unsigned int kyber_depth[] = {
[KYBER_READ] = 256,
[KYBER_SYNC_WRITE] = 128,
[KYBER_OTHER] = 64,
};
/*
* Scheduling domain batch sizes. We favor reads.
*/
static const unsigned int kyber_batch_size[] = {
[KYBER_READ] = 16,
[KYBER_SYNC_WRITE] = 8,
[KYBER_OTHER] = 8,
};
/*
* There is a same mapping between ctx & hctx and kcq & khd,
* we use request->mq_ctx->index_hw to index the kcq in khd.
*/
struct kyber_ctx_queue {
/*
* Used to ensure operations on rq_list and kcq_map to be an atmoic one.
* Also protect the rqs on rq_list when merge.
*/
spinlock_t lock;
struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;
struct kyber_queue_data {
struct request_queue *q;
struct blk_stat_callback *cb;
/*
* The device is divided into multiple scheduling domains based on the
* request type. Each domain has a fixed number of in-flight requests of
* that type device-wide, limited by these tokens.
*/
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
/*
* Async request percentage, converted to per-word depth for
* sbitmap_get_shallow().
*/
unsigned int async_depth;
/* Target latencies in nanoseconds. */
u64 read_lat_nsec, write_lat_nsec;
};
struct kyber_hctx_data {
spinlock_t lock;
struct list_head rqs[KYBER_NUM_DOMAINS];
unsigned int cur_domain;
unsigned int batching;
struct kyber_ctx_queue *kcqs;
struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
atomic_t wait_index[KYBER_NUM_DOMAINS];
};
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key);
static unsigned int kyber_sched_domain(unsigned int op)
{
if ((op & REQ_OP_MASK) == REQ_OP_READ)
return KYBER_READ;
else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
return KYBER_SYNC_WRITE;
else
return KYBER_OTHER;
}
enum {
NONE = 0,
GOOD = 1,
GREAT = 2,
BAD = -1,
AWFUL = -2,
};
#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)
static int kyber_lat_status(struct blk_stat_callback *cb,
unsigned int sched_domain, u64 target)
{
u64 latency;
if (!cb->stat[sched_domain].nr_samples)
return NONE;
latency = cb->stat[sched_domain].mean;
if (latency >= 2 * target)
return AWFUL;
else if (latency > target)
return BAD;
else if (latency <= target / 2)
return GREAT;
else /* (latency <= target) */
return GOOD;
}
/*
* Adjust the read or synchronous write depth given the status of reads and
* writes. The goal is that the latencies of the two domains are fair (i.e., if
* one is good, then the other is good).
*/
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
unsigned int sched_domain, int this_status,
int other_status)
{
unsigned int orig_depth, depth;
/*
* If this domain had no samples, or reads and writes are both good or
* both bad, don't adjust the depth.
*/
if (this_status == NONE ||
(IS_GOOD(this_status) && IS_GOOD(other_status)) ||
(IS_BAD(this_status) && IS_BAD(other_status)))
return;
orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;
if (other_status == NONE) {
depth++;
} else {
switch (this_status) {
case GOOD:
if (other_status == AWFUL)
depth