// SPDX-License-Identifier: GPL-2.0
/*
* The Kyber I/O scheduler. Controls latency by throttling queue depths using
* scalable techniques.
*
* Copyright (C) 2017 Facebook
*/
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#define CREATE_TRACE_POINTS
#include <trace/events/kyber.h>
/*
* Scheduling domains: the device is divided into multiple domains based on the
* request type.
*/
enum {
KYBER_READ,
KYBER_WRITE,
KYBER_DISCARD,
KYBER_OTHER,
KYBER_NUM_DOMAINS,
};
static const char *kyber_domain_names[] = {
[KYBER_READ] = "READ",
[KYBER_WRITE] = "WRITE",
[KYBER_DISCARD] = "DISCARD",
[KYBER_OTHER] = "OTHER",
};
enum {
/*
* In order to prevent starvation of synchronous requests by a flood of
* asynchronous requests, we reserve 25% of requests for synchronous
* operations.
*/
KYBER_ASYNC_PERCENT = 75,
};
/*
* Maximum device-wide depth for each scheduling domain.
*
* Even for fast devices with lots of tags like NVMe, you can saturate the
* device with only a fraction of the maximum possible queue depth. So, we cap
* these to a reasonable value.
*/
static const unsigned int kyber_depth[] = {
[KYBER_READ] = 256,
[KYBER_WRITE] = 128,
[KYBER_DISCARD] = 64,
[KYBER_OTHER] = 16,
};
/*
* Default latency targets for each scheduling domain.
*/
static const u64 kyber_latency_targets[] = {
[KYBER_READ] = 2ULL * NSEC_PER_MSEC,
[KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
[KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
};
/*
* Batch size (number of requests we'll dispatch in a row) for each scheduling
* domain.
*/
static const unsigned int kyber_batch_size[] = {
[KYBER_READ] = 16,
[KYBER_WRITE] = 8,
[KYBER_DISCARD] = 1,
[KYBER_OTHER] = 1,
};
/*
* Requests latencies are recorded in a histogram with buckets defined relative
* to the target latency:
*
* <= 1/4 * target latency
* <= 1/2 * target latency
* <= 3/4 * target latency
* <= target latency
* <= 1 1/4 * target latency
* <= 1 1/2 * target latency
* <= 1 3/4 * target latency
* > 1 3/4 * target latency
*/
enum {
/*
* The width of the latency histogram buckets is
* 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
*/
KYBER_LATENCY_SHIFT = 2,
/*
* The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
* thus, "good".
*/
KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
/* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
};
/*
* We measure both the total latency and the I/O latency (i.e., latency after
* submitting to the device).
*/
enum {
KYBER_TOTAL_LATENCY,
KYBER_IO_LATENCY,
};
static const char *kyber_latency_type_names[] = {
[KYBER_TOTAL_LATENCY] = "total",
[KYBER_IO_LATENCY] = "I/O",
};
/*
* Per-cpu latency histograms: total latency and I/O latency for each scheduling
* domain except for KYBER_OTHER.
*/
struct kyber_cpu_latency {
atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
};
/*
* There is a same mapping between ctx & hctx and kcq & khd,
* we use request->mq_ctx->index_hw to index the kcq in khd.
*/
struct kyber_ctx_queue {
/*
* Used to ensure operations on rq_list and kcq_map to be an atmoic one.
* Also protect the rqs on rq_list when merge.
*/
spinlock_t lock;
struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;
struct kyber_queue_data {
struct request_queue *q;
/*
* Each scheduling domain has a limited number of in-flight requests
* device-wide, limited by these tokens.
*/
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
/*
* Async request percentage, converted to per-word depth for
* sbitmap_get_shallow().
*/
unsigned int async_depth;
struct kyber_cpu_latency __percpu *cpu_latency;
/* Timer for stats aggregation and adjusting domain tokens. */
struct timer_list timer;
unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
unsigned long latency_timeout[KYBER_OTHER];
int domain_p99[KYBER_OTHER];
/* Target latencies in nanoseconds. */
u64 latency_targets[KYBER_OTHER];
};
struct kyber_hctx_data {
spinlock_t lock;
struct list_head rqs[KYBER_NUM_DOMAINS];
unsigned int cur_domain;
unsigned int batching;
struct kyber_ctx_queue *kcqs;
struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
atomic_t wait_index[KYBER_NUM_DOMAINS];
};
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key);
static unsigned int kyber_sched_domain(unsigned int op)
{
switch (op & REQ_OP_MASK) {
case REQ_OP_READ:
return KYBER_READ;
<