// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
*
* Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
*
* Meant to be mostly used for locally generated traffic :
* Fast classification depends on skb->sk being set before reaching us.
* If not, (router workload), we use rxhash as fallback, with 32 bits wide hash.
* All packets belonging to a socket are considered as a 'flow'.
*
* Flows are dynamically allocated and stored in a hash table of RB trees
* They are also part of one Round Robin 'queues' (new or old flows)
*
* Burst avoidance (aka pacing) capability :
*
* Transport (eg TCP) can set in sk->sk_pacing_rate a rate, enqueue a
* bunch of packets, and this packet scheduler adds delay between
* packets to respect rate limitation.
*
* enqueue() :
* - lookup one RB tree (out of 1024 or more) to find the flow.
* If non existent flow, create it, add it to the tree.
* Add skb to the per flow list of skb (fifo).
* - Use a special fifo for high prio packets
*
* dequeue() : serves flows in Round Robin
* Note : When a flow becomes empty, we do not immediately remove it from
* rb trees, for performance reasons (its expected to send additional packets,
* or SLAB cache will reuse socket for another flow)
*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/tcp.h>
struct fq_skb_cb {
u64 time_to_send;
};
static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
}
/*
* Per flow structure, dynamically allocated.
* If packets have monotically increasing time_to_send, they are placed in O(1)
* in linear list (head,tail), otherwise are placed in a rbtree (t_root).
*/
struct fq_flow {
struct rb_root t_root;
struct sk_buff *head; /* list of skbs for this flow : first skb */
union {
struct sk_buff *tail; /* last skb in the list */
unsigned long age; /* jiffies when flow was emptied, for gc */
};
struct rb_node fq_node; /* anchor in fq_root[] trees */
struct sock *sk;
int qlen; /* number of packets in flow queue */
int credit;
u32 socket_hash; /* sk_hash */
struct fq_flow *next; /* next pointer in RR lists, or &detached */
struct rb_node rate_node; /* anchor in q->delayed tree */
u64 time_next_packet;
};
struct fq_flow_head {
struct fq_flow *first;
struct fq_flow *last;
};
struct fq_sched_data {
struct fq_flow_head new_flows;
struct fq_flow_head old_flows;
struct rb_root delayed; /* for rate limited flows */
u64 time_next_delayed_flow;
unsigned long unthrottle_latency_ns;
struct fq_flow internal; /* for non classified or high prio packets */
u32 quantum;
u32 initial_quantum;
u32 flow_refill_delay;
u32 flow_plimit; /* max packets per flow */
unsigned long flow_max_rate; /* optional max rate per flow */
u64 ce_threshold;
u32 orphan_mask; /* mask for orphaned skb */
u32 low_rate_threshold;
struct rb_root *fq_root;
u8 rate_enable;
u8 fq_trees_log;
u32 flows;
u32 inactive_flows;
u32 throttled_flows;
u64 stat_gc_flows;
u64 stat_internal_packets;
u64 stat_throttled;
u64 stat_ce_mark;
u64 stat_flows_plimit;
u64 stat_pkts_too_long;
u64 stat_allocation_errors;
struct qdisc_watchdog watchdog;
};
/* special value to mark a detached flow (not on old/new list) */
static struct fq_flow detached, throttled;
static void fq_flow_set_detached(struct fq_flow *f)
{
f->next = &detached;
f->age = jiffies;
}
static bool fq_flow_is_detached(const struct fq_flow *f)
{
return f->next == &detached;
}
static bool fq_flow_is_throttled(const struct fq_flow *f)
{
return f->next == &throttled;
}
static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
{
if (head->first)
head->last->next = flow;
else
head->first = flow;
head->last = flow;
flow->next = NULL;
}
static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
rb_erase(&f->rate_node, &q->delayed);
q->throttled_flows--;
fq_flow_add_tail(&q->old_flows, f);
}
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
struct rb_node **p = &q->delayed.rb_node, *parent = NULL;
while (*p) {
struct fq_flow *aux;
parent = *p;
aux = rb_entry(parent, struct fq_flow, rate_node);