Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/sched/sch_cbq.c
1 files changed, 2124 insertions, 0 deletions
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
new file mode 100644
index 000000000000..d43e3b8cbf6a
--- /dev/null
+++ b/net/sched/sch_cbq.c
@@ -0,0 +1,2124 @@
+/*
+ * net/sched/sch_cbq.c	Class-Based Queueing discipline.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/notifier.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+
+/*	Class-Based Queueing (CBQ) algorithm.
+	=======================================
+
+	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
+	         Management Models for Packet Networks",
+		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
+
+	         [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
+
+	         [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
+		 Parameters", 1996
+
+		 [4] Sally Floyd and Michael Speer, "Experimental Results
+		 for Class-Based Queueing", 1998, not published.
+
+	-----------------------------------------------------------------------
+
+	Algorithm skeleton was taken from NS simulator cbq.cc.
+	If someone wants to check this code against the LBL version,
+	he should take into account that ONLY the skeleton was borrowed,
+	the implementation is different. Particularly:
+
+	--- The WRR algorithm is different. Our version looks more
+        reasonable (I hope) and works when quanta are allowed to be
+        less than MTU, which is always the case when real time classes
+        have small rates. Note, that the statement of [3] is
+        incomplete, delay may actually be estimated even if class
+        per-round allotment is less than MTU. Namely, if per-round
+        allotment is W*r_i, and r_1+...+r_k = r < 1
+
+	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
+
+	In the worst case we have IntServ estimate with D = W*r+k*MTU
+	and C = MTU*r. The proof (if correct at all) is trivial.
+
+
+	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
+	interpret some places, which look like wrong translations
+	from NS. Anyone is advised to find these differences
+	and explain to me, why I am wrong 8).
+
+	--- Linux has no EOI event, so that we cannot estimate true class
+	idle time. Workaround is to consider the next dequeue event
+	as sign that previous packet is finished. This is wrong because of
+	internal device queueing, but on a permanently loaded link it is true.
+	Moreover, combined with clock integrator, this scheme looks
+	very close to an ideal solution.  */
+
+struct cbq_sched_data;
+
+
+struct cbq_class
+{
+	struct cbq_class	*next;		/* hash table link */
+	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
+
+/* Parameters */
+	u32			classid;
+	unsigned char		priority;	/* class priority */
+	unsigned char		priority2;	/* priority to be used after overlimit */
+	unsigned char		ewma_log;	/* time constant for idle time calculation */
+	unsigned char		ovl_strategy;
+#ifdef CONFIG_NET_CLS_POLICE
+	unsigned char		police;
+#endif
+
+	u32			defmap;
+
+	/* Link-sharing scheduler parameters */
+	long			maxidle;	/* Class parameters: see below. */
+	long			offtime;
+	long			minidle;
+	u32			avpkt;
+	struct qdisc_rate_table	*R_tab;
+
+	/* Overlimit strategy parameters */
+	void			(*overlimit)(struct cbq_class *cl);
+	long			penalty;
+
+	/* General scheduler (WRR) parameters */
+	long			allot;
+	long			quantum;	/* Allotment per WRR round */
+	long			weight;		/* Relative allotment: see below */
+
+	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
+	struct cbq_class	*split;		/* Ptr to split node */
+	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
+	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
+	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
+						   parent otherwise */
+	struct cbq_class	*sibling;	/* Sibling chain */
+	struct cbq_class	*children;	/* Pointer to children chain */
+
+	struct Qdisc		*q;		/* Elementary queueing discipline */
+
+
+/* Variables */
+	unsigned char		cpriority;	/* Effective priority */
+	unsigned char		delayed;
+	unsigned char		level;		/* level of the class in hierarchy:
+						   0 for leaf classes, and maximal
+						   level of children + 1 for nodes.
+						 */
+
+	psched_time_t		last;		/* Last end of service */
+	psched_time_t		undertime;
+	long			avgidle;
+	long			deficit;	/* Saved deficit for WRR */
+	unsigned long		penalized;
+	struct gnet_stats_basic bstats;
+	struct gnet_stats_queue qstats;
+	struct gnet_stats_rate_est rate_est;
+	spinlock_t		*stats_lock;
+	struct tc_cbq_xstats	xstats;
+
+	struct tcf_proto	*filter_list;
+
+	int			refcnt;
+	int			filters;
+
+	struct cbq_class 	*defaults[TC_PRIO_MAX+1];
+};
+
+struct cbq_sched_data
+{
+	struct cbq_class	*classes[16];		/* Hash table of all classes */
+	int			nclasses[TC_CBQ_MAXPRIO+1];
+	unsigned		quanta[TC_CBQ_MAXPRIO+1];
+
+	struct cbq_class	link;
+
+	unsigned		activemask;
+	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes
+								   with backlog */
+
+#ifdef CONFIG_NET_CLS_POLICE
+	struct cbq_class	*rx_class;
+#endif
+	struct cbq_class	*tx_class;
+	struct cbq_class	*tx_borrowed;
+	int			tx_len;
+	psched_time_t		now;		/* Cached timestamp */
+	psched_time_t		now_rt;		/* Cached real time */
+	unsigned		pmask;
+
+	struct timer_list	delay_timer;
+	struct timer_list	wd_timer;	/* Watchdog timer,
+						   started when CBQ has
+						   backlog, but cannot
+						   transmit just now */
+	long			wd_expires;
+	int			toplevel;
+	u32			hgenerator;
+};
+
+
+#define L2T(cl,len)	((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
+
+
+static __inline__ unsigned cbq_hash(u32 h)
+{
+	h ^= h>>8;
+	h ^= h>>4;
+	return h&0xF;
+}
+
+static __inline__ struct cbq_class *
+cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
+{
+	struct cbq_class *cl;
+
+	for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
+		if (cl->classid == classid)
+			return cl;
+	return NULL;
+}
+
+#ifdef CONFIG_NET_CLS_POLICE
+
+static struct cbq_class *
+cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
+{
+	struct cbq_class *cl, *new;
+
+	for (cl = this->tparent; cl; cl = cl->tparent)
+		if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
+			return new;
+
+	return NULL;
+}
+
+#endif
+
+/* Classify packet. The procedure is pretty complicated, but
+   it allows us to combine link sharing and priority scheduling
+   transparently.
+
+   Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
+   so that it resolves to split nodes. Then packets are classified
+   by logical priority, or a more specific classifier may be attached
+   to the split node.
+ */
+
+static struct cbq_class *
+cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *head = &q->link;
+	struct cbq_class **defmap;
+	struct cbq_class *cl = NULL;
+	u32 prio = skb->priority;
+	struct tcf_result res;
+
+	/*
+	 *  Step 1. If skb->priority points to one of our classes, use it.
+	 */
+	if (TC_H_MAJ(prio^sch->handle) == 0 &&
+	    (cl = cbq_class_lookup(q, prio)) != NULL)
+		return cl;
+
+	*qerr = NET_XMIT_DROP;
+	for (;;) {
+		int result = 0;
+		defmap = head->defaults;
+
+		/*
+		 * Step 2+n. Apply classifier.
+		 */
+		if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
+			goto fallback;
+
+		if ((cl = (void*)res.class) == NULL) {
+			if (TC_H_MAJ(res.classid))
+				cl = cbq_class_lookup(q, res.classid);
+			else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
+				cl = defmap[TC_PRIO_BESTEFFORT];
+
+			if (cl == NULL || cl->level >= head->level)
+				goto fallback;
+		}
+
+#ifdef CONFIG_NET_CLS_ACT
+		switch (result) {
+		case TC_ACT_QUEUED:
+		case TC_ACT_STOLEN: 
+			*qerr = NET_XMIT_SUCCESS;
+		case TC_ACT_SHOT:
+			return NULL;
+		}
+#elif defined(CONFIG_NET_CLS_POLICE)
+		switch (result) {
+		case TC_POLICE_RECLASSIFY:
+			return cbq_reclassify(skb, cl);
+		case TC_POLICE_SHOT:
+			return NULL;
+		default:
+			break;
+		}
+#endif
+		if (cl->level == 0)
+			return cl;
+
+		/*
+		 * Step 3+n. If classifier selected a link sharing class,
+		 *	   apply agency specific classifier.
+		 *	   Repeat this procdure until we hit a leaf node.
+		 */
+		head = cl;
+	}
+
+fallback:
+	cl = head;
+
+	/*
+	 * Step 4. No success...
+	 */
+	if (TC_H_MAJ(prio) == 0 &&
+	    !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
+	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
+		return head;
+
+	return cl;
+}
+
+/*
+   A packet has just been enqueued on the empty class.
+   cbq_activate_class adds it to the tail of active class list
+   of its priority band.
+ */
+
+static __inline__ void cbq_activate_class(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	int prio = cl->cpriority;
+	struct cbq_class *cl_tail;
+
+	cl_tail = q->active[prio];
+	q->active[prio] = cl;
+
+	if (cl_tail != NULL) {
+		cl->next_alive = cl_tail->next_alive;
+		cl_tail->next_alive = cl;
+	} else {
+		cl->next_alive = cl;
+		q->activemask |= (1<<prio);
+	}
+}
+
+/*
+   Unlink class from active chain.
+   Note that this same procedure is done directly in cbq_dequeue*
+   during round-robin procedure.
+ */
+
+static void cbq_deactivate_class(struct cbq_class *this)
+{
+	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+	int prio = this->cpriority;
+	struct cbq_class *cl;
+	struct cbq_class *cl_prev = q->active[prio];
+
+	do {
+		cl = cl_prev->next_alive;
+		if (cl == this) {
+			cl_prev->next_alive = cl->next_alive;
+			cl->next_alive = NULL;
+
+			if (cl == q->active[prio]) {
+				q->active[prio] = cl_prev;
+				if (cl == q->active[prio]) {
+					q->active[prio] = NULL;
+					q->activemask &= ~(1<<prio);
+					return;
+				}
+			}
+
+			cl = cl_prev->next_alive;
+			return;
+		}
+	} while ((cl_prev = cl) != q->active[prio]);
+}
+
+static void
+cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
+{
+	int toplevel = q->toplevel;
+
+	if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
+		psched_time_t now;
+		psched_tdiff_t incr;
+
+		PSCHED_GET_TIME(now);
+		incr = PSCHED_TDIFF(now, q->now_rt);
+		PSCHED_TADD2(q->now, incr, now);
+
+		do {
+			if (PSCHED_TLESS(cl->undertime, now)) {
+				q->toplevel = cl->level;
+				return;
+			}
+		} while ((cl=cl->borrow) != NULL && toplevel > cl->level);
+	}
+}
+
+static int
+cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	int len = skb->len;
+	int ret;
+	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
+
+#ifdef CONFIG_NET_CLS_POLICE
+	q->rx_class = cl;
+#endif
+	if (cl == NULL) {
+		if (ret == NET_XMIT_DROP)
+			sch->qstats.drops++;
+		kfree_skb(skb);
+		return ret;
+	}
+
+#ifdef CONFIG_NET_CLS_POLICE
+	cl->q->__parent = sch;
+#endif
+	if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
+		sch->q.qlen++;
+		sch->bstats.packets++;
+		sch->bstats.bytes+=len;
+		cbq_mark_toplevel(q, cl);
+		if (!cl->next_alive)
+			cbq_activate_class(cl);
+		return ret;
+	}
+
+	sch->qstats.drops++;
+	cbq_mark_toplevel(q, cl);
+	cl->qstats.drops++;
+	return ret;
+}
+
+static int
+cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl;
+	int ret;
+
+	if ((cl = q->tx_class) == NULL) {
+		kfree_skb(skb);
+		sch->qstats.drops++;
+		return NET_XMIT_CN;
+	}
+	q->tx_class = NULL;
+
+	cbq_mark_toplevel(q, cl);
+
+#ifdef CONFIG_NET_CLS_POLICE
+	q->rx_class = cl;
+	cl->q->__parent = sch;
+#endif
+	if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
+		sch->q.qlen++;
+		sch->qstats.requeues++;
+		if (!cl->next_alive)
+			cbq_activate_class(cl);
+		return 0;
+	}
+	sch->qstats.drops++;
+	cl->qstats.drops++;
+	return ret;
+}
+
+/* Overlimit actions */
+
+/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
+
+static void cbq_ovl_classic(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
+
+	if (!cl->delayed) {
+		delay += cl->offtime;
+
+		/* 
+		   Class goes to sleep, so that it will have no
+		   chance to work avgidle. Let's forgive it 8)
+
+		   BTW cbq-2.0 has a crap in this
+		   place, apparently they forgot to shift it by cl->ewma_log.
+		 */
+		if (cl->avgidle < 0)
+			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
+		if (cl->avgidle < cl->minidle)
+			cl->avgidle = cl->minidle;
+		if (delay <= 0)
+			delay = 1;
+		PSCHED_TADD2(q->now, delay, cl->undertime);
+
+		cl->xstats.overactions++;
+		cl->delayed = 1;
+	}
+	if (q->wd_expires == 0 || q->wd_expires > delay)
+		q->wd_expires = delay;
+
+	/* Dirty work! We must schedule wakeups based on
+	   real available rate, rather than leaf rate,
+	   which may be tiny (even zero).
+	 */
+	if (q->toplevel == TC_CBQ_MAXLEVEL) {
+		struct cbq_class *b;
+		psched_tdiff_t base_delay = q->wd_expires;
+
+		for (b = cl->borrow; b; b = b->borrow) {
+			delay = PSCHED_TDIFF(b->undertime, q->now);
+			if (delay < base_delay) {
+				if (delay <= 0)
+					delay = 1;
+				base_delay = delay;
+			}
+		}
+
+		q->wd_expires = base_delay;
+	}
+}
+
+/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
+   they go overlimit
+ */
+
+static void cbq_ovl_rclassic(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	struct cbq_class *this = cl;
+
+	do {
+		if (cl->level > q->toplevel) {
+			cl = NULL;
+			break;
+		}
+	} while ((cl = cl->borrow) != NULL);
+
+	if (cl == NULL)
+		cl = this;
+	cbq_ovl_classic(cl);
+}
+
+/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
+
+static void cbq_ovl_delay(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
+
+	if (!cl->delayed) {
+		unsigned long sched = jiffies;
+
+		delay += cl->offtime;
+		if (cl->avgidle < 0)
+			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
+		if (cl->avgidle < cl->minidle)
+			cl->avgidle = cl->minidle;
+		PSCHED_TADD2(q->now, delay, cl->undertime);
+
+		if (delay > 0) {
+			sched += PSCHED_US2JIFFIE(delay) + cl->penalty;
+			cl->penalized = sched;
+			cl->cpriority = TC_CBQ_MAXPRIO;
+			q->pmask |= (1<<TC_CBQ_MAXPRIO);
+			if (del_timer(&q->delay_timer) &&
+			    (long)(q->delay_timer.expires - sched) > 0)
+				q->delay_timer.expires = sched;
+			add_timer(&q->delay_timer);
+			cl->delayed = 1;
+			cl->xstats.overactions++;
+			return;
+		}
+		delay = 1;
+	}
+	if (q->wd_expires == 0 || q->wd_expires > delay)
+		q->wd_expires = delay;
+}
+
+/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
+
+static void cbq_ovl_lowprio(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+
+	cl->penalized = jiffies + cl->penalty;
+
+	if (cl->cpriority != cl->priority2) {
+		cl->cpriority = cl->priority2;
+		q->pmask |= (1<<cl->cpriority);
+		cl->xstats.overactions++;
+	}
+	cbq_ovl_classic(cl);
+}
+
+/* TC_CBQ_OVL_DROP: penalize class by dropping */
+
+static void cbq_ovl_drop(struct cbq_class *cl)
+{
+	if (cl->q->ops->drop)
+		if (cl->q->ops->drop(cl->q))
+			cl->qdisc->q.qlen--;
+	cl->xstats.overactions++;
+	cbq_ovl_classic(cl);
+}
+
+static void cbq_watchdog(unsigned long arg)
+{
+	struct Qdisc *sch = (struct Qdisc*)arg;
+
+	sch->flags &= ~TCQ_F_THROTTLED;
+	netif_schedule(sch->dev);
+}
+
+static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio)
+{
+	struct cbq_class *cl;
+	struct cbq_class *cl_prev = q->active[prio];
+	unsigned long now = jiffies;
+	unsigned long sched = now;
+
+	if (cl_prev == NULL)
+		return now;
+
+	do {
+		cl = cl_prev->next_alive;
+		if ((long)(now - cl->penalized) > 0) {
+			cl_prev->next_alive = cl->next_alive;
+			cl->next_alive = NULL;
+			cl->cpriority = cl->priority;
+			cl->delayed = 0;
+			cbq_activate_class(cl);
+
+			if (cl == q->active[prio]) {
+				q->active[prio] = cl_prev;
+				if (cl == q->active[prio]) {
+					q->active[prio] = NULL;
+					return 0;
+				}
+			}
+
+			cl = cl_prev->next_alive;
+		} else if ((long)(sched - cl->penalized) > 0)
+			sched = cl->penalized;
+	} while ((cl_prev = cl) != q->active[prio]);
+
+	return (long)(sched - now);
+}
+
+static void cbq_undelay(unsigned long arg)
+{
+	struct Qdisc *sch = (struct Qdisc*)arg;
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	long delay = 0;
+	unsigned pmask;
+
+	pmask = q->pmask;
+	q->pmask = 0;
+
+	while (pmask) {
+		int prio = ffz(~pmask);
+		long tmp;
+
+		pmask &= ~(1<<prio);
+
+		tmp = cbq_undelay_prio(q, prio);
+		if (tmp > 0) {
+			q->pmask |= 1<<prio;
+			if (tmp < delay || delay == 0)
+				delay = tmp;
+		}
+	}
+
+	if (delay) {
+		q->delay_timer.expires = jiffies + delay;
+		add_timer(&q->delay_timer);
+	}
+
+	sch->flags &= ~TCQ_F_THROTTLED;
+	netif_schedule(sch->dev);
+}
+
+
+#ifdef CONFIG_NET_CLS_POLICE
+
+static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
+{
+	int len = skb->len;
+	struct Qdisc *sch = child->__parent;
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl = q->rx_class;
+
+	q->rx_class = NULL;
+
+	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
+
+		cbq_mark_toplevel(q, cl);
+
+		q->rx_class = cl;
+		cl->q->__parent = sch;
+
+		if (cl->q->enqueue(skb, cl->q) == 0) {
+			sch->q.qlen++;
+			sch->bstats.packets++;
+			sch->bstats.bytes+=len;
+			if (!cl->next_alive)
+				cbq_activate_class(cl);
+			return 0;
+		}
+		sch->qstats.drops++;
+		return 0;
+	}
+
+	sch->qstats.drops++;
+	return -1;
+}
+#endif
+
+/* 
+   It is mission critical procedure.
+
+   We "regenerate" toplevel cutoff, if transmitting class
+   has backlog and it is not regulated. It is not part of
+   original CBQ description, but looks more reasonable.
+   Probably, it is wrong. This question needs further investigation.
+*/
+
+static __inline__ void
+cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
+		    struct cbq_class *borrowed)
+{
+	if (cl && q->toplevel >= borrowed->level) {
+		if (cl->q->q.qlen > 1) {
+			do {
+				if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
+					q->toplevel = borrowed->level;
+					return;
+				}
+			} while ((borrowed=borrowed->borrow) != NULL);
+		}
+#if 0	
+	/* It is not necessary now. Uncommenting it
+	   will save CPU cycles, but decrease fairness.
+	 */
+		q->toplevel = TC_CBQ_MAXLEVEL;
+#endif
+	}
+}
+
+static void
+cbq_update(struct cbq_sched_data *q)
+{
+	struct cbq_class *this = q->tx_class;
+	struct cbq_class *cl = this;
+	int len = q->tx_len;
+
+	q->tx_class = NULL;
+
+	for ( ; cl; cl = cl->share) {
+		long avgidle = cl->avgidle;
+		long idle;
+
+		cl->bstats.packets++;
+		cl->bstats.bytes += len;
+
+		/*
+		   (now - last) is total time between packet right edges.
+		   (last_pktlen/rate) is "virtual" busy time, so that
+
+		         idle = (now - last) - last_pktlen/rate
+		 */
+
+		idle = PSCHED_TDIFF(q->now, cl->last);
+		if ((unsigned long)idle > 128*1024*1024) {
+			avgidle = cl->maxidle;
+		} else {
+			idle -= L2T(cl, len);
+
+		/* true_avgidle := (1-W)*true_avgidle + W*idle,
+		   where W=2^{-ewma_log}. But cl->avgidle is scaled:
+		   cl->avgidle == true_avgidle/W,
+		   hence:
+		 */
+			avgidle += idle - (avgidle>>cl->ewma_log);
+		}
+
+		if (avgidle <= 0) {
+			/* Overlimit or at-limit */
+
+			if (avgidle < cl->minidle)
+				avgidle = cl->minidle;
+
+			cl->avgidle = avgidle;
+
+			/* Calculate expected time, when this class
+			   will be allowed to send.
+			   It will occur, when:
+			   (1-W)*true_avgidle + W*delay = 0, i.e.
+			   idle = (1/W - 1)*(-true_avgidle)
+			   or
+			   idle = (1 - W)*(-cl->avgidle);
+			 */
+			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
+
+			/*
+			   That is not all.
+			   To maintain the rate allocated to the class,
+			   we add to undertime virtual clock,
+			   necessary to complete transmitted packet.
+			   (len/phys_bandwidth has been already passed
+			   to the moment of cbq_update)
+			 */
+
+			idle -= L2T(&q->link, len);
+			idle += L2T(cl, len);
+
+			PSCHED_AUDIT_TDIFF(idle);
+
+			PSCHED_TADD2(q->now, idle, cl->undertime);
+		} else {
+			/* Underlimit */
+
+			PSCHED_SET_PASTPERFECT(cl->undertime);
+			if (avgidle > cl->maxidle)
+				cl->avgidle = cl->maxidle;
+			else
+				cl->avgidle = avgidle;
+		}
+		cl->last = q->now;
+	}
+
+	cbq_update_toplevel(q, this, q->tx_borrowed);
+}
+
+static __inline__ struct cbq_class *
+cbq_under_limit(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	struct cbq_class *this_cl = cl;
+
+	if (cl->tparent == NULL)
+		return cl;
+
+	if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
+	    !PSCHED_TLESS(q->now, cl->undertime)) {
+		cl->delayed = 0;
+		return cl;
+	}
+
+	do {
+		/* It is very suspicious place. Now overlimit
+		   action is generated for not bounded classes
+		   only if link is completely congested.
+		   Though it is in agree with ancestor-only paradigm,
+		   it looks very stupid. Particularly,
+		   it means that this chunk of code will either
+		   never be called or result in strong amplification
+		   of burstiness. Dangerous, silly, and, however,
+		   no another solution exists.
+		 */
+		if ((cl = cl->borrow) == NULL) {
+			this_cl->qstats.overlimits++;
+			this_cl->overlimit(this_cl);
+			return NULL;
+		}
+		if (cl->level > q->toplevel)
+			return NULL;
+	} while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
+		 PSCHED_TLESS(q->now, cl->undertime));
+
+	cl->delayed = 0;
+	return cl;
+}
+
+static __inline__ struct sk_buff *
+cbq_dequeue_prio(struct Qdisc *sch, int prio)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct cbq_class *cl_tail, *cl_prev, *cl;
+	struct sk_buff *skb;
+	int deficit;
+
+	cl_tail = cl_prev = q->active[prio];
+	cl = cl_prev->next_alive;
+
+	do {
+		deficit = 0;
+
+		/* Start round */
+		do {
+			struct cbq_class *borrow = cl;
+
+			if (cl->q->q.qlen &&
+			    (borrow = cbq_under_limit(cl)) == NULL)
+				goto skip_class;
+
+			if (cl->deficit <= 0) {
+				/* Class exhausted its allotment per
+				   this round. Switch to the next one.
+				 */
+				deficit = 1;
+				cl->deficit += cl->quantum;
+				goto next_class;
+			}
+
+			skb = cl->q->dequeue(cl->q);
+
+			/* Class did not give us any skb :-(
+			   It could occur even if cl->q->q.qlen != 0 
+			   f.e. if cl->q == "tbf"
+			 */
+			if (skb == NULL)
+				goto skip_class;
+
+			cl->deficit -= skb->len;
+			q->tx_class = cl;
+			q->tx_borrowed = borrow;
+			if (borrow != cl) {
+#ifndef CBQ_XSTATS_BORROWS_BYTES
+				borrow->xstats.borrows++;
+				cl->xstats.borrows++;
+#else
+				borrow->xstats.borrows += skb->len;
+				cl->xstats.borrows += skb->len;
+#endif
+			}
+			q->tx_len = skb->len;
+
+			if (cl->deficit <= 0) {
+				q->active[prio] = cl;
+				cl = cl->next_alive;
+				cl->deficit += cl->quantum;
+			}
+			return skb;
+
+skip_class:
+			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
+				/* Class is empty or penalized.
+				   Unlink it from active chain.
+				 */
+				cl_prev->next_alive = cl->next_alive;
+				cl->next_alive = NULL;
+
+				/* Did cl_tail point to it? */
+				if (cl == cl_tail) {
+					/* Repair it! */
+					cl_tail = cl_prev;
+
+					/* Was it the last class in this band? */
+					if (cl == cl_tail) {
+						/* Kill the band! */
+						q->active[prio] = NULL;
+						q->activemask &= ~(1<<prio);
+						if (cl->q->q.qlen)
+							cbq_activate_class(cl);
+						return NULL;
+					}
+
+					q->active[prio] = cl_tail;
+				}
+				if (cl->q->q.qlen)
+					cbq_activate_class(cl);
+
+				cl = cl_prev;
+			}
+
+next_class:
+			cl_prev = cl;
+			cl = cl->next_alive;
+		} while (cl_prev != cl_tail);
+	} while (deficit);
+
+	q->active[prio] = cl_prev;
+
+	return NULL;
+}
+
+static __inline__ struct sk_buff *
+cbq_dequeue_1(struct Qdisc *sch)
+{
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	unsigned activemask;
+
+	activemask = q->activemask&0xFF;
+	while (activemask) {
+		int prio = ffz(~activemask);
+		activemask &= ~(1<<prio);
+		skb = cbq_dequeue_prio(sch, prio);
+		if (skb)
+			return skb;
+	}
+	return NULL;
+}
+
+static struct sk_buff *
+cbq_dequeue(struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+	struct cbq_sched_data *q = qdisc_priv(sch);
+	psched_time_t now;
+	psched_tdiff_t incr;
+
+	PSCHED_GET_TIME(now);
+	incr = PSCHED_TDIFF(now, q->now_rt);
+
+	if (q->tx_class) {
+		psched_tdiff_t incr2;
+		/* Time integrator. We calculate EOS time
+		   by adding expected packet transmission time.
+		   If real time is greater, we warp artificial clock,
+		   so that:
+
+		   cbq_time = max(real_time, work);
+		 */
+		incr2 = L2T(&q->link, q->tx_len);
+		PSCHED_TADD(q->now, incr2);
+		cbq_update(q);
+		if ((incr -= incr2) < 0)
+			incr = 0;
+	}
+	PSCHED_TADD(q->now, incr);
+	q->now_rt = now;
+
+	for (;;) {
+		q->wd_expires = 0;
+
+		skb = cbq_dequeue_1(sch);
+		if (skb) {
+			sch->q.qlen--;
+			sch->flags &= ~TCQ_F_THROTTLED;
+			return skb;
+		}
+
+		/* All the classes are overlimit.
+
+		   It is possible, if:
+
+		   1. Scheduler is empty.
+		   2. Toplevel cutoff inhibited borrowing.
+		   3. Root class is overlimit.
+
+		   Reset 2d and 3d conditions and retry.
+
+		   Note, that NS and cbq-2.0 are buggy, peeking
+		   an arbitrary class is appropriate for ancestor-only
+		   sharing, but not for toplevel algorithm.
+
+		   Our version is better, but slower, because it requires
+		   two passes, but it is unavoidable with top-level sharing.
+		*/
+
+		if (q->toplevel == TC_CBQ_MAXLEVEL &&
+		    PSCHED_IS_PASTPERFECT(q->link.undertime))
+			break;
+
+		q->toplevel = TC_CBQ_MAXLEVEL;
+		PSCHED_SET_PASTPERFECT(q->link.undertime);
+	}
+
+	/* No packets in scheduler or nobody wants to give them to us :-(
+	   Sigh... start watchdog timer in the last case. */
+
+	if (sch->q.qlen) {
+		sch->qstats.overlimits++;
+		if (q->wd_expires) {
+			long delay = PSCHED_US2JIFFIE(q->wd_expires);
+			if (delay <= 0)
+				delay = 1;
+			mod_timer(&q->wd_timer, jiffies + delay);
+			sch->flags |= TCQ_F_THROTTLED;
+		}
+	}
+	return NULL;
+}
+
+/* CBQ class maintanance routines */
+
+static void cbq_adjust_levels(struct cbq_class *this)
+{
+	if (this == NULL)
+		return;
+
+	do {
+		int level = 0;
+		struct cbq_class *cl;
+
+		if ((cl = this->children) != NULL) {
+			do {
+				if (cl->level > level)
+					level = cl->level;
+			} while ((cl = cl->sibling) != this->children);
+		}
+		this->level = level+1;
+	} while ((this = this->tparent) != NULL);
+}
+
+static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
+{
+	struct cbq_class *cl;
+	unsigned h;
+
+	if (q->quanta[prio] == 0)
+		return;
+
+	for (h=0; h<16; h++) {
+		for (cl = q->classes[h]; cl; cl = cl->next) {
+			/* BUGGGG... Beware! This expression suffer of
+			   arithmetic overflows!
+			 */
+			if (cl->priority == prio) {
+				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
+					q->quanta[prio];
+			}
+			if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
+				printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
+				cl->quantum = cl->qdisc->dev->mtu/2 + 1;
+			}
+		}
+	}
+}
+
+static void cbq_sync_defmap(struct cbq_class *cl)
+{
+	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
+	struct cbq_class *split = cl->split;
+	unsigned h;
+	int i;
+
+	if (split == NULL)
+		return;
+
+	for (i=0; i<=TC_PRIO_MAX; i++) {
+		if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
+			split->defaults[i] = NULL;
+	}
+
+	for (i=0; i<=TC_PRIO_MAX; i++) {
+		int level = split->level;
+
+		if (split->defaults[i])
+			continue;
+
+		for (h=0; h<16; h++) {
+			struct cbq_class *c;
+
+			for (c = q->classes[h]; c; c = c->next) {
+				if (c->split == split && c->level < level &&
+				    c->defmap&(1<<i)) {
+					split->defaults[i] = c;
+					level = c->level;
+				}
+			}
+		}
+	}
+}
+
+static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
+{
+	struct cbq_class *split = NULL;
+
+	if (splitid == 0) {
+		if ((split = cl->split) == NULL)
+			return;
+		splitid = split->classid;
+	}
+
+	if (split == NULL || split->classid != splitid) {
+		for (split = cl->tparent; split; split = split->tparent)
+			if (split->classid == splitid)
+				break;
+	}
+
+	if (split == NULL)
+		return;
+
+	if (cl->split != split) {
+		cl->defmap = 0;
+		cbq_sync_defmap(cl);
+		cl->split = split;
+		cl->defmap = def&mask;
+	} else
+		cl->defmap = (cl->defmap&~mask)|(def&mask);
+
+	cbq_sync_defmap(cl);
+}
+
+static void cbq_unlink_class(struct cbq_class *this)
+{
+	struct cbq_class *cl, **clp;
+	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
+
+	for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
+		if (cl == this) {
+			*clp = cl->next;
+			cl->next = NULL;
+			break;
+		}
+	}
+
+	if (this->tparent) {
+		clp=&this->sibling;
+		cl = *clp;
+		do {
+			if (cl == this) {
+				*clp = cl->sibling;
+				break;
+			}
+			clp = &cl->sibling;
+		} while ((cl = *clp) != this->sibling);
+
+		if (this->tparent->children == this) {
+			this->tparent->children = this->sibling;
+			if (this->sibling == this)
+				this->tparent->children = NULL;
+		}
+	} else {
+		BUG_TRAP(this->sibling == this);
+	}
+}
+
+static void cbq_link_class(struct cbq_class *this)
+{
+	str
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/sched/sch_cbq.c