From 6c148184b5c868ad2c8a5a4a777cd8097622368a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:54:06 -0800 Subject: net: sched: cleanup qdisc_run and __qdisc_run semantics Currently __qdisc_run calls qdisc_run_end() but does not call qdisc_run_begin(). This makes it hard to track pairs of qdisc_run_{begin,end} across function calls. To simplify reading these code paths this patch moves begin/end calls into qdisc_run(). Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 4 +++- net/core/dev.c | 5 +++-- net/sched/sch_generic.c | 2 -- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index d1f413f06c72..4eea7198898e 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -113,8 +113,10 @@ void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { - if (qdisc_run_begin(q)) + if (qdisc_run_begin(q)) { __qdisc_run(q); + qdisc_run_end(q); + } } static inline __be16 tc_skb_protocol(const struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index 6bea8931bb62..44c7de365f55 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3192,9 +3192,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); - } else - qdisc_run_end(q); + } + qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; @@ -3204,6 +3204,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); + qdisc_run_end(q); } } spin_unlock(root_lock); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 3839cbbdc32b..f6803e1d6783 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -266,8 +266,6 @@ void __qdisc_run(struct Qdisc *q) break; } } - - qdisc_run_end(q); } unsigned long dev_trans_start(struct net_device *dev) -- cgit v1.2.3 From 6b3ba9146fe64b9bebb6346c9dcfe3b4851de2d7 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:54:25 -0800 Subject: net: sched: allow qdiscs to handle locking This patch adds a flag for queueing disciplines to indicate the stack does not need to use the qdisc lock to protect operations. This can be used to build lockless scheduling algorithms and improving performance. The flag is checked in the tx path and the qdisc lock is only taken if it is not set. For now use a conditional if statement. Later we could be more aggressive if it proves worthwhile and use a static key or wrap this in a likely(). Also the lockless case drops the TCQ_F_CAN_BYPASS logic. The reason for this is synchronizing a qlen counter across threads proves to cost more than doing the enqueue/dequeue operations when tested with pktgen. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + net/core/dev.c | 26 ++++++++++++++++++++++---- net/sched/sch_generic.c | 30 ++++++++++++++++++++---------- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7dd8b0b0d244..77791fa055de 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -71,6 +71,7 @@ struct Qdisc { * qdisc_tree_decrease_qlen() should stop. */ #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ +#define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; diff --git a/net/core/dev.c b/net/core/dev.c index 44c7de365f55..e32cf5c7f200 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3162,6 +3162,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, int rc; qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + } else { + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + __qdisc_run(q); + } + + if (unlikely(to_free)) + kfree_skb_list(to_free); + return rc; + } + /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -4144,19 +4159,22 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) while (head) { struct Qdisc *q = head; - spinlock_t *root_lock; + spinlock_t *root_lock = NULL; head = head->next_sched; - root_lock = qdisc_lock(q); - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); } } } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f6803e1d6783..ec757f66896a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -174,7 +174,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, int ret = NETDEV_TX_BUSY; /* And release qdisc */ - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) @@ -187,10 +188,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_UNLOCK(dev, txq); } else { - spin_lock(root_lock); + if (root_lock) + spin_lock(root_lock); return qdisc_qlen(q); } - spin_lock(root_lock); + + if (root_lock) + spin_lock(root_lock); if (dev_xmit_complete(ret)) { /* Driver sent out skb successfully or skb was consumed */ @@ -231,9 +235,9 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, */ static inline int qdisc_restart(struct Qdisc *q, int *packets) { + spinlock_t *root_lock = NULL; struct netdev_queue *txq; struct net_device *dev; - spinlock_t *root_lock; struct sk_buff *skb; bool validate; @@ -242,7 +246,9 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets) if (unlikely(!skb)) return 0; - root_lock = qdisc_lock(q); + if (!(q->flags & TCQ_F_NOLOCK)) + root_lock = qdisc_lock(q); + dev = qdisc_dev(q); txq = skb_get_tx_queue(dev, skb); @@ -880,14 +886,18 @@ static bool some_qdisc_is_busy(struct net_device *dev) dev_queue = netdev_get_tx_queue(dev, i); q = dev_queue->qdisc_sleeping; - root_lock = qdisc_lock(q); - spin_lock_bh(root_lock); + if (q->flags & TCQ_F_NOLOCK) { + val = test_bit(__QDISC_STATE_SCHED, &q->state); + } else { + root_lock = qdisc_lock(q); + spin_lock_bh(root_lock); - val = (qdisc_is_running(q) || - test_bit(__QDISC_STATE_SCHED, &q->state)); + val = (qdisc_is_running(q) || + test_bit(__QDISC_STATE_SCHED, &q->state)); - spin_unlock_bh(root_lock); + spin_unlock_bh(root_lock); + } if (val) return true; -- cgit v1.2.3 From 29b86cdac00a82f88b81c16659e64cc624550216 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:54:47 -0800 Subject: net: sched: remove remaining uses for qdisc_qlen in xmit path sch_direct_xmit() uses qdisc_qlen as a return value but all call sites of the routine only check if it is zero or not. Simplify the logic so that we don't need to return an actual queue length value. This introduces a case now where sch_direct_xmit would have returned a qlen of zero but now it returns true. However in this case all call sites of sch_direct_xmit will implement a dequeue() and get a null skb and abort. This trades tracking qlen in the hotpath for an extra dequeue operation. Overall this seems to be good for performance. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 6 +++--- net/sched/sch_generic.c | 28 +++++++++++++--------------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 4eea7198898e..240469228851 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -105,9 +105,9 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); -int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, - struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock, bool validate); +bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ec757f66896a..cbc0a9a00e30 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -164,12 +164,12 @@ trace: * only one CPU can execute this function. * * Returns to the caller: - * 0 - queue is empty or throttled. - * >0 - queue is not empty. + * false - hardware queue frozen backoff + * true - feel free to send more pkts */ -int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, - struct net_device *dev, struct netdev_queue *txq, - spinlock_t *root_lock, bool validate) +bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, + struct net_device *dev, struct netdev_queue *txq, + spinlock_t *root_lock, bool validate) { int ret = NETDEV_TX_BUSY; @@ -190,28 +190,26 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, } else { if (root_lock) spin_lock(root_lock); - return qdisc_qlen(q); + return true; } if (root_lock) spin_lock(root_lock); - if (dev_xmit_complete(ret)) { - /* Driver sent out skb successfully or skb was consumed */ - ret = qdisc_qlen(q); - } else { + if (!dev_xmit_complete(ret)) { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) net_warn_ratelimited("BUG %s code %d qlen %d\n", dev->name, ret, q->q.qlen); - ret = dev_requeue_skb(skb, q); + dev_requeue_skb(skb, q); + return false; } if (ret && netif_xmit_frozen_or_stopped(txq)) - ret = 0; + return false; - return ret; + return true; } /* @@ -233,7 +231,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, * >0 - queue is not empty. * */ -static inline int qdisc_restart(struct Qdisc *q, int *packets) +static inline bool qdisc_restart(struct Qdisc *q, int *packets) { spinlock_t *root_lock = NULL; struct netdev_queue *txq; @@ -244,7 +242,7 @@ static inline int qdisc_restart(struct Qdisc *q, int *packets) /* Dequeue packet */ skb = dequeue_skb(q, &validate, packets); if (unlikely(!skb)) - return 0; + return false; if (!(q->flags & TCQ_F_NOLOCK)) root_lock = qdisc_lock(q); -- cgit v1.2.3 From 40bd036219dca86938fd5bd84b3fc19ffa812596 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:55:07 -0800 Subject: net: sched: provide per cpu qstat helpers The per cpu qstats support was added with per cpu bstat support which is currently used by the ingress qdisc. This patch adds a set of helpers needed to make other qdiscs that use qstats per cpu as well. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 77791fa055de..eff31d824861 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -632,12 +632,39 @@ static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, sch->qstats.backlog -= qdisc_pkt_len(skb); } +static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch, + const struct sk_buff *skb) +{ + this_cpu_sub(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); +} + static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog += qdisc_pkt_len(skb); } +static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, + const struct sk_buff *skb) +{ + this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); +} + +static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) +{ + this_cpu_inc(sch->cpu_qstats->qlen); +} + +static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) +{ + this_cpu_dec(sch->cpu_qstats->qlen); +} + +static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) +{ + this_cpu_inc(sch->cpu_qstats->requeues); +} + static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) { sch->qstats.drops += count; @@ -845,6 +872,14 @@ static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) qdisc_qstats_drop(sch); } +static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + __qdisc_drop(skb, to_free); + qdisc_qstats_cpu_drop(sch); + + return NET_XMIT_DROP; +} static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) -- cgit v1.2.3 From d59f5ffa59d80ff3a2f3b56a9acea4310974c6d1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:55:26 -0800 Subject: net: sched: a dflt qdisc may be used with per cpu stats Enable dflt qdisc support for per cpu stats before this patch a dflt qdisc was required to use the global statistics qstats and bstats. This adds a static flags field to qdisc_ops that is propagated into qdisc->flags in qdisc allocate call. This allows the allocation block to completely allocate the qdisc object so we don't have dangling allocations after qdisc init. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + net/sched/sch_generic.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index eff31d824861..6fd9a4e70066 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -180,6 +180,7 @@ struct Qdisc_ops { const struct Qdisc_class_ops *cl_ops; char id[IFNAMSIZ]; int priv_size; + unsigned int static_flags; int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cbc0a9a00e30..80e4ae3ad693 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -632,6 +632,19 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, qdisc_skb_head_init(&sch->q); spin_lock_init(&sch->q.lock); + if (ops->static_flags & TCQ_F_CPUSTATS) { + sch->cpu_bstats = + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto errout1; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) { + free_percpu(sch->cpu_bstats); + goto errout1; + } + } + spin_lock_init(&sch->busylock); lockdep_set_class(&sch->busylock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); @@ -641,6 +654,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; + sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; @@ -648,6 +662,8 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, refcount_set(&sch->refcnt, 1); return sch; +errout1: + kfree(p); errout: return ERR_PTR(err); } -- cgit v1.2.3 From a53851e2c3218aa30b77abd6e68cf1c371f15afe Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:55:45 -0800 Subject: net: sched: explicit locking in gso_cpu fallback This work is preparing the qdisc layer to support egress lockless qdiscs. If we are running the egress qdisc lockless in the case we overrun the netdev, for whatever reason, the netdev returns a busy error code and the skb is parked on the gso_skb pointer. With many cores all hitting this case at once its possible to have multiple sk_buffs here so we turn gso_skb into a queue. This should be the edge case and if we see this frequently then the netdev/qdisc layer needs to back off. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 20 ++++++----- net/sched/sch_generic.c | 85 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 84 insertions(+), 21 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 6fd9a4e70066..9b9e4feda127 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -88,7 +88,7 @@ struct Qdisc { /* * For performance sake on SMP, we put highly modified fields at the end */ - struct sk_buff *gso_skb ____cacheline_aligned_in_smp; + struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; struct qdisc_skb_head q; struct gnet_stats_basic_packed bstats; seqcount_t running; @@ -796,26 +796,30 @@ static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) /* generic pseudo peek method for non-work-conserving qdisc */ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) { + struct sk_buff *skb = skb_peek(&sch->gso_skb); + /* we can reuse ->gso_skb because peek isn't called for root qdiscs */ - if (!sch->gso_skb) { - sch->gso_skb = sch->dequeue(sch); - if (sch->gso_skb) { + if (!skb) { + skb = sch->dequeue(sch); + + if (skb) { + __skb_queue_head(&sch->gso_skb, skb); /* it's still part of the queue */ - qdisc_qstats_backlog_inc(sch, sch->gso_skb); + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; } } - return sch->gso_skb; + return skb; } /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { - struct sk_buff *skb = sch->gso_skb; + struct sk_buff *skb = skb_peek(&sch->gso_skb); if (skb) { - sch->gso_skb = NULL; + skb = __skb_dequeue(&sch->gso_skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } else { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 80e4ae3ad693..dfeabe319c56 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -45,10 +45,9 @@ EXPORT_SYMBOL(default_qdisc_ops); * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. */ - -static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { - q->gso_skb = skb; + __skb_queue_head(&q->gso_skb, skb); q->qstats.requeues++; qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; /* it's still part of the queue */ @@ -57,6 +56,30 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) return 0; } +static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q) +{ + spinlock_t *lock = qdisc_lock(q); + + spin_lock(lock); + __skb_queue_tail(&q->gso_skb, skb); + spin_unlock(lock); + + qdisc_qstats_cpu_requeues_inc(q); + qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_qlen_inc(q); + __netif_schedule(q); + + return 0; +} + +static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) +{ + if (q->flags & TCQ_F_NOLOCK) + return dev_requeue_skb_locked(skb, q); + else + return __dev_requeue_skb(skb, q); +} + static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, @@ -112,23 +135,50 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets) { - struct sk_buff *skb = q->gso_skb; const struct netdev_queue *txq = q->dev_queue; + struct sk_buff *skb; *packets = 1; - if (unlikely(skb)) { + if (unlikely(!skb_queue_empty(&q->gso_skb))) { + spinlock_t *lock = NULL; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + skb = skb_peek(&q->gso_skb); + + /* skb may be null if another cpu pulls gso_skb off in between + * empty check and lock. + */ + if (!skb) { + if (lock) + spin_unlock(lock); + goto validate; + } + /* skb in gso_skb were already validated */ *validate = false; /* check the reason of requeuing without tx lock first */ txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { - q->gso_skb = NULL; - qdisc_qstats_backlog_dec(q, skb); - q->q.qlen--; - } else + skb = __skb_dequeue(&q->gso_skb); + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_dec(q, skb); + qdisc_qstats_cpu_qlen_dec(q); + } else { + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + } + } else { skb = NULL; + } + if (lock) + spin_unlock(lock); goto trace; } +validate: *validate = true; skb = q->skb_bad_txq; if (unlikely(skb)) { @@ -629,6 +679,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p); sch->padded = (char *) sch - (char *) p; } + __skb_queue_head_init(&sch->gso_skb); qdisc_skb_head_init(&sch->q); spin_lock_init(&sch->q.lock); @@ -697,6 +748,7 @@ EXPORT_SYMBOL(qdisc_create_dflt); void qdisc_reset(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; + struct sk_buff *skb, *tmp; if (ops->reset) ops->reset(qdisc); @@ -704,10 +756,11 @@ void qdisc_reset(struct Qdisc *qdisc) kfree_skb(qdisc->skb_bad_txq); qdisc->skb_bad_txq = NULL; - if (qdisc->gso_skb) { - kfree_skb_list(qdisc->gso_skb); - qdisc->gso_skb = NULL; + skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { + __skb_unlink(skb, &qdisc->gso_skb); + kfree_skb_list(skb); } + qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } @@ -726,6 +779,7 @@ static void qdisc_free(struct Qdisc *qdisc) void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; + struct sk_buff *skb, *tmp; if (qdisc->flags & TCQ_F_BUILTIN || !refcount_dec_and_test(&qdisc->refcnt)) @@ -745,7 +799,11 @@ void qdisc_destroy(struct Qdisc *qdisc) module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - kfree_skb_list(qdisc->gso_skb); + skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { + __skb_unlink(skb, &qdisc->gso_skb); + kfree_skb_list(skb); + } + kfree_skb(qdisc->skb_bad_txq); qdisc_free(qdisc); } @@ -973,6 +1031,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; + __skb_queue_head_init(&qdisc->gso_skb); } void dev_init_scheduler(struct net_device *dev) -- cgit v1.2.3 From 7bbde83b1860c28a1cc35516352c4e7e5172c29a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:56:04 -0800 Subject: net: sched: drop qdisc_reset from dev_graft_qdisc In qdisc_graft_qdisc a "new" qdisc is attached and the 'qdisc_destroy' operation is called on the old qdisc. The destroy operation will wait a rcu grace period and call qdisc_rcu_free(). At which point gso_cpu_skb is free'd along with all stats so no need to zero stats and gso_cpu_skb from the graft operation itself. Further after dropping the qdisc locks we can not continue to call qdisc_reset before waiting an rcu grace period so that the qdisc is detached from all cpus. By removing the qdisc_reset() here we get the correct property of waiting an rcu grace period and letting the qdisc_destroy operation clean up the qdisc correctly. Note, a refcnt greater than 1 would cause the destroy operation to be aborted however if this ever happened the reference to the qdisc would be lost and we would have a memory leak. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index dfeabe319c56..482ba2234470 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -819,10 +819,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); - /* Prune old scheduler */ - if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1) - qdisc_reset(oqdisc); - /* ... and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; @@ -977,6 +973,16 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } +static void dev_qdisc_reset(struct net_device *dev, + struct netdev_queue *dev_queue, + void *none) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + + if (qdisc) + qdisc_reset(qdisc); +} + /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -987,7 +993,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) void dev_deactivate_many(struct list_head *head) { struct net_device *dev; - bool sync_needed = false; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, @@ -997,20 +1002,25 @@ void dev_deactivate_many(struct list_head *head) &noop_qdisc); dev_watchdog_down(dev); - sync_needed |= !dev->dismantle; } /* Wait for outstanding qdisc-less dev_queue_xmit calls. * This is avoided if all devices are in dismantle phase : * Caller will call synchronize_net() for us */ - if (sync_needed) - synchronize_net(); + synchronize_net(); /* Wait for outstanding qdisc_run calls. */ - list_for_each_entry(dev, head, close_list) + list_for_each_entry(dev, head, close_list) { while (some_qdisc_is_busy(dev)) yield(); + /* The new qdisc is assigned at this point so we can safely + * unwind stale skb lists and qdisc statistics + */ + netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL); + if (dev_ingress_queue(dev)) + dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL); + } } void dev_deactivate(struct net_device *dev) -- cgit v1.2.3 From 70e57d5e3f8ec7c482b92ef43e543d87134689ab Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:56:23 -0800 Subject: net: sched: use skb list for skb_bad_tx Similar to how gso is handled use skb list for skb_bad_tx this is required with lockless qdiscs because we may have multiple cores attempting to push skbs into skb_bad_tx concurrently Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 +- net/sched/sch_generic.c | 106 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 87 insertions(+), 21 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9b9e4feda127..da2528036e2e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -95,7 +95,7 @@ struct Qdisc { struct gnet_stats_queue qstats; unsigned long state; struct Qdisc *next_sched; - struct sk_buff *skb_bad_txq; + struct sk_buff_head skb_bad_txq; int padded; refcount_t refcnt; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 482ba2234470..84cef0570862 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -45,6 +45,68 @@ EXPORT_SYMBOL(default_qdisc_ops); * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. */ + +static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q) +{ + const struct netdev_queue *txq = q->dev_queue; + spinlock_t *lock = NULL; + struct sk_buff *skb; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + skb = skb_peek(&q->skb_bad_txq); + if (skb) { + /* check the reason of requeuing without tx lock first */ + txq = skb_get_tx_queue(txq->dev, skb); + if (!netif_xmit_frozen_or_stopped(txq)) { + skb = __skb_dequeue(&q->skb_bad_txq); + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_dec(q, skb); + qdisc_qstats_cpu_qlen_dec(q); + } else { + qdisc_qstats_backlog_dec(q, skb); + q->q.qlen--; + } + } else { + skb = NULL; + } + } + + if (lock) + spin_unlock(lock); + + return skb; +} + +static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q) +{ + struct sk_buff *skb = skb_peek(&q->skb_bad_txq); + + if (unlikely(skb)) + skb = __skb_dequeue_bad_txq(q); + + return skb; +} + +static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q, + struct sk_buff *skb) +{ + spinlock_t *lock = NULL; + + if (q->flags & TCQ_F_NOLOCK) { + lock = qdisc_lock(q); + spin_lock(lock); + } + + __skb_queue_tail(&q->skb_bad_txq, skb); + + if (lock) + spin_unlock(lock); +} + static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { __skb_queue_head(&q->gso_skb, skb); @@ -117,9 +179,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { - q->skb_bad_txq = nskb; - qdisc_qstats_backlog_inc(q, nskb); - q->q.qlen++; + qdisc_enqueue_skb_bad_txq(q, nskb); + + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_inc(q, nskb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + qdisc_qstats_backlog_inc(q, nskb); + q->q.qlen++; + } break; } skb->next = nskb; @@ -180,19 +248,9 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, } validate: *validate = true; - skb = q->skb_bad_txq; - if (unlikely(skb)) { - /* check the reason of requeuing without tx lock first */ - txq = skb_get_tx_queue(txq->dev, skb); - if (!netif_xmit_frozen_or_stopped(txq)) { - q->skb_bad_txq = NULL; - qdisc_qstats_backlog_dec(q, skb); - q->q.qlen--; - goto bulk; - } - skb = NULL; - goto trace; - } + skb = qdisc_dequeue_skb_bad_txq(q); + if (unlikely(skb)) + goto bulk; if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) skb = q->dequeue(q); @@ -680,6 +738,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->padded = (char *) sch - (char *) p; } __skb_queue_head_init(&sch->gso_skb); + __skb_queue_head_init(&sch->skb_bad_txq); qdisc_skb_head_init(&sch->q); spin_lock_init(&sch->q.lock); @@ -753,14 +812,16 @@ void qdisc_reset(struct Qdisc *qdisc) if (ops->reset) ops->reset(qdisc); - kfree_skb(qdisc->skb_bad_txq); - qdisc->skb_bad_txq = NULL; - skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { __skb_unlink(skb, &qdisc->gso_skb); kfree_skb_list(skb); } + skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { + __skb_unlink(skb, &qdisc->skb_bad_txq); + kfree_skb_list(skb); + } + qdisc->q.qlen = 0; qdisc->qstats.backlog = 0; } @@ -804,7 +865,11 @@ void qdisc_destroy(struct Qdisc *qdisc) kfree_skb_list(skb); } - kfree_skb(qdisc->skb_bad_txq); + skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { + __skb_unlink(skb, &qdisc->skb_bad_txq); + kfree_skb_list(skb); + } + qdisc_free(qdisc); } EXPORT_SYMBOL(qdisc_destroy); @@ -1042,6 +1107,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, rcu_assign_pointer(dev_queue->qdisc, qdisc); dev_queue->qdisc_sleeping = qdisc; __skb_queue_head_init(&qdisc->gso_skb); + __skb_queue_head_init(&qdisc->skb_bad_txq); } void dev_init_scheduler(struct net_device *dev) -- cgit v1.2.3 From fd8e8d1a775d82f04215f4b884a1962774805346 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:56:42 -0800 Subject: net: sched: check for frozen queue before skb_bad_txq check I can not think of any reason to pull the bad txq skb off the qdisc if the txq we plan to send this on is still frozen. So check for frozen queue first and abort before dequeuing either skb_bad_txq skb or normal qdisc dequeue() skb. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 84cef0570862..5ff93c2b5b99 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -204,7 +204,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, int *packets) { const struct netdev_queue *txq = q->dev_queue; - struct sk_buff *skb; + struct sk_buff *skb = NULL; *packets = 1; if (unlikely(!skb_queue_empty(&q->gso_skb))) { @@ -248,12 +248,15 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, } validate: *validate = true; + + if ((q->flags & TCQ_F_ONETXQUEUE) && + netif_xmit_frozen_or_stopped(txq)) + return skb; + skb = qdisc_dequeue_skb_bad_txq(q); if (unlikely(skb)) goto bulk; - if (!(q->flags & TCQ_F_ONETXQUEUE) || - !netif_xmit_frozen_or_stopped(txq)) - skb = q->dequeue(q); + skb = q->dequeue(q); if (skb) { bulk: if (qdisc_may_bulk(q)) -- cgit v1.2.3 From 7e66016f2c65bfc1181f42274fcb7f1183ab1bb5 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:57:00 -0800 Subject: net: sched: helpers to sum qlen and qlen for per cpu logic Add qdisc qlen helper routines for lockless qdiscs to use. The qdisc qlen is no longer used in the hotpath but it is reported via stats query on the qdisc so it still needs to be tracked. This adds the per cpu operations needed along with a helper to return the summation of per cpu stats. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/sch_generic.h | 20 ++++++++++++++++++++ net/sched/sch_api.c | 3 ++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index da2528036e2e..8f8c0afe529b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -292,11 +292,31 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } +static inline int qdisc_qlen_cpu(const struct Qdisc *q) +{ + return this_cpu_ptr(q->cpu_qstats)->qlen; +} + static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; } +static inline int qdisc_qlen_sum(const struct Qdisc *q) +{ + __u32 qlen = 0; + int i; + + if (q->flags & TCQ_F_NOLOCK) { + for_each_possible_cpu(i) + qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; + } else { + qlen = q->q.qlen; + } + + return qlen; +} + static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) { return (struct qdisc_skb_cb *)skb->cb; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a48ca41b7ecf..c669bb3b89b2 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -797,7 +797,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; - qlen = q->q.qlen; + + qlen = qdisc_qlen_sum(q); stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0) -- cgit v1.2.3 From b01ac095c740fc21f4bb21abe900b0f5b3042cf9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:57:20 -0800 Subject: net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq The sch_mq qdisc creates a sub-qdisc per tx queue which are then called independently for enqueue and dequeue operations. However statistics are aggregated and pushed up to the "master" qdisc. This patch adds support for any of the sub-qdiscs to be per cpu statistic qdiscs. To handle this case add a check when calculating stats and aggregate the per cpu stats if needed. Also exports __gnet_stats_copy_queue() to use as a helper function. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/net/gen_stats.h | 3 +++ net/core/gen_stats.c | 9 +++++---- net/sched/sch_mq.c | 25 ++++++++++++++++++------- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 304f7aa9cc01..0304ba2ae353 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -49,6 +49,9 @@ int gnet_stats_copy_rate_est(struct gnet_dump *d, int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen); +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu_q, + const struct gnet_stats_queue *q, __u32 qlen); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 87f28557b329..b2b2323bdc84 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) { if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); @@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, qstats->qlen = qlen; } +EXPORT_SYMBOL(__gnet_stats_copy_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 213b586a06a0..bc59f05e1a0f 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -17,6 +17,7 @@ #include #include #include +#include struct mq_sched { struct Qdisc **qdiscs; @@ -103,15 +104,25 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) memset(&sch->qstats, 0, sizeof(sch->qstats)); for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; + __u32 qlen = 0; + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; + + if (qdisc_is_percpu_stats(qdisc)) { + cpu_bstats = qdisc->cpu_bstats; + cpu_qstats = qdisc->cpu_qstats; + } + + qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, &sch->bstats, + cpu_bstats, &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + cpu_qstats, &qdisc->qstats, qlen); + spin_unlock_bh(qdisc_lock(qdisc)); } return 0; -- cgit v1.2.3 From ce679e8df7ed2a92660556d100cf370fe22b4eab Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:57:39 -0800 Subject: net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mqprio The sch_mqprio qdisc creates a sub-qdisc per tx queue which are then called independently for enqueue and dequeue operations. However statistics are aggregated and pushed up to the "master" qdisc. This patch adds support for any of the sub-qdiscs to be per cpu statistic qdiscs. To handle this case add a check when calculating stats and aggregate the per cpu stats if needed. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/sch_mq.c | 35 +++++++++++++++---------- net/sched/sch_mqprio.c | 69 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 69 insertions(+), 35 deletions(-) diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index bc59f05e1a0f..8cbb5c829d59 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -98,33 +98,42 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; + __u32 qlen = 0; sch->q.qlen = 0; memset(&sch->bstats, 0, sizeof(sch->bstats)); memset(&sch->qstats, 0, sizeof(sch->qstats)); + /* MQ supports lockless qdiscs. However, statistics accounting needs + * to account for all, none, or a mix of locked and unlocked child + * qdiscs. Percpu stats are added to counters in-band and locking + * qdisc totals are added at end. + */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; - struct gnet_stats_queue __percpu *cpu_qstats = NULL; - __u32 qlen = 0; - qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); if (qdisc_is_percpu_stats(qdisc)) { - cpu_bstats = qdisc->cpu_bstats; - cpu_qstats = qdisc->cpu_qstats; + qlen = qdisc_qlen_sum(qdisc); + __gnet_stats_copy_basic(NULL, &sch->bstats, + qdisc->cpu_bstats, + &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + qdisc->cpu_qstats, + &qdisc->qstats, qlen); + } else { + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; } - qlen = qdisc_qlen_sum(qdisc); - - __gnet_stats_copy_basic(NULL, &sch->bstats, - cpu_bstats, &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - cpu_qstats, &qdisc->qstats, qlen); - spin_unlock_bh(qdisc_lock(qdisc)); } + return 0; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b85885a9d8a1..8622745f3cd9 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -388,22 +388,40 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; - unsigned int i; + unsigned int ntx, tc; sch->q.qlen = 0; memset(&sch->bstats, 0, sizeof(sch->bstats)); memset(&sch->qstats, 0, sizeof(sch->qstats)); - for (i = 0; i < dev->num_tx_queues; i++) { - qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); + /* MQ supports lockless qdiscs. However, statistics accounting needs + * to account for all, none, or a mix of locked and unlocked child + * qdiscs. Percpu stats are added to counters in-band and locking + * qdisc totals are added at end. + */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; + + if (qdisc_is_percpu_stats(qdisc)) { + __u32 qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, &sch->bstats, + qdisc->cpu_bstats, + &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + qdisc->cpu_qstats, + &qdisc->qstats, qlen); + } else { + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + } + spin_unlock_bh(qdisc_lock(qdisc)); } @@ -411,9 +429,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); opt.hw = priv->hw_offload; - for (i = 0; i < netdev_get_num_tc(dev); i++) { - opt.count[i] = dev->tc_to_txq[i].count; - opt.offset[i] = dev->tc_to_txq[i].offset; + for (tc = 0; tc < netdev_get_num_tc(dev); tc++) { + opt.count[tc] = dev->tc_to_txq[tc].count; + opt.offset[tc] = dev->tc_to_txq[tc].offset; } if (nla_put(skb, TCA_OPTIONS, NLA_ALIGN(sizeof(opt)), &opt)) @@ -495,7 +513,6 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, if (cl >= TC_H_MIN_PRIORITY) { int i; __u32 qlen = 0; - struct Qdisc *qdisc; struct gnet_stats_queue qstats = {0}; struct gnet_stats_basic_packed bstats = {0}; struct net_device *dev = qdisc_dev(sch); @@ -511,18 +528,26 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); + struct Qdisc *qdisc = rtnl_dereference(q->qdisc); + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; - qdisc = rtnl_dereference(q->qdisc); spin_lock_bh(qdisc_lock(qdisc)); - qlen += qdisc->q.qlen; - bstats.bytes += qdisc->bstats.bytes; - bstats.packets += qdisc->bstats.packets; - qstats.backlog += qdisc->qstats.backlog; - qstats.drops += qdisc->qstats.drops; - qstats.requeues += qdisc->qstats.requeues; - qstats.overlimits += qdisc->qstats.overlimits; + if (qdisc_is_percpu_stats(qdisc)) { + cpu_bstats = qdisc->cpu_bstats; + cpu_qstats = qdisc->cpu_qstats; + } + + qlen = qdisc_qlen_sum(qdisc); + __gnet_stats_copy_basic(NULL, &sch->bstats, + cpu_bstats, &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + cpu_qstats, + &qdisc->qstats, + qlen); spin_unlock_bh(qdisc_lock(qdisc)); } + /* Reclaim root sleeping lock before completing stats */ if (d->lock) spin_lock_bh(d->lock); -- cgit v1.2.3 From 4a86a4cf68b4996a656d8c666617e8d839f8f839 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:57:59 -0800 Subject: net: skb_array: expose peek API This adds a peek routine to skb_array.h for use with qdisc. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/skb_array.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h index 8621ffdeecbf..c7addf37d119 100644 --- a/include/linux/skb_array.h +++ b/include/linux/skb_array.h @@ -72,6 +72,11 @@ static inline bool __skb_array_empty(struct skb_array *a) return !__ptr_ring_peek(&a->ring); } +static inline struct sk_buff *__skb_array_peek(struct skb_array *a) +{ + return __ptr_ring_peek(&a->ring); +} + static inline bool skb_array_empty(struct skb_array *a) { return ptr_ring_empty(&a->ring); -- cgit v1.2.3 From c5ad119fb6c09b0297446be05bd66602fa564758 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 7 Dec 2017 09:58:19 -0800 Subject: net: sched: pfifo_fast use skb_array This converts the pfifo_fast qdisc to use the skb_array data structure and set the lockless qdisc bit. pfifo_fast is the first qdisc to support the lockless bit that can be a child of a qdisc requiring locking. So we add logic to clear the lock bit on initialization in these cases when the qdisc graft operation occurs. This also removes the logic used to pick the next band to dequeue from and instead just checks a per priority array for packets from top priority to lowest. This might need to be a bit more clever but seems to work for now. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/sch_api.c | 5 ++ net/sched/sch_generic.c | 140 ++++++++++++++++++++++++++++++------------------ 2 files changed, 92 insertions(+), 53 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index c669bb3b89b2..a904276b657d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -955,6 +955,11 @@ skip: } else { const struct Qdisc_class_ops *cops = parent->ops->cl_ops; + /* Only support running class lockless if parent is lockless */ + if (new && (new->flags & TCQ_F_NOLOCK) && + parent && !(parent->flags & TCQ_F_NOLOCK)) + new->flags &= ~TCQ_F_NOLOCK; + err = -EOPNOTSUPP; if (cops && cops->graft) { unsigned long cl = cops->find(parent, classid); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 5ff93c2b5b99..ff6a5acf6ab0 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -578,93 +579,93 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { /* * Private data for a pfifo_fast scheduler containing: - * - queues for the three band - * - bitmap indicating which of the bands contain skbs + * - rings for priority bands */ struct pfifo_fast_priv { - u32 bitmap; - struct qdisc_skb_head q[PFIFO_FAST_BANDS]; + struct skb_array q[PFIFO_FAST_BANDS]; }; -/* - * Convert a bitmap to the first band number where an skb is queued, where: - * bitmap=0 means there are no skbs on any band. - * bitmap=1 means there is an skb on band 0. - * bitmap=7 means there are skbs on all 3 bands, etc. - */ -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; - -static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv, - int band) +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, + int band) { - return priv->q + band; + return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) { - int band = prio2band[skb->priority & TC_PRIO_MAX]; - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct qdisc_skb_head *list = band2list(priv, band); - - priv->bitmap |= (1 << band); - qdisc->q.qlen++; - return __qdisc_enqueue_tail(skb, qdisc, list); - } + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct skb_array *q = band2list(priv, band); + int err; - return qdisc_drop(skb, qdisc, to_free); + err = skb_array_produce(q, skb); + + if (unlikely(err)) + return qdisc_drop_cpu(skb, qdisc, to_free); + + qdisc_qstats_cpu_qlen_inc(qdisc); + qdisc_qstats_cpu_backlog_inc(qdisc, skb); + return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; - - if (likely(band >= 0)) { - struct qdisc_skb_head *qh = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qh); + struct sk_buff *skb = NULL; + int band; - if (likely(skb != NULL)) { - qdisc_qstats_backlog_dec(qdisc, skb); - qdisc_bstats_update(qdisc, skb); - } + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - qdisc->q.qlen--; - if (qh->qlen == 0) - priv->bitmap &= ~(1 << band); + if (__skb_array_empty(q)) + continue; - return skb; + skb = skb_array_consume_bh(q); + } + if (likely(skb)) { + qdisc_qstats_cpu_backlog_dec(qdisc, skb); + qdisc_bstats_cpu_update(qdisc, skb); + qdisc_qstats_cpu_qlen_dec(qdisc); } - return NULL; + return skb; } static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; + struct sk_buff *skb = NULL; + int band; - if (band >= 0) { - struct qdisc_skb_head *qh = band2list(priv, band); + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - return qh->head; + skb = __skb_array_peek(q); } - return NULL; + return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { - int prio; + int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(band2list(priv, prio)); + for (band = 0; band < PFIFO_FAST_BANDS; band++) { + struct skb_array *q = band2list(priv, band); + struct sk_buff *skb; - priv->bitmap = 0; - qdisc->qstats.backlog = 0; - qdisc->q.qlen = 0; + while ((skb = skb_array_consume_bh(q)) != NULL) + kfree_skb(skb); + } + + for_each_possible_cpu(i) { + struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i); + + q->backlog = 0; + q->qlen = 0; + } } static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) @@ -682,17 +683,48 @@ nla_put_failure: static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) { - int prio; + unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + int prio; + + /* guard against zero length rings */ + if (!qlen) + return -EINVAL; - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - qdisc_skb_head_init(band2list(priv, prio)); + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + int err; + + err = skb_array_init(q, qlen, GFP_KERNEL); + if (err) + return -ENOMEM; + } /* Can by-pass the queue discipline */ qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } +static void pfifo_fast_destroy(struct Qdisc *sch) +{ + struct pfifo_fast_priv *priv = qdisc_priv(sch); + int prio; + + for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) { + struct skb_array *q = band2list(priv, prio); + + /* NULL ring is possible if destroy path is due to a failed + * skb_array_init() in pfifo_fast_init() case. + */ + if (!&q->ring.queue) + continue; + /* Destroy ring but no need to kfree_skb because a call to + * pfifo_fast_reset() has already done that work. + */ + ptr_ring_cleanup(&q->ring, NULL); + } +} + struct Qdisc_ops pfifo_fast_ops __read_mostly = { .id = "pfifo_fast", .priv_size = sizeof(struct pfifo_fast_priv), @@ -700,9 +732,11 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dequeue = pfifo_fast_dequeue, .peek = pfifo_fast_peek, .init = pfifo_fast_init, + .destroy = pfifo_fast_destroy, .reset = pfifo_fast_reset, .dump = pfifo_fast_dump, .owner = THIS_MODULE, + .static_flags = TCQ_F_NOLOCK | TCQ_F_CPUSTATS, }; EXPORT_SYMBOL(pfifo_fast_ops); -- cgit v1.2.3