From 3afa6d00fb4f9712fbb44b63ba31f88b6f9239fe Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 20 Aug 2012 07:59:10 +0000 Subject: cls_cgroup: Allow classifier cgroups to have their classid reset to 0 The network classifier cgroup initalizes each cgroups instance classid value to 0. However, the sock_update_classid function only updates classid's in sockets if the tasks cgroup classid is not zero, and if it differs from the current classid. The later check is to prevent cache line dirtying, but the former is detrimental, as it prevents resetting a classid for a cgroup to 0. While this is not a common action, it has administrative usefulness (if the admin wants to disable classification of a certain group temporarily for instance). Easy fix, just remove the zero check. Tested successfully by myself Signed-off-by: Neil Horman CC: "David S. Miller" Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 8f67ced8d6a8..116786c55fe9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1230,7 +1230,7 @@ void sock_update_classid(struct sock *sk) rcu_read_lock(); /* doing current task, which cannot vanish. */ classid = task_cls_classid(current); rcu_read_unlock(); - if (classid && classid != sk->sk_classid) + if (classid != sk->sk_classid) sk->sk_classid = classid; } EXPORT_SYMBOL(sock_update_classid); -- cgit v1.2.3 From 5640f7685831e088fe6c2e1f863a6805962f8e81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Sep 2012 23:04:42 +0000 Subject: net: use a per task frag allocator We currently use a per socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. Its done to increase probability of coalescing small write() into single segments in skbs still in write queue (not yet sent) But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page Its also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit page allocator more than wanted. This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (up to 32768 bytes per frag, thats order-3 pages on x86) This increases TCP stream performance by 20% on loopback device, but also benefits on other network devices, since 8x less frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled. Its possible some SG enabled hardware cant cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment in sub fragments, since some arches have PAGE_SIZE=65536 Successfully tested on various ethernet devices. (ixgbe, igb, bnx2x, tg3, mellanox mlx4) Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Vijay Subramanian Cc: Alexander Duyck Tested-by: Vijay Subramanian Signed-off-by: David S. Miller --- net/core/sock.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 2693f7649222..727114cd6f7e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1744,6 +1744,45 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +/* On 32bit arches, an skb frag is limited to 2^15 */ +#define SKB_FRAG_PAGE_ORDER get_order(32768) + +bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + int order; + + if (pfrag->page) { + if (atomic_read(&pfrag->page->_count) == 1) { + pfrag->offset = 0; + return true; + } + if (pfrag->offset < pfrag->size) + return true; + put_page(pfrag->page); + } + + /* We restrict high order allocations to users that can afford to wait */ + order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0; + + do { + gfp_t gfp = sk->sk_allocation; + + if (order) + gfp |= __GFP_COMP | __GFP_NOWARN; + pfrag->page = alloc_pages(gfp, order); + if (likely(pfrag->page)) { + pfrag->offset = 0; + pfrag->size = PAGE_SIZE << order; + return true; + } + } while (--order >= 0); + + sk_enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; +} +EXPORT_SYMBOL(sk_page_frag_refill); + static void __lock_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) @@ -2173,8 +2212,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_error_report = sock_def_error_report; sk->sk_destruct = sock_def_destruct; - sk->sk_sndmsg_page = NULL; - sk->sk_sndmsg_off = 0; + sk->sk_frag.page = NULL; + sk->sk_frag.offset = 0; sk->sk_peek_off = -1; sk->sk_peer_pid = NULL; @@ -2417,6 +2456,12 @@ void sk_common_release(struct sock *sk) xfrm_sk_free_policy(sk); sk_refcnt_debug_release(sk); + + if (sk->sk_frag.page) { + put_page(sk->sk_frag.page); + sk->sk_frag.page = NULL; + } + sock_put(sk); } EXPORT_SYMBOL(sk_common_release); -- cgit v1.2.3 From e2bcabec6ea5ba30dd2097dc1566e9957d14117c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Sep 2012 11:32:13 +0000 Subject: net: remove sk_init() helper It seems sk_init() has no value today and even does strange things : # grep . /proc/sys/net/core/?mem_* /proc/sys/net/core/rmem_default:212992 /proc/sys/net/core/rmem_max:131071 /proc/sys/net/core/wmem_default:212992 /proc/sys/net/core/wmem_max:131071 We can remove it completely. Signed-off-by: Eric Dumazet Reviewed-by: Shan Wei Signed-off-by: David S. Miller --- net/core/sock.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net/core/sock.c') diff --git a/net/core/sock.c b/net/core/sock.c index 727114cd6f7e..f5a426097236 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1464,19 +1464,6 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } EXPORT_SYMBOL_GPL(sk_setup_caps); -void __init sk_init(void) -{ - if (totalram_pages <= 4096) { - sysctl_wmem_max = 32767; - sysctl_rmem_max = 32767; - sysctl_wmem_default = 32767; - sysctl_rmem_default = 32767; - } else if (totalram_pages >= 131072) { - sysctl_wmem_max = 131071; - sysctl_rmem_max = 131071; - } -} - /* * Simple resource managers for sockets. */ -- cgit v1.2.3