From 7f92083eb58f85ea114d97f65fcbe22be5b0468d Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 30 Sep 2016 11:11:07 +0200 Subject: vti6: flush x-netns xfrm cache when vti interface is removed This is the same fix than commit a5d0dc810abf ("vti: flush x-netns xfrm cache when vti interface is removed") This patch fixes a refcnt problem when a x-netns vti6 interface is removed: unregister_netdevice: waiting for vti6_test to become free. Usage count = 1 Here is a script to reproduce the problem: ip link set dev ntfp2 up ip addr add dev ntfp2 2001::1/64 ip link add vti6_test type vti6 local 2001::1 remote 2001::2 key 1 ip netns add secure ip link set vti6_test netns secure ip netns exec secure ip link set vti6_test up ip netns exec secure ip link s lo up ip netns exec secure ip addr add dev vti6_test 2003::1/64 ip -6 xfrm policy add dir out tmpl src 2001::1 dst 2001::2 proto esp \ mode tunnel mark 1 ip -6 xfrm policy add dir in tmpl src 2001::2 dst 2001::1 proto esp \ mode tunnel mark 1 ip xfrm state add src 2001::1 dst 2001::2 proto esp spi 1 mode tunnel \ enc des3_ede 0x112233445566778811223344556677881122334455667788 mark 1 ip xfrm state add src 2001::2 dst 2001::1 proto esp spi 1 mode tunnel \ enc des3_ede 0x112233445566778811223344556677881122334455667788 mark 1 ip netns exec secure ping6 -c 4 2003::2 ip netns del secure CC: Lance Richardson Signed-off-by: Nicolas Dichtel Acked-by: Lance Richardson Signed-off-by: Steffen Klassert --- net/ipv6/ip6_vti.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 8a02ca8a11af..c299c1e2bbf0 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1138,6 +1138,33 @@ static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .priority = 100, }; +static bool is_vti6_tunnel(const struct net_device *dev) +{ + return dev->netdev_ops == &vti6_netdev_ops; +} + +static int vti6_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct ip6_tnl *t = netdev_priv(dev); + + if (!is_vti6_tunnel(dev)) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_DOWN: + if (!net_eq(t->net, dev_net(dev))) + xfrm_garbage_collect(t->net); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block vti6_notifier_block __read_mostly = { + .notifier_call = vti6_device_event, +}; + /** * vti6_tunnel_init - register protocol and reserve needed resources * @@ -1148,6 +1175,8 @@ static int __init vti6_tunnel_init(void) const char *msg; int err; + register_netdevice_notifier(&vti6_notifier_block); + msg = "tunnel device"; err = register_pernet_device(&vti6_net_ops); if (err < 0) @@ -1180,6 +1209,7 @@ xfrm_proto_ah_failed: xfrm_proto_esp_failed: unregister_pernet_device(&vti6_net_ops); pernet_dev_failed: + unregister_netdevice_notifier(&vti6_notifier_block); pr_err("vti6 init: failed to register %s\n", msg); return err; } @@ -1194,6 +1224,7 @@ static void __exit vti6_tunnel_cleanup(void) xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); unregister_pernet_device(&vti6_net_ops); + unregister_netdevice_notifier(&vti6_notifier_block); } module_init(vti6_tunnel_init); -- cgit v1.2.3 From 330e832abda923df06a4ca6d3faac6e9c1b42548 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 17 Nov 2016 13:21:46 +0100 Subject: xfrm: unbreak xfrm_sk_policy_lookup if we succeed grabbing the refcount, then if 
(!err && !xfrm_pol_hold_rcu(pol)) will evaluate to false, so this hits the last else branch, which then sets pol to ERR_PTR(0), i.e. NULL. Fixes: ae33786f73a7ce ("xfrm: policy: only use rcu in xfrm_sk_policy_lookup") Reported-by: Nicolas Dichtel Tested-by: Nicolas Dichtel Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index fd6986634e6f..5bf7e1bfeac7 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1268,12 +1268,14 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, policy_to_flow_dir(dir)); - if (!err && !xfrm_pol_hold_rcu(pol)) - goto again; - else if (err == -ESRCH) + if (!err) { + if (!xfrm_pol_hold_rcu(pol)) + goto again; + } else if (err == -ESRCH) { pol = NULL; - else + } else { pol = ERR_PTR(err); + } } else pol = NULL; } -- cgit v1.2.3 From 6b226487815574193c1da864f2eac274781a2b0c Mon Sep 17 00:00:00 2001 From: Miroslav Urbanek Date: Mon, 21 Nov 2016 15:48:21 +0100 Subject: flowcache: Increase threshold for refusing new allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The threshold for OOM protection is too small for systems with a large number of CPUs. Applications report ENOBUFS on connect() every 10 minutes. The problem is that the variable net->xfrm.flow_cache_gc_count is a global counter while the variable fc->high_watermark is a per-CPU constant. Take the number of CPUs into account as well. Fixes: 6ad3122a08e3 ("flowcache: Avoid OOM condition under preasure") Reported-by: Lukáš Koldrt Tested-by: Jan Hejl Signed-off-by: Miroslav Urbanek Signed-off-by: Steffen Klassert --- net/core/flow.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/flow.c b/net/core/flow.c index 3937b1b68d5b..18e8893d4be5 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -95,7 +95,6 @@ static void flow_cache_gc_task(struct work_struct *work) list_for_each_entry_safe(fce, n, &gc_list, u.gc_list) { flow_entry_kill(fce, xfrm); atomic_dec(&xfrm->flow_cache_gc_count); - WARN_ON(atomic_read(&xfrm->flow_cache_gc_count) < 0); } } @@ -236,9 +235,8 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, if (fcp->hash_count > fc->high_watermark) flow_cache_shrink(fc, fcp); - if (fcp->hash_count > 2 * fc->high_watermark || - atomic_read(&net->xfrm.flow_cache_gc_count) > fc->high_watermark) { - atomic_inc(&net->xfrm.flow_cache_genid); + if (atomic_read(&net->xfrm.flow_cache_gc_count) > + 2 * num_online_cpus() * fc->high_watermark) { flo = ERR_PTR(-ENOBUFS); goto ret_object; } -- cgit v1.2.3 From 6d8b49c3a3a3e1f11b52edd3b9beb6693bb8061d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 Nov 2016 10:24:40 -0800 Subject: netfilter: Update ip_route_me_harder to consider L3 domain ip_route_me_harder is not considering the L3 domain and sending lookups to the wrong table.
For example consider the following output rule: iptables -I OUTPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset using perf to analyze lookups via the fib_table_lookup tracepoint shows: vrf-test 1187 [001] 46887.295927: fib:fib_table_lookup: table 255 oif 0 iif 0 src 0.0.0.0 dst 10.100.1.254 tos 0 scope 0 flags 0 ffffffff8143922c perf_trace_fib_table_lookup ([kernel.kallsyms]) ffffffff81493aac fib_table_lookup ([kernel.kallsyms]) ffffffff8148dda3 __inet_dev_addr_type ([kernel.kallsyms]) ffffffff8148ddf6 inet_addr_type ([kernel.kallsyms]) ffffffff8149e344 ip_route_me_harder ([kernel.kallsyms]) and vrf-test 1187 [001] 46887.295933: fib:fib_table_lookup: table 255 oif 0 iif 1 src 10.100.1.254 dst 10.100.1.2 tos 0 scope 0 flags ffffffff8143922c perf_trace_fib_table_lookup ([kernel.kallsyms]) ffffffff81493aac fib_table_lookup ([kernel.kallsyms]) ffffffff814998ff fib4_rule_action ([kernel.kallsyms]) ffffffff81437f35 fib_rules_lookup ([kernel.kallsyms]) ffffffff81499758 __fib_lookup ([kernel.kallsyms]) ffffffff8144f010 fib_lookup.constprop.34 ([kernel.kallsyms]) ffffffff8144f759 __ip_route_output_key_hash ([kernel.kallsyms]) ffffffff8144fc6a ip_route_output_flow ([kernel.kallsyms]) ffffffff8149e39b ip_route_me_harder ([kernel.kallsyms]) In both cases the lookups are directed to table 255 rather than the table associated with the device via the L3 domain. Update both lookups to pull the L3 domain from the dst currently attached to the skb. Signed-off-by: David Ahern Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index c3776ff6749f..b3cc1335adbc 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -24,10 +24,11 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; __u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; + struct net_device *dev = skb_dst(skb)->dev; unsigned int hh_len; if (addr_type == RTN_UNSPEC) - addr_type = inet_addr_type(net, saddr); + addr_type = inet_addr_type_dev_table(net, dev, saddr); if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST) flags |= FLOWI_FLAG_ANYSRC; else @@ -40,6 +41,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t fl4.saddr = saddr; fl4.flowi4_tos = RT_TOS(iph->tos); fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + if (!fl4.flowi4_oif) + fl4.flowi4_oif = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; fl4.flowi4_flags = flags; rt = ip_route_output_key(net, &fl4); -- cgit v1.2.3 From 00b4422fe363cc7cadc51c50c5a0c3c510f0fa14 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 9 Nov 2016 10:25:05 -0800 Subject: netfilter: Update nf_send_reset6 to consider L3 domain nf_send_reset6 is not considering the L3 domain and lookups are sent to the wrong table. 
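Both this fix and the ip_route_me_harder change above rely on the same idea: when the skb already carries a dst, the dst's device identifies the L3 domain (for example a VRF), so the flow's output interface must be pinned to the corresponding l3mdev master before the route lookup. The sketch below only illustrates that pattern and is not the exact patch; the helper name lookup_reply_route() is made up, and error handling is omitted.

#include <linux/skbuff.h>
#include <net/ip6_route.h>
#include <net/l3mdev.h>

/* Illustrative only: scope the route lookup for a generated reply
 * to the L3 domain of the packet that triggered it.
 */
static struct dst_entry *lookup_reply_route(struct net *net,
                                            const struct sk_buff *oldskb,
                                            struct flowi6 *fl6)
{
        /* The original packet's dst device tells us which L3 domain
         * (VRF) we are in; without this the lookup can end up in the
         * wrong table, e.g. the local table (255) seen in the traces.
         */
        fl6->flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);

        return ip6_route_output(net, NULL, fl6);
}

When no VRF is involved, l3mdev_master_ifindex() returns 0 and the lookup behaves exactly as before. The trace below shows the misdirected lookup this is meant to avoid.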
For example consider the following output rule: ip6tables -A OUTPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset using perf to analyze lookups via the fib6_table_lookup tracepoint shows: swapper 0 [001] 248.787816: fib6:fib6_table_lookup: table 255 oif 0 iif 1 src 2100:1::3 dst 2100:1: ffffffff81439cdc perf_trace_fib6_table_lookup ([kernel.kallsyms]) ffffffff814c1ce3 trace_fib6_table_lookup ([kernel.kallsyms]) ffffffff814c3e89 ip6_pol_route ([kernel.kallsyms]) ffffffff814c40d5 ip6_pol_route_output ([kernel.kallsyms]) ffffffff814e7b6f fib6_rule_action ([kernel.kallsyms]) ffffffff81437f60 fib_rules_lookup ([kernel.kallsyms]) ffffffff814e7c79 fib6_rule_lookup ([kernel.kallsyms]) ffffffff814c2541 ip6_route_output_flags ([kernel.kallsyms]) 528 nf_send_reset6 ([nf_reject_ipv6]) The lookup is directed to table 255 rather than the table associated with the device via the L3 domain. Update nf_send_reset6 to pull the L3 domain from the dst currently attached to the skb. Signed-off-by: David Ahern Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_reject_ipv6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index a5400223fd74..10090400c72f 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -156,6 +156,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.daddr = oip6h->saddr; fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; + fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { -- cgit v1.2.3 From abd66e9f3cc50c9c3ba4cf609749374090a2f215 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Mon, 14 Nov 2016 22:33:34 +0100 Subject: netfilter: nft_hash: validate maximum value of u32 netlink hash attribute Use the function nft_parse_u32_check() to fetch the value and validate the u32 attribute into the hash len u8 field. This patch revisits 4da449ae1df9 ("netfilter: nft_exthdr: Add size check on u8 nft_exthdr attributes"). Fixes: cb1b69b0b15b ("netfilter: nf_tables: add hash expression") Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_hash.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index baf694de3935..d5447a22275c 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -53,6 +53,7 @@ static int nft_hash_init(const struct nft_ctx *ctx, { struct nft_hash *priv = nft_expr_priv(expr); u32 len; + int err; if (!tb[NFTA_HASH_SREG] || !tb[NFTA_HASH_DREG] || @@ -67,8 +68,10 @@ static int nft_hash_init(const struct nft_ctx *ctx, priv->sreg = nft_parse_register(tb[NFTA_HASH_SREG]); priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); - len = ntohl(nla_get_be32(tb[NFTA_HASH_LEN])); - if (len == 0 || len > U8_MAX) + err = nft_parse_u32_check(tb[NFTA_HASH_LEN], U8_MAX, &len); + if (err < 0) + return err; + if (len == 0) return -ERANGE; priv->len = len; -- cgit v1.2.3 From 728e87b49605f7ee02c0415c8255d3d185a36154 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 16 Nov 2016 15:13:35 +0100 Subject: netfilter: nat: fix cmp return value The comparator works like memcmp, i.e. 0 means objects are equal. In other words, when objects are distinct they are treated as identical, and when they are identical they are allegedly distinct.
The first case is rare (distinct objects are unlikely to get hashed to same bucket). The second case results in unneeded port conflict resolutions attempts. Fixes: 870190a9ec907 ("netfilter: nat: convert nat bysrc hash to rhashtable") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index bbb8f3df79f7..c632429706eb 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -193,9 +193,12 @@ static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg, const struct nf_nat_conn_key *key = arg->key; const struct nf_conn *ct = obj; - return same_src(ct, key->tuple) && - net_eq(nf_ct_net(ct), key->net) && - nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL); + if (!same_src(ct, key->tuple) || + !net_eq(nf_ct_net(ct), key->net) || + !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL)) + return 1; + + return 0; } static struct rhashtable_params nf_nat_bysource_params = { -- cgit v1.2.3 From 7223ecd4669921cb2a709193521967aaa2b06862 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 16 Nov 2016 15:13:36 +0100 Subject: netfilter: nat: switch to new rhlist interface I got offlist bug report about failing connections and high cpu usage. This happens because we hit 'elasticity' checks in rhashtable that refuses bucket list exceeding 16 entries. The nat bysrc hash unfortunately needs to insert distinct objects that share same key and are identical (have same source tuple), this cannot be avoided. Switch to the rhlist interface which is designed for this. The nulls_base is removed here, I don't think its needed: A (unlikely) false positive results in unneeded port clash resolution, a false negative results in packet drop during conntrack confirmation, when we try to insert the duplicate into main conntrack hash table. Tested by adding multiple ip addresses to host, then adding iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE ... and then creating multiple connections, from same source port but different addresses: for i in $(seq 2000 2032);do nc -p 1234 192.168.7.1 $i > /dev/null & done (all of these then get hashed to same bysource slot) Then, to test that nat conflict resultion is working: nc -s 10.0.0.1 -p 1234 192.168.7.1 2000 nc -s 10.0.0.2 -p 1234 192.168.7.1 2000 tcp .. src=10.0.0.1 dst=192.168.7.1 sport=1234 dport=2000 src=192.168.7.1 dst=192.168.7.10 sport=2000 dport=1024 [ASSURED] tcp .. src=10.0.0.2 dst=192.168.7.1 sport=1234 dport=2000 src=192.168.7.1 dst=192.168.7.10 sport=2000 dport=1025 [ASSURED] tcp .. src=192.168.7.10 dst=192.168.7.1 sport=1234 dport=2000 src=192.168.7.1 dst=192.168.7.10 sport=2000 dport=1234 [ASSURED] tcp .. src=192.168.7.10 dst=192.168.7.1 sport=1234 dport=2001 src=192.168.7.1 dst=192.168.7.10 sport=2001 dport=1234 [ASSURED] [..] -> nat altered source ports to 1024 and 1025, respectively. 
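(The destination-side socket listing that confirms this continues below.) For readers not familiar with the rhlist interface mentioned above: unlike a plain rhashtable, an rhltable accepts several objects that share one key and lets a lookup walk all of them. The following is a rough sketch of that usage, assuming the rhlist API as introduced around this kernel version; the struct and field names are illustrative, not the nf_nat ones.

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/rhashtable.h>

struct mapping {
        u32                     srckey; /* key shared by duplicate entries */
        struct rhlist_head      node;
};

static const struct rhashtable_params mapping_params = {
        .key_len        = sizeof(u32),
        .key_offset     = offsetof(struct mapping, srckey),
        .head_offset    = offsetof(struct mapping, node),
};

/* Walk every entry inserted with @key; caller holds rcu_read_lock(). */
static void walk_duplicates(struct rhltable *hlt, const u32 *key)
{
        struct rhlist_head *list, *pos;
        struct mapping *m;

        list = rhltable_lookup(hlt, key, mapping_params);

        rhl_for_each_entry_rcu(m, pos, list, node)
                pr_debug("mapping %p shares srckey %u\n", m, m->srckey);
}

On the insert side, rhltable_insert_key() (used in the diff below) chains a new rhlist_head onto any existing entry with the same key instead of growing the bucket, which is why duplicates no longer trip the elasticity check described above.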
This can also be confirmed on destination host which shows ESTAB 0 0 192.168.7.1:2000 192.168.7.10:1024 ESTAB 0 0 192.168.7.1:2000 192.168.7.10:1025 ESTAB 0 0 192.168.7.1:2000 192.168.7.10:1234 Cc: Herbert Xu Fixes: 870190a9ec907 ("netfilter: nat: convert nat bysrc hash to rhashtable") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index c632429706eb..5b9c884a452e 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -42,7 +42,7 @@ struct nf_nat_conn_key { const struct nf_conntrack_zone *zone; }; -static struct rhashtable nf_nat_bysource_table; +static struct rhltable nf_nat_bysource_table; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -207,7 +207,6 @@ static struct rhashtable_params nf_nat_bysource_params = { .obj_cmpfn = nf_nat_bysource_cmp, .nelem_hint = 256, .min_size = 1024, - .nulls_base = (1U << RHT_BASE_SHIFT), }; /* Only called for SRC manip */ @@ -226,12 +225,15 @@ find_appropriate_src(struct net *net, .tuple = tuple, .zone = zone }; + struct rhlist_head *hl; - ct = rhashtable_lookup_fast(&nf_nat_bysource_table, &key, - nf_nat_bysource_params); - if (!ct) + hl = rhltable_lookup(&nf_nat_bysource_table, &key, + nf_nat_bysource_params); + if (!hl) return 0; + ct = container_of(hl, typeof(*ct), nat_bysource); + nf_ct_invert_tuplepr(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; @@ -449,11 +451,17 @@ nf_nat_setup_info(struct nf_conn *ct, } if (maniptype == NF_NAT_MANIP_SRC) { + struct nf_nat_conn_key key = { + .net = nf_ct_net(ct), + .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + .zone = nf_ct_zone(ct), + }; int err; - err = rhashtable_insert_fast(&nf_nat_bysource_table, - &ct->nat_bysource, - nf_nat_bysource_params); + err = rhltable_insert_key(&nf_nat_bysource_table, + &key, + &ct->nat_bysource, + nf_nat_bysource_params); if (err) return NF_DROP; } @@ -570,8 +578,8 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data) * will delete entry from already-freed table. */ ct->status &= ~IPS_NAT_DONE_MASK; - rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. 
@@ -701,8 +709,8 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (!nat) return; - rhashtable_remove_fast(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -837,13 +845,13 @@ static int __init nf_nat_init(void) { int ret; - ret = rhashtable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); + ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params); if (ret) return ret; ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { - rhashtable_destroy(&nf_nat_bysource_table); + rhltable_destroy(&nf_nat_bysource_table); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -867,7 +875,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: - rhashtable_destroy(&nf_nat_bysource_table); + rhltable_destroy(&nf_nat_bysource_table); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -886,7 +894,7 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); - rhashtable_destroy(&nf_nat_bysource_table); + rhltable_destroy(&nf_nat_bysource_table); } MODULE_LICENSE("GPL"); -- cgit v1.2.3 From d3e2a1110cae6ee5eeb1f9a97addf03e974f12e6 Mon Sep 17 00:00:00 2001 From: "Anders K. Pedersen" Date: Sun, 20 Nov 2016 16:38:47 +0000 Subject: netfilter: nf_tables: fix inconsistent element expiration calculation As Liping Zhang reports, after commit a8b1e36d0d1d ("netfilter: nft_dynset: fix element timeout for HZ != 1000"), priv->timeout was stored in jiffies, while set->timeout was stored in milliseconds. This is inconsistent and incorrect. Firstly, we already call msecs_to_jiffies in nft_set_elem_init, so priv->timeout will be converted to jiffies twice. Secondly, if the user did not specify the NFTA_DYNSET_TIMEOUT attr, set->timeout will be used, but we forget to call msecs_to_jiffies when do update elements. Fix this by using jiffies internally for traditional sets and doing the conversions to/from msec when interacting with userspace - as dynset already does. This is preferable to doing the conversions, when elements are inserted or updated, because this can happen very frequently on busy dynsets. Fixes: a8b1e36d0d1d ("netfilter: nft_dynset: fix element timeout for HZ != 1000") Reported-by: Liping Zhang Signed-off-by: Anders K. 
Pedersen Acked-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 026581b04ea8..e5194f6f906c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2570,7 +2570,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, } if (set->timeout && - nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout), + nla_put_be64(skb, NFTA_SET_TIMEOUT, + cpu_to_be64(jiffies_to_msecs(set->timeout)), NFTA_SET_PAD)) goto nla_put_failure; if (set->gc_int && @@ -2859,7 +2860,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (nla[NFTA_SET_TIMEOUT] != NULL) { if (!(flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_TIMEOUT])); + timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( + nla[NFTA_SET_TIMEOUT]))); } gc_int = 0; if (nla[NFTA_SET_GC_INTERVAL] != NULL) { @@ -3178,7 +3180,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - cpu_to_be64(*nft_set_ext_timeout(ext)), + cpu_to_be64(jiffies_to_msecs( + *nft_set_ext_timeout(ext))), NFTA_SET_ELEM_PAD)) goto nla_put_failure; @@ -3447,7 +3450,7 @@ void *nft_set_elem_init(const struct nft_set *set, memcpy(nft_set_ext_data(ext), data, set->dlen); if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) *nft_set_ext_expiration(ext) = - jiffies + msecs_to_jiffies(timeout); + jiffies + timeout; if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) *nft_set_ext_timeout(ext) = timeout; @@ -3535,7 +3538,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) return -EINVAL; - timeout = be64_to_cpu(nla_get_be64(nla[NFTA_SET_ELEM_TIMEOUT])); + timeout = msecs_to_jiffies(be64_to_cpu(nla_get_be64( + nla[NFTA_SET_ELEM_TIMEOUT]))); } else if (set->flags & NFT_SET_TIMEOUT) { timeout = set->timeout; } -- cgit v1.2.3 From 49cdc4c74918a5576cb93b679629714d8a9ef399 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Mon, 21 Nov 2016 21:18:23 +0800 Subject: netfilter: nft_range: add the missing NULL pointer check Otherwise, kernel panic will happen if the user does not specify the related attributes. Fixes: 0f3cd9b36977 ("netfilter: nf_tables: add range expression") Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_range.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index fbc88009ca2e..8f0aaaea1376 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -59,6 +59,12 @@ static int nft_range_init(const struct nft_ctx *ctx, const struct nft_expr *expr int err; u32 op; + if (!tb[NFTA_RANGE_SREG] || + !tb[NFTA_RANGE_OP] || + !tb[NFTA_RANGE_FROM_DATA] || + !tb[NFTA_RANGE_TO_DATA]) + return -EINVAL; + err = nft_data_init(NULL, &priv->data_from, sizeof(priv->data_from), &desc_from, tb[NFTA_RANGE_FROM_DATA]); if (err < 0) -- cgit v1.2.3 From fd05d7b18cec1af043990c4b3aabc6780575375c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 24 Nov 2016 19:21:27 +0100 Subject: net: dsa: fix fixed-link-phy device leaks Make sure to drop the reference taken by of_phy_find_device() when registering and deregistering the fixed-link PHY-device. 
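The rule behind this fix: of_phy_find_device() is a bus_find_device()-style lookup and hands back the PHY with a device reference held, so every successful call must eventually be balanced by put_device() on &phydev->mdio.dev. A hedged sketch of the teardown ordering for a fixed-link PHY, independent of the DSA specifics (the helper name is made up):

#include <linux/of.h>
#include <linux/of_mdio.h>
#include <linux/phy.h>
#include <linux/phy_fixed.h>

/* Illustrative teardown for a PHY registered earlier with
 * of_phy_register_fixed_link(); not the exact DSA code.
 */
static void example_fixed_link_teardown(struct device_node *port_dn)
{
        struct phy_device *phydev;

        if (!of_phy_is_fixed_link(port_dn))
                return;

        phydev = of_phy_find_device(port_dn);   /* takes a reference */
        if (!phydev)
                return;

        fixed_phy_unregister(phydev);
        put_device(&phydev->mdio.dev);          /* drop the lookup reference */
        phy_device_free(phydev);                /* release the device itself */
}

The of_mdio helper introduced at the end of this series, of_phy_deregister_fixed_link(), wraps this same sequence so drivers stop open-coding it.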
Fixes: 39b0c705195e ("net: dsa: Allow configuration of CPU & DSA port speeds/duplex") Signed-off-by: Johan Hovold Signed-off-by: David S. Miller --- net/dsa/dsa.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index a6902c1e2f28..cb0091b99592 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -233,6 +233,8 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, genphy_read_status(phydev); if (ds->ops->adjust_link) ds->ops->adjust_link(ds, port, phydev); + + put_device(&phydev->mdio.dev); } return 0; @@ -509,8 +511,9 @@ void dsa_cpu_dsa_destroy(struct device_node *port_dn) if (of_phy_is_fixed_link(port_dn)) { phydev = of_phy_find_device(port_dn); if (phydev) { - phy_device_free(phydev); fixed_phy_unregister(phydev); + put_device(&phydev->mdio.dev); + phy_device_free(phydev); } } } -- cgit v1.2.3 From 9590112241baff6f9d0e751f9c8ecacbe591417a Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Fri, 25 Nov 2016 10:35:02 -0500 Subject: tipc: fix link statistics counter errors In commit e4bf4f76962b ("tipc: simplify packet sequence number handling") we changed the internal representation of the packet sequence number counters from u32 to u16, reflecting what is really sent over the wire. Since then some link statistics counters have been displaying incorrect values, partially because the counters meant to be used as sequence number snapshots are now used as direct counters, stored as u32, and partially because some counter updates are just missing in the code. In this commit we correct this in two ways. First, we base the displayed packet sent/received values on direct counters instead of as previously a calculated difference between current sequence number and a snapshot. Second, we add the missing updates of the counters. This change is compatible with the current netlink API, and requires no changes to the user space tools. Signed-off-by: Jon Maloy Signed-off-by: David S. 
Miller --- net/tipc/link.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index ecc12411155e..bda89bf9f4ff 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -47,8 +47,8 @@ #include struct tipc_stats { - u32 sent_info; /* used in counting # sent packets */ - u32 recv_info; /* used in counting # recv'd packets */ + u32 sent_pkts; + u32 recv_pkts; u32 sent_states; u32 recv_states; u32 sent_probes; @@ -857,7 +857,6 @@ void tipc_link_reset(struct tipc_link *l) l->acked = 0; l->silent_intv_cnt = 0; l->rst_cnt = 0; - l->stats.recv_info = 0; l->stale_count = 0; l->bc_peer_is_up = false; memset(&l->mon_state, 0, sizeof(l->mon_state)); @@ -888,6 +887,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *transmq = &l->transmq; struct sk_buff_head *backlogq = &l->backlogq; struct sk_buff *skb, *_skb, *bskb; + int pkt_cnt = skb_queue_len(list); /* Match msg importance against this and all higher backlog limits: */ if (!skb_queue_empty(backlogq)) { @@ -901,6 +901,11 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return -EMSGSIZE; } + if (pkt_cnt > 1) { + l->stats.sent_fragmented++; + l->stats.sent_fragments += pkt_cnt; + } + /* Prepare each packet for sending, and add to relevant queue: */ while (skb_queue_len(list)) { skb = skb_peek(list); @@ -920,6 +925,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; l->rcv_unacked = 0; + l->stats.sent_pkts++; seqno++; continue; } @@ -968,6 +974,7 @@ void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) msg_set_ack(hdr, ack); msg_set_bcast_ack(hdr, bc_ack); l->rcv_unacked = 0; + l->stats.sent_pkts++; seqno++; } l->snd_nxt = seqno; @@ -1260,7 +1267,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Deliver packet */ l->rcv_nxt++; - l->stats.recv_info++; + l->stats.recv_pkts++; if (!tipc_data_input(l, skb, l->inputq)) rc |= tipc_link_input(l, skb, l->inputq); if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) @@ -1800,10 +1807,6 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) void tipc_link_reset_stats(struct tipc_link *l) { memset(&l->stats, 0, sizeof(l->stats)); - if (!link_is_bc_sndlink(l)) { - l->stats.sent_info = l->snd_nxt; - l->stats.recv_info = l->rcv_nxt; - } } static void link_print(struct tipc_link *l, const char *str) @@ -1867,12 +1870,12 @@ static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s) }; struct nla_map map[] = { - {TIPC_NLA_STATS_RX_INFO, s->recv_info}, + {TIPC_NLA_STATS_RX_INFO, 0}, {TIPC_NLA_STATS_RX_FRAGMENTS, s->recv_fragments}, {TIPC_NLA_STATS_RX_FRAGMENTED, s->recv_fragmented}, {TIPC_NLA_STATS_RX_BUNDLES, s->recv_bundles}, {TIPC_NLA_STATS_RX_BUNDLED, s->recv_bundled}, - {TIPC_NLA_STATS_TX_INFO, s->sent_info}, + {TIPC_NLA_STATS_TX_INFO, 0}, {TIPC_NLA_STATS_TX_FRAGMENTS, s->sent_fragments}, {TIPC_NLA_STATS_TX_FRAGMENTED, s->sent_fragmented}, {TIPC_NLA_STATS_TX_BUNDLES, s->sent_bundles}, @@ -1947,9 +1950,9 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->rcv_nxt)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->stats.recv_pkts)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->snd_nxt)) + if 
(nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->stats.sent_pkts)) goto attr_msg_full; if (tipc_link_is_up(link)) @@ -2004,12 +2007,12 @@ static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb, }; struct nla_map map[] = { - {TIPC_NLA_STATS_RX_INFO, stats->recv_info}, + {TIPC_NLA_STATS_RX_INFO, stats->recv_pkts}, {TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments}, {TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented}, {TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles}, {TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled}, - {TIPC_NLA_STATS_TX_INFO, stats->sent_info}, + {TIPC_NLA_STATS_TX_INFO, stats->sent_pkts}, {TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments}, {TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented}, {TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles}, @@ -2076,9 +2079,9 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, 0)) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt)) + if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, 0)) goto attr_msg_full; prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP); -- cgit v1.2.3 From d936377414fadbafb4d17148d222fe45ca5442d4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 27 Nov 2016 01:18:01 +0100 Subject: net, sched: respect rcu grace period on cls destruction Roi reported a crash in flower where tp->root was NULL in ->classify() callbacks. Reason is that in ->destroy() tp->root is set to NULL via RCU_INIT_POINTER(). It's problematic for some of the classifiers, because this doesn't respect RCU grace period for them, and as a result, still outstanding readers from tc_classify() will try to blindly dereference a NULL tp->root. The tp->root object is strictly private to the classifier implementation and holds internal data the core such as tc_ctl_tfilter() doesn't know about. Within some classifiers, such as cls_bpf, cls_basic, etc, tp->root is only checked for NULL in ->get() callback, but nowhere else. This is misleading and seemed to be copied from old classifier code that was not cleaned up properly. For example, d3fa76ee6b4a ("[NET_SCHED]: cls_basic: fix NULL pointer dereference") moved tp->root initialization into ->init() routine, where before it was part of ->change(), so ->get() had to deal with tp->root being NULL back then, so that was indeed a valid case, after d3fa76ee6b4a, not really anymore. We used to set tp->root to NULL long ago in ->destroy(), see 47a1a1d4be29 ("pkt_sched: remove unnecessary xchg() in packet classifiers"); but the NULLifying was reintroduced with the RCUification, but it's not correct for every classifier implementation. In the cases that are fixed here with one exception of cls_cgroup, tp->root object is allocated and initialized inside ->init() callback, which is always performed at a point in time after we allocate a new tp, which means tp and thus tp->root was not globally visible in the tp chain yet (see tc_ctl_tfilter()). Also, on destruction tp->root is strictly kfree_rcu()'ed in ->destroy() handler, same for the tp which is kfree_rcu()'ed right when we return from ->destroy() in tcf_destroy(). This means, the head object's lifetime for such classifiers is always tied to the tp lifetime. The RCU callback invocation for the two kfree_rcu() could be out of order, but that's fine since both are independent. 
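Reduced to its core, the lifetime rule being restored is the standard RCU pattern: the classify fast path dereferences tp->root inside a read-side critical section, and the destroy path defers freeing past a grace period instead of clearing the pointer underneath the readers. A condensed sketch with made-up names, not any particular classifier:

#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>

struct example_head {
        int                     some_state;
        struct rcu_head         rcu;
};

struct example_tp {
        struct example_head __rcu *root;
};

/* Fast path; caller holds rcu_read_lock(). */
static int example_classify(struct example_tp *tp)
{
        struct example_head *head = rcu_dereference(tp->root);

        /* head remains valid for the whole read-side critical
         * section even if a destroy is running concurrently.
         */
        return head->some_state;
}

/* Control path; runs under RTNL. */
static void example_destroy(struct example_tp *tp)
{
        struct example_head *head = rtnl_dereference(tp->root);

        /* Do not NULL tp->root here: outstanding readers may still
         * dereference it.  Freeing is simply deferred past the grace
         * period; the core frees tp itself the same way.
         */
        kfree_rcu(head, rcu);
}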
Dropping the RCU_INIT_POINTER(tp->root, NULL) for these classifiers here means that 1) we don't need a useless NULL check in fast-path and, 2) that outstanding readers of that tp in tc_classify() can still execute under respect with RCU grace period as it is actually expected. Things that haven't been touched here: cls_fw and cls_route. They each handle tp->root being NULL in ->classify() path for historic reasons, so their ->destroy() implementation can stay as is. If someone actually cares, they could get cleaned up at some point to avoid the test in fast path. cls_u32 doesn't set tp->root to NULL. For cls_rsvp, I just added a !head should anyone actually be using/testing it, so it at least aligns with cls_fw and cls_route. For cls_flower we additionally need to defer rhashtable destruction (to a sleepable context) after RCU grace period as concurrent readers might still access it. (Note that in this case we need to hold module reference to keep work callback address intact, since we only wait on module unload for all call_rcu()s to finish.) This fixes one race to bring RCU grace period guarantees back. Next step as worked on by Cong however is to fix 1e052be69d04 ("net_sched: destroy proto tp when all filters are gone") to get the order of unlinking the tp in tc_ctl_tfilter() for the RTM_DELTFILTER case right by moving RCU_INIT_POINTER() before tcf_destroy() and let the notification for removal be done through the prior ->delete() callback. Both are independant issues. Once we have that right, we can then clean tp->root up for a number of classifiers by not making them RCU pointers, which requires a new callback (->uninit) that is triggered from tp's RCU callback, where we just kfree() tp->root from there. Fixes: 1f947bf151e9 ("net: sched: rcu'ify cls_bpf") Fixes: 9888faefe132 ("net: sched: cls_basic use RCU") Fixes: 70da9f0bf999 ("net: sched: cls_flow use RCU") Fixes: 77b9900ef53a ("tc: introduce Flower classifier") Fixes: bf3994d2ed31 ("net/sched: introduce Match-all classifier") Fixes: 952313bd6258 ("net: sched: cls_cgroup use RCU") Reported-by: Roi Dayan Signed-off-by: Daniel Borkmann Cc: Cong Wang Cc: John Fastabend Cc: Roi Dayan Cc: Jiri Pirko Acked-by: John Fastabend Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/sched/cls_basic.c | 4 ---- net/sched/cls_bpf.c | 4 ---- net/sched/cls_cgroup.c | 7 +++---- net/sched/cls_flow.c | 1 - net/sched/cls_flower.c | 31 ++++++++++++++++++++++++++----- net/sched/cls_matchall.c | 1 - net/sched/cls_rsvp.h | 3 ++- net/sched/cls_tcindex.c | 1 - 8 files changed, 31 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index eb219b78cd49..5877f6061b57 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -62,9 +62,6 @@ static unsigned long basic_get(struct tcf_proto *tp, u32 handle) struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f; - if (head == NULL) - return 0UL; - list_for_each_entry(f, &head->flist, link) { if (f->handle == handle) { l = (unsigned long) f; @@ -109,7 +106,6 @@ static bool basic_destroy(struct tcf_proto *tp, bool force) tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, basic_delete_filter); } - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index bb1d5a487081..0a47ba5e6109 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -292,7 +292,6 @@ static bool cls_bpf_destroy(struct tcf_proto *tp, bool force) call_rcu(&prog->rcu, __cls_bpf_delete_prog); } - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; } @@ -303,9 +302,6 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) struct cls_bpf_prog *prog; unsigned long ret = 0UL; - if (head == NULL) - return 0UL; - list_for_each_entry(prog, &head->plist, link) { if (prog->handle == handle) { ret = (unsigned long) prog; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 85233c470035..c1f20077837f 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -137,11 +137,10 @@ static bool cls_cgroup_destroy(struct tcf_proto *tp, bool force) if (!force) return false; - - if (head) { - RCU_INIT_POINTER(tp->root, NULL); + /* Head can still be NULL due to cls_cgroup_init(). 
*/ + if (head) call_rcu(&head->rcu, cls_cgroup_destroy_rcu); - } + return true; } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index e39672394c7b..6575aba87630 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -596,7 +596,6 @@ static bool flow_destroy(struct tcf_proto *tp, bool force) list_del_rcu(&f->list); call_rcu(&f->rcu, flow_destroy_filter); } - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f6f40fba599b..b296f3991ab2 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -64,7 +65,10 @@ struct cls_fl_head { bool mask_assigned; struct list_head filters; struct rhashtable_params ht_params; - struct rcu_head rcu; + union { + struct work_struct work; + struct rcu_head rcu; + }; }; struct cls_fl_filter { @@ -269,6 +273,24 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); } +static void fl_destroy_sleepable(struct work_struct *work) +{ + struct cls_fl_head *head = container_of(work, struct cls_fl_head, + work); + if (head->mask_assigned) + rhashtable_destroy(&head->ht); + kfree(head); + module_put(THIS_MODULE); +} + +static void fl_destroy_rcu(struct rcu_head *rcu) +{ + struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu); + + INIT_WORK(&head->work, fl_destroy_sleepable); + schedule_work(&head->work); +} + static bool fl_destroy(struct tcf_proto *tp, bool force) { struct cls_fl_head *head = rtnl_dereference(tp->root); @@ -282,10 +304,9 @@ static bool fl_destroy(struct tcf_proto *tp, bool force) list_del_rcu(&f->list); call_rcu(&f->rcu, fl_destroy_filter); } - RCU_INIT_POINTER(tp->root, NULL); - if (head->mask_assigned) - rhashtable_destroy(&head->ht); - kfree_rcu(head, rcu); + + __module_get(THIS_MODULE); + call_rcu(&head->rcu, fl_destroy_rcu); return true; } diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 25927b6c4436..f935429bd5ef 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -114,7 +114,6 @@ static bool mall_destroy(struct tcf_proto *tp, bool force) call_rcu(&f->rcu, mall_destroy_filter); } - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); return true; } diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 4f05a19fb073..322438fb3ffc 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -152,7 +152,8 @@ static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; nhptr = ip_hdr(skb); #endif - + if (unlikely(!head)) + return -1; restart: #if RSVP_DST_LEN == 4 diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 96144bdf30db..0751245a6ace 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -543,7 +543,6 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force) walker.fn = tcindex_destroy_element; tcindex_walk(tp, &walker); - RCU_INIT_POINTER(tp->root, NULL); call_rcu(&p->rcu, __tcindex_destroy); return true; } -- cgit v1.2.3 From 4df21dfcf2291865cf673ac786a81c7a3f7afcf5 Mon Sep 17 00:00:00 2001 From: Julian Wollrath Date: Fri, 25 Nov 2016 15:05:26 +0100 Subject: tcp: Set DEFAULT_TCP_CONG to bbr if DEFAULT_BBR is set Signed-off-by: Julian Wollrath Signed-off-by: David S. 
Miller --- net/ipv4/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 300b06888fdf..b54b3ca939db 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -715,6 +715,7 @@ config DEFAULT_TCP_CONG default "reno" if DEFAULT_RENO default "dctcp" if DEFAULT_DCTCP default "cdg" if DEFAULT_CDG + default "bbr" if DEFAULT_BBR default "cubic" config TCP_MD5SIG -- cgit v1.2.3 From 79dc7e3f1cd323be4c81aa1a94faa1b3ed987fb2 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 27 Nov 2016 18:52:53 -0800 Subject: net: handle no dst on skb in icmp6_send Andrey reported the following while fuzzing the kernel with syzkaller: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 0 PID: 3859 Comm: a.out Not tainted 4.9.0-rc6+ #429 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff8800666d4200 task.stack: ffff880067348000 RIP: 0010:[] [] icmp6_send+0x5fc/0x1e30 net/ipv6/icmp.c:451 RSP: 0018:ffff88006734f2c0 EFLAGS: 00010206 RAX: ffff8800666d4200 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: dffffc0000000000 RDI: 0000000000000018 RBP: ffff88006734f630 R08: ffff880064138418 R09: 0000000000000003 R10: dffffc0000000000 R11: 0000000000000005 R12: 0000000000000000 R13: ffffffff84e7e200 R14: ffff880064138484 R15: ffff8800641383c0 FS: 00007fb3887a07c0(0000) GS:ffff88006cc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000000 CR3: 000000006b040000 CR4: 00000000000006f0 Stack: ffff8800666d4200 ffff8800666d49f8 ffff8800666d4200 ffffffff84c02460 ffff8800666d4a1a 1ffff1000ccdaa2f ffff88006734f498 0000000000000046 ffff88006734f440 ffffffff832f4269 ffff880064ba7456 0000000000000000 Call Trace: [] icmpv6_param_prob+0x2c/0x40 net/ipv6/icmp.c:557 [< inline >] ip6_tlvopt_unknown net/ipv6/exthdrs.c:88 [] ip6_parse_tlv+0x555/0x670 net/ipv6/exthdrs.c:157 [] ipv6_parse_hopopts+0x199/0x460 net/ipv6/exthdrs.c:663 [] ipv6_rcv+0xfa3/0x1dc0 net/ipv6/ip6_input.c:191 ... icmp6_send / icmpv6_send is invoked for both rx and tx paths. In both cases the dst->dev should be preferred for determining the L3 domain if the dst has been set on the skb. Fallback to the skb->dev if it has not. This covers the case reported here where icmp6_send is invoked on Rx before the route lookup. Fixes: 5d41ce29e ("net: icmp6_send should use dst dev to determine L3 domain") Reported-by: Andrey Konovalov Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 7370ad2e693a..2772004ba5a1 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -447,8 +447,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; - else - iif = l3mdev_master_ifindex(skb_dst(skb)->dev); + else { + dst = skb_dst(skb); + iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev); + } /* * Must not send error if the source does not uniquely -- cgit v1.2.3 From 7a99cd6e213685b78118382e6a8fed506c82ccb2 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Mon, 28 Nov 2016 09:48:48 +0300 Subject: net: dsa: fix unbalanced dsa_switch_tree reference counting _dsa_register_switch() gets a dsa_switch_tree object either via dsa_get_dst() or via dsa_add_dst(). 
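As the rest of this message explains, only one of those two paths hands the caller a reference. The convention a lookup such as dsa_get_dst() has to follow is the usual kref one: take the reference while the object is known to be alive, then return it, so the caller's later put is always balanced. A generic sketch, with illustrative names rather than the actual DSA structures:

#include <linux/kref.h>
#include <linux/list.h>
#include <linux/types.h>

struct tree {
        u32                     index;
        struct kref             refcount;
        struct list_head        list;
};

static LIST_HEAD(tree_list);    /* protected by an outer lock, e.g. RTNL */

/* Lookup that transfers a reference: each successful return must be
 * balanced by a kref_put() from the caller.
 */
static struct tree *tree_get(u32 index)
{
        struct tree *t;

        list_for_each_entry(t, &tree_list, list)
                if (t->index == index) {
                        kref_get(&t->refcount);
                        return t;
                }

        return NULL;
}

With both the lookup and the allocation path returning a referenced object, the unconditional dsa_put_dst() later in _dsa_register_switch() is balanced in every case.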
Former path does not increase kref in returned object (resulting into caller not owning a reference), while later path does create a new object (resulting into caller owning a reference). The rest of _dsa_register_switch() assumes that it owns a reference, and calls dsa_put_dst(). This causes a memory breakage if first switch in the tree initialized successfully, but second failed to initialize. In particular, freed dsa_swith_tree object is left referenced by switch that was initialized, and later access to sysfs attributes of that switch cause OOPS. To fix, need to add kref_get() call to dsa_get_dst(). Fixes: 83c0afaec7b7 ("net: dsa: Add new binding implementation") Signed-off-by: Nikita Yushchenko Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f8a7d9aab437..5fff951a0a49 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -28,8 +28,10 @@ static struct dsa_switch_tree *dsa_get_dst(u32 tree) struct dsa_switch_tree *dst; list_for_each_entry(dst, &dsa_switch_trees, list) - if (dst->tree == tree) + if (dst->tree == tree) { + kref_get(&dst->refcount); return dst; + } return NULL; } -- cgit v1.2.3 From 9b57da0630c9fd36ed7a20fc0f98dc82cc0777fa Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 29 Nov 2016 02:17:34 +0100 Subject: netfilter: ipv6: nf_defrag: drop mangled skb on ream error Dmitry Vyukov reported GPF in network stack that Andrey traced down to negative nh offset in nf_ct_frag6_queue(). Problem is that all network headers before fragment header are pulled. Normal ipv6 reassembly will drop the skb when errors occur further down the line. netfilter doesn't do this, and instead passed the original fragment along. That was also fine back when netfilter ipv6 defrag worked with cloned fragments, as the original, pristine fragment was passed on. So we either have to undo the pull op, or discard such fragments. Since they're malformed after all (e.g. overlapping fragment) it seems preferrable to just drop them. Same for temporary errors -- it doesn't make sense to accept (and perhaps forward!) only some fragments of same datagram. Fixes: 029f7f3b8701cc7ac ("netfilter: ipv6: nf_defrag: avoid/free clone operations") Reported-by: Dmitry Vyukov Debugged-by: Andrey Konovalov Diagnosed-by: Eric Dumazet Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++-- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index e4347aeb2e65..9948b5ce52da 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -576,11 +576,11 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) /* Jumbo payload inhibits frag. 
header */ if (ipv6_hdr(skb)->payload_len == 0) { pr_debug("payload len = 0\n"); - return -EINVAL; + return 0; } if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) - return -EINVAL; + return 0; if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) return -ENOMEM; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index f7aab5ab93a5..f06b0471f39f 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -69,7 +69,7 @@ static unsigned int ipv6_defrag(void *priv, if (err == -EINPROGRESS) return NF_STOLEN; - return NF_ACCEPT; + return err == 0 ? NF_ACCEPT : NF_DROP; } static struct nf_hook_ops ipv6_defrag_ops[] = { -- cgit v1.2.3 From 95c2027bfeda21a28eb245121e6a249f38d0788e Mon Sep 17 00:00:00 2001 From: Amir Vadai Date: Mon, 28 Nov 2016 12:56:40 +0200 Subject: net/sched: pedit: make sure that offset is valid Add a validation function to make sure offset is valid: 1. Not below skb head (could happen when offset is negative). 2. Validate both 'offset' and 'at'. Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- net/sched/act_pedit.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b54d56d4959b..cf9b2fe8eac6 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -108,6 +108,17 @@ static void tcf_pedit_cleanup(struct tc_action *a, int bind) kfree(keys); } +static bool offset_valid(struct sk_buff *skb, int offset) +{ + if (offset > 0 && offset > skb->len) + return false; + + if (offset < 0 && -offset > skb_headroom(skb)) + return false; + + return true; +} + static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { @@ -134,6 +145,11 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, if (tkey->offmask) { char *d, _d; + if (!offset_valid(skb, off + tkey->at)) { + pr_info("tc filter pedit 'at' offset %d out of bounds\n", + off + tkey->at); + goto bad; + } d = skb_header_pointer(skb, off + tkey->at, 1, &_d); if (!d) @@ -146,10 +162,10 @@ static int tcf_pedit(struct sk_buff *skb, const struct tc_action *a, " offset must be on 32 bit boundaries\n"); goto bad; } - if (offset > 0 && offset > skb->len) { - pr_info("tc filter pedit" - " offset %d can't exceed pkt length %d\n", - offset, skb->len); + + if (!offset_valid(skb, off + offset)) { + pr_info("tc filter pedit offset %d out of bounds\n", + offset); goto bad; } -- cgit v1.2.3 From 707693c8a498697aa8db240b93eb76ec62e30892 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 28 Nov 2016 19:22:12 +0800 Subject: netlink: Call cb->done from a worker thread The cb->done interface expects to be called in process context. This was broken by the netlink RCU conversion. This patch fixes it by adding a worker struct to make the cb->done call where necessary. Fixes: 21e4902aea80 ("netlink: Lockless lookup with RCU grace...") Reported-by: Subash Abhinov Kasiviswanathan Signed-off-by: Herbert Xu Acked-by: Cong Wang Signed-off-by: David S. 
Miller --- net/netlink/af_netlink.c | 27 +++++++++++++++++++++++---- net/netlink/af_netlink.h | 2 ++ 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 62bea4591054..602e5ebe9db3 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -322,14 +322,11 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) sk_mem_charge(sk, skb->truesize); } -static void netlink_sock_destruct(struct sock *sk) +static void __netlink_sock_destruct(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->cb_running) { - if (nlk->cb.done) - nlk->cb.done(&nlk->cb); - module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); } @@ -346,6 +343,28 @@ static void netlink_sock_destruct(struct sock *sk) WARN_ON(nlk_sk(sk)->groups); } +static void netlink_sock_destruct_work(struct work_struct *work) +{ + struct netlink_sock *nlk = container_of(work, struct netlink_sock, + work); + + nlk->cb.done(&nlk->cb); + __netlink_sock_destruct(&nlk->sk); +} + +static void netlink_sock_destruct(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->cb_running && nlk->cb.done) { + INIT_WORK(&nlk->work, netlink_sock_destruct_work); + schedule_work(&nlk->work); + return; + } + + __netlink_sock_destruct(sk); +} + /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 3cfd6cc60504..4fdb38318977 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -3,6 +3,7 @@ #include #include +#include #include #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) @@ -33,6 +34,7 @@ struct netlink_sock { struct rhash_head node; struct rcu_head rcu; + struct work_struct work; }; static inline struct netlink_sock *nlk_sk(struct sock *sk) -- cgit v1.2.3 From 648f0c28df282636c0c8a7a19ca3ce5fc80a39c3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 28 Nov 2016 06:26:49 -0800 Subject: net/dccp: fix use-after-free in dccp_invalid_packet pskb_may_pull() can reallocate skb->head, we need to reload dh pointer in dccp_invalid_packet() or risk use after free. Bug found by Andrey Konovalov using syzkaller. Signed-off-by: Eric Dumazet Reported-by: Andrey Konovalov Signed-off-by: David S. 
Miller --- net/dccp/ipv4.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b567c8725aea..edbe59d203ef 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -700,6 +700,7 @@ int dccp_invalid_packet(struct sk_buff *skb) { const struct dccp_hdr *dh; unsigned int cscov; + u8 dccph_doff; if (skb->pkt_type != PACKET_HOST) return 1; @@ -721,18 +722,19 @@ int dccp_invalid_packet(struct sk_buff *skb) /* * If P.Data Offset is too small for packet type, drop packet and return */ - if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { - DCCP_WARN("P.Data Offset(%u) too small\n", dh->dccph_doff); + dccph_doff = dh->dccph_doff; + if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { + DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff); return 1; } /* * If P.Data Offset is too too large for packet, drop packet and return */ - if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) { - DCCP_WARN("P.Data Offset(%u) too large\n", dh->dccph_doff); + if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) { + DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff); return 1; } - + dh = dccp_hdr(skb); /* * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet * has short sequence numbers), drop packet and return -- cgit v1.2.3 From 725cbb62e7ade1bb29aa21a902d74e72b42a0f3d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 28 Nov 2016 15:40:13 +0100 Subject: sched: cls_flower: remove from hashtable only in case skip sw flag is not set Be symmetric to hashtable insert and remove filter from hashtable only in case skip sw flag is not set. Fixes: e69985c67c33 ("net/sched: cls_flower: Introduce support in SKIP SW flag") Signed-off-by: Jiri Pirko Reviewed-by: Amir Vadai Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index b296f3991ab2..904442421db3 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -732,8 +732,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout; if (fold) { - rhashtable_remove_fast(&head->ht, &fold->ht_node, - head->ht_params); + if (!tc_skip_sw(fold->flags)) + rhashtable_remove_fast(&head->ht, &fold->ht_node, + head->ht_params); fl_hw_destroy_filter(tp, (unsigned long)fold); } @@ -760,8 +761,9 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg) struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = (struct cls_fl_filter *) arg; - rhashtable_remove_fast(&head->ht, &f->ht_node, - head->ht_params); + if (!tc_skip_sw(f->flags)) + rhashtable_remove_fast(&head->ht, &f->ht_node, + head->ht_params); list_del_rcu(&f->list); fl_hw_destroy_filter(tp, (unsigned long)f); tcf_unbind_filter(tp, &f->res); -- cgit v1.2.3 From a510887824171ad260cc4a2603396c6247fdd091 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 28 Nov 2016 12:36:58 -0300 Subject: GSO: Reload iph after pskb_may_pull As it may get stale and lead to use after free. Acked-by: Eric Dumazet Cc: Alexander Duyck Cc: Andrey Konovalov Fixes: cbc53e08a793 ("GSO: Add GSO type for fixed IPv4 ID") Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Alexander Duyck Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5ddf5cda07f4..215143246e4b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1233,7 +1233,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ - if (fixedid && !(iph->frag_off & htons(IP_DF))) + if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) goto out; } -- cgit v1.2.3 From 0d8f3c67151faaa80e332c254372dca58fb2a9d4 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 28 Nov 2016 19:24:54 +0100 Subject: net: dsa: slave: fix of-node leak and phy priority Make sure to drop the reference taken by of_parse_phandle() before returning from dsa_slave_phy_setup(). Note that this also modifies the PHY priority so that any fixed-link node is only parsed when no phy-handle is given, which is in accordance with the common scheme for this. Fixes: 0d8bcdd383b8 ("net: dsa: allow for more complex PHY setups") Signed-off-by: Johan Hovold Signed-off-by: David S. Miller --- net/dsa/slave.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 6b1282c006b1..2a5c20a13fe4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1125,7 +1125,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, p->phy_interface = mode; phy_dn = of_parse_phandle(port_dn, "phy-handle", 0); - if (of_phy_is_fixed_link(port_dn)) { + if (!phy_dn && of_phy_is_fixed_link(port_dn)) { /* In the case of a fixed PHY, the DT node associated * to the fixed PHY is the Port DT node */ @@ -1135,7 +1135,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, return ret; } phy_is_fixed = true; - phy_dn = port_dn; + phy_dn = of_node_get(port_dn); } if (ds->ops->get_phy_flags) @@ -1154,6 +1154,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, ret = dsa_slave_phy_connect(p, slave_dev, phy_id); if (ret) { netdev_err(slave_dev, "failed to connect to phy%d: %d\n", phy_id, ret); + of_node_put(phy_dn); return ret; } } else { @@ -1162,6 +1163,8 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, phy_flags, p->phy_interface); } + + of_node_put(phy_dn); } if (p->phy && phy_is_fixed) -- cgit v1.2.3 From 3f65047c853a2a5abcd8ac1984af3452b5df4ada Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 28 Nov 2016 19:24:55 +0100 Subject: of_mdio: add helper to deregister fixed-link PHYs Add helper to deregister fixed-link PHYs registered using of_phy_register_fixed_link(). Convert the two drivers that care to deregister their fixed-link PHYs to use the new helper, but note that most drivers currently fail to do so. Signed-off-by: Johan Hovold Signed-off-by: David S. Miller --- net/dsa/dsa.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index cb0091b99592..7899919cd9f0 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -506,16 +506,8 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, void dsa_cpu_dsa_destroy(struct device_node *port_dn) { - struct phy_device *phydev; - - if (of_phy_is_fixed_link(port_dn)) { - phydev = of_phy_find_device(port_dn); - if (phydev) { - fixed_phy_unregister(phydev); - put_device(&phydev->mdio.dev); - phy_device_free(phydev); - } - } + if (of_phy_is_fixed_link(port_dn)) + of_phy_deregister_fixed_link(port_dn); } static void dsa_switch_destroy