summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
authorPablo Neira Ayuso <pablo@netfilter.org>2016-09-25 23:23:57 +0200
committerPablo Neira Ayuso <pablo@netfilter.org>2016-09-25 23:34:19 +0200
commitf20fbc0717f9f007c94b2641134b19228d0ce9ed (patch)
tree1404248ebbec552a3fb7928b75322b65d74de1bd /net/ipv4
parent8cb2a7d5667ab9a9c2fdd356357b85b63b320901 (diff)
parentfe0acb5fcb7fe8cb3d68bbdb8459865c972d8f83 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Conflicts: net/netfilter/core.c net/netfilter/nf_tables_netdev.c Resolve two conflicts before pull request for David's net-next tree: 1) Between c73c24849011 ("netfilter: nf_tables_netdev: remove redundant ip_hdr assignment") from the net tree and commit ddc8b6027ad0 ("netfilter: introduce nft_set_pktinfo_{ipv4, ipv6}_validate()"). 2) Between e8bffe0cf964 ("net: Add _nf_(un)register_hooks symbols") and Aaron Conole's patches to replace list_head with single linked list. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig18
-rw-r--r--net/ipv4/Makefile3
-rw-r--r--net/ipv4/af_inet.c14
-rw-r--r--net/ipv4/devinet.c11
-rw-r--r--net/ipv4/fib_frontend.c3
-rw-r--r--net/ipv4/fib_rules.c3
-rw-r--r--net/ipv4/fib_semantics.c8
-rw-r--r--net/ipv4/fib_trie.c10
-rw-r--r--net/ipv4/gre_offload.c6
-rw-r--r--net/ipv4/inet_diag.c49
-rw-r--r--net/ipv4/ip_gre.c23
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_output.c13
-rw-r--r--net/ipv4/ip_sockglue.c7
-rw-r--r--net/ipv4/ip_tunnel.c76
-rw-r--r--net/ipv4/ip_tunnel_core.c2
-rw-r--r--net/ipv4/ip_vti.c15
-rw-r--r--net/ipv4/ipip.c35
-rw-r--r--net/ipv4/ipmr.c7
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c11
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c1
-rw-r--r--net/ipv4/raw.c6
-rw-r--r--net/ipv4/route.c34
-rw-r--r--net/ipv4/tcp.c61
-rw-r--r--net/ipv4/tcp_bbr.c896
-rw-r--r--net/ipv4/tcp_cdg.c12
-rw-r--r--net/ipv4/tcp_cong.c2
-rw-r--r--net/ipv4/tcp_fastopen.c2
-rw-r--r--net/ipv4/tcp_input.c495
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv4/tcp_minisocks.c6
-rw-r--r--net/ipv4/tcp_offload.c13
-rw-r--r--net/ipv4/tcp_output.c106
-rw-r--r--net/ipv4/tcp_rate.c186
-rw-r--r--net/ipv4/tcp_timer.c1
-rw-r--r--net/ipv4/tcp_yeah.c2
-rw-r--r--net/ipv4/udp.c6
-rw-r--r--net/ipv4/udp_diag.c14
-rw-r--r--net/ipv4/udp_offload.c6
-rw-r--r--net/ipv4/xfrm4_policy.c4
40 files changed, 1779 insertions, 395 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b49f6c..300b06888fdf 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -640,6 +640,21 @@ config TCP_CONG_CDG
D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+config TCP_CONG_BBR
+ tristate "BBR TCP"
+ default n
+ ---help---
+
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the the bottleneck delivery rate and path round-trip
+ propagation delay. It tolerates packet loss and delay unrelated to
+ congestion. It can operate over LAN, WAN, cellular, wifi, or cable
+ modem links. It can coexist with flows that use loss-based congestion
+ control, and can operate with shallow buffers, deep buffers,
+ bufferbloat, policers, or AQM schemes that do not provide a delay
+ signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+
choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -674,6 +689,9 @@ choice
config DEFAULT_CDG
bool "CDG" if TCP_CONG_CDG=y
+ config DEFAULT_BBR
+ bool "BBR" if TCP_CONG_BBR=y
+
config DEFAULT_RENO
bool "Reno"
endchoice
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 24629b6f57cc..bc6a6c8b9bcd 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- tcp_recovery.o \
+ tcp_rate.o tcp_recovery.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
@@ -41,6 +41,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
+obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e94b47be0019..1effc986739e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1192,7 +1192,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header);
struct sk_buff *inet_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
- bool udpfrag = false, fixedid = false, encap;
+ bool udpfrag = false, fixedid = false, gso_partial, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
@@ -1245,6 +1245,8 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
if (IS_ERR_OR_NULL(segs))
goto out;
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
skb = segs;
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
@@ -1259,9 +1261,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
iph->id = htons(id);
id += skb_shinfo(skb)->gso_segs;
}
- tot_len = skb_shinfo(skb)->gso_size +
- SKB_GSO_CB(skb)->data_offset +
- skb->head - (unsigned char *)iph;
+
+ if (gso_partial)
+ tot_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)iph;
+ else
+ tot_len = skb->len - nhoff;
} else {
if (!fixedid)
iph->id = htons(id++);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 415e117967c7..062a67ca9a21 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2232,7 +2232,7 @@ static struct devinet_sysctl_table {
};
static int __devinet_sysctl_register(struct net *net, char *dev_name,
- struct ipv4_devconf *p)
+ int ifindex, struct ipv4_devconf *p)
{
int i;
struct devinet_sysctl_table *t;
@@ -2255,6 +2255,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
goto free;
p->sysctl = t;
+
+ inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p);
return 0;
free:
@@ -2286,7 +2288,7 @@ static int devinet_sysctl_register(struct in_device *idev)
if (err)
return err;
err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
- &idev->cnf);
+ idev->dev->ifindex, &idev->cnf);
if (err)
neigh_sysctl_unregister(idev->arp_parms);
return err;
@@ -2347,11 +2349,12 @@ static __net_init int devinet_init_net(struct net *net)
}
#ifdef CONFIG_SYSCTL
- err = __devinet_sysctl_register(net, "all", all);
+ err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all);
if (err < 0)
goto err_reg_all;
- err = __devinet_sysctl_register(net, "default", dflt);
+ err = __devinet_sysctl_register(net, "default",
+ NETCONFA_IFINDEX_DEFAULT, dflt);
if (err < 0)
goto err_reg_dflt;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 317c31939732..4e56a4c20a3c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -503,6 +503,7 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
if (!dev)
return -ENODEV;
cfg->fc_oif = dev->ifindex;
+ cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
struct in_ifaddr *ifa;
struct in_device *in_dev = __in_dev_get_rtnl(dev);
@@ -1021,7 +1022,7 @@ no_promotions:
* First of all, we scan fib_info list searching
* for stray nexthop entries, then ignite fib_flush.
*/
- if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
+ if (fib_sync_down_addr(dev, ifa->ifa_local))
fib_flush(dev_net(dev));
}
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 6e9ea69e5f75..770bebed6b28 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -56,6 +56,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp,
};
int err;
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi4_to_flowi(flp));
+
err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg);
#ifdef CONFIG_IP_ROUTE_CLASSID
if (arg.rule)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8066ccc48a17..388d3e21629b 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1057,6 +1057,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi->fib_priority = cfg->fc_priority;
fi->fib_prefsrc = cfg->fc_prefsrc;
fi->fib_type = cfg->fc_type;
+ fi->fib_tb_id = cfg->fc_table;
fi->fib_nhs = nhs;
change_nexthops(fi) {
@@ -1337,18 +1338,21 @@ nla_put_failure:
* referring to it.
* - device went down -> we must shutdown all nexthops going via it.
*/
-int fib_sync_down_addr(struct net *net, __be32 local)
+int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
int ret = 0;
unsigned int hash = fib_laddr_hashfn(local);
struct hlist_head *head = &fib_info_laddrhash[hash];
+ struct net *net = dev_net(dev);
+ int tb_id = l3mdev_fib_table(dev);
struct fib_info *fi;
if (!fib_info_laddrhash || local == 0)
return 0;
hlist_for_each_entry(fi, head, fib_lhash) {
- if (!net_eq(fi->fib_net, net))
+ if (!net_eq(fi->fib_net, net) ||
+ fi->fib_tb_id != tb_id)
continue;
if (fi->fib_prefsrc == local) {
fi->fib_flags |= RTNH_F_DEAD;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e2ffc2a5c7db..241f27bbd7ad 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1081,7 +1081,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
- unsigned int nlflags = 0;
+ u16 nlflags = NLM_F_EXCL;
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
@@ -1126,6 +1126,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (cfg->fc_nlflags & NLM_F_EXCL)
goto out;
+ nlflags &= ~NLM_F_EXCL;
+
/* We have 2 goals:
* 1. Find exact match for type, scope, fib_info to avoid
* duplicate routes
@@ -1151,6 +1153,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
struct fib_info *fi_drop;
u8 state;
+ nlflags |= NLM_F_REPLACE;
fa = fa_first;
if (fa_match) {
if (fa == fa_match)
@@ -1191,7 +1194,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
- tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
+ tb->tb_id, &cfg->fc_nlinfo, nlflags);
goto succeeded;
}
@@ -1203,7 +1206,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
goto out;
if (cfg->fc_nlflags & NLM_F_APPEND)
- nlflags = NLM_F_APPEND;
+ nlflags |= NLM_F_APPEND;
else
fa = fa_first;
}
@@ -1211,6 +1214,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
if (!(cfg->fc_nlflags & NLM_F_CREATE))
goto out;
+ nlflags |= NLM_F_CREATE;
err = -ENOBUFS;
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (!new_fa)
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index ecd1e09dbbf1..96e0efecefa6 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
- bool need_csum, ufo;
+ bool need_csum, ufo, gso_partial;
if (!skb->encapsulation)
goto out;
@@ -69,6 +69,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
goto out;
}
+ gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL);
+
outer_hlen = skb_tnl_header_len(skb);
gre_offset = outer_hlen - tnl_hlen;
skb = segs;
@@ -96,7 +98,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
greh = (struct gre_base_hdr *)skb_transport_header(skb);
pcsum = (__sum16 *)(greh + 1);
- if (skb_is_gso(skb)) {
+ if (gso_partial) {
unsigned int partial_adj;
/* Adjust checksum to account for the fact that
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index abfbe492ebfe..e4d16fc5bbb3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -99,6 +99,7 @@ static size_t inet_sk_attr_size(void)
+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+ nla_total_size(1) /* INET_DIAG_TOS */
+ nla_total_size(1) /* INET_DIAG_TCLASS */
+ + nla_total_size(4) /* INET_DIAG_MARK */
+ nla_total_size(sizeof(struct inet_diag_meminfo))
+ nla_total_size(sizeof(struct inet_diag_msg))
+ nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
@@ -109,7 +110,8 @@ static size_t inet_sk_attr_size(void)
int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
struct inet_diag_msg *r, int ext,
- struct user_namespace *user_ns)
+ struct user_namespace *user_ns,
+ bool net_admin)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -136,6 +138,9 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
}
#endif
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
+ goto errout;
+
r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
r->idiag_inode = sock_i_ino(sk);
@@ -149,7 +154,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
@@ -175,7 +181,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 0;
r->idiag_retrans = 0;
- if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
goto errout;
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
@@ -274,10 +280,11 @@ static int inet_csk_diag_fill(struct sock *sk,
const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh,
+ bool net_admin)
{
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, req,
- user_ns, portid, seq, nlmsg_flags, unlh);
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
+ portid, seq, nlmsg_flags, unlh, net_admin);
}
static int inet_twsk_diag_fill(struct sock *sk,
@@ -319,8 +326,9 @@ static int inet_twsk_diag_fill(struct sock *sk,
static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
+ struct request_sock *reqsk = inet_reqsk(sk);
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
long tmo;
@@ -334,7 +342,7 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
inet_diag_msg_common_fill(r, sk);
r->idiag_state = TCP_SYN_RECV;
r->idiag_timer = 1;
- r->idiag_retrans = inet_reqsk(sk)->num_retrans;
+ r->idiag_retrans = reqsk->num_retrans;
BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
offsetof(struct sock, sk_cookie));
@@ -346,6 +354,10 @@ static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
r->idiag_uid = 0;
r->idiag_inode = 0;
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ inet_rsk(reqsk)->ir_mark))
+ return -EMSGSIZE;
+
nlmsg_end(skb, nlh);
return 0;
}
@@ -354,7 +366,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
const struct inet_diag_req_v2 *r,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
+ const struct nlmsghdr *unlh, bool net_admin)
{
if (sk->sk_state == TCP_TIME_WAIT)
return inet_twsk_diag_fill(sk, skb, portid, seq,
@@ -362,10 +374,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
if (sk->sk_state == TCP_NEW_SYN_RECV)
return inet_req_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh);
+ nlmsg_flags, unlh, net_admin);
return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
- nlmsg_flags, unlh);
+ nlmsg_flags, unlh, net_admin);
}
struct sock *inet_diag_find_one_icsk(struct net *net,
@@ -435,7 +447,8 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
err = sk_diag_fill(sk, rep, req,
sk_user_ns(NETLINK_CB(in_skb).sk),
NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh);
+ nlh->nlmsg_seq, 0, nlh,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
nlmsg_free(rep);
@@ -796,7 +809,8 @@ static int inet_csk_diag_dump(struct sock *sk,
struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- const struct nlattr *bc)
+ const struct nlattr *bc,
+ bool net_admin)
{
if (!inet_diag_bc_sk(bc, sk))
return 0;
@@ -804,7 +818,8 @@ static int inet_csk_diag_dump(struct sock *sk,
return inet_csk_diag_fill(sk, skb, r,
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
+ cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
+ net_admin);
}
static void twsk_build_assert(void)
@@ -840,6 +855,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num;
u32 idiag_states = r->idiag_states;
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
@@ -880,7 +896,8 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
cb->args[3] > 0)
goto next_listen;
- if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
+ if (inet_csk_diag_dump(sk, skb, cb, r,
+ bc, net_admin) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
@@ -948,7 +965,7 @@ skip_listen_ht:
sk_user_ns(NETLINK_CB(cb->skb).sk),
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh);
+ cb->nlh, net_admin);
if (res < 0) {
spin_unlock_bh(lock);
goto done;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 113cc43df789..576f705d8180 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -246,25 +246,6 @@ static void gre_err(struct sk_buff *skb, u32 info)
ipgre_err(skb, info, &tpi);
}
-static __be64 key_to_tunnel_id(__be32 key)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be64)((__force u32)key);
-#else
- return (__force __be64)((__force u64)key << 32);
-#endif
-}
-
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 tunnel_id_to_key(__be64 x)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be32)x;
-#else
- return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
@@ -290,7 +271,7 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
__be64 tun_id;
flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
- tun_id = key_to_tunnel_id(tpi->key);
+ tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
return PACKET_REJECT;
@@ -446,7 +427,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
gre_build_header(skb, tunnel_hlen, flags, proto,
- tunnel_id_to_key(tun_info->key.tun_id), 0);
+ tunnel_id_to_key32(tun_info->key.tun_id), 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4b351af3e67b..d6feabb03516 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -312,6 +312,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ struct net_device *dev = skb->dev;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
@@ -341,7 +342,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (!skb_valid_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev);
+ iph->tos, dev);
if (unlikely(err)) {
if (err == -EXDEV)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
@@ -370,7 +371,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
/* RFC 1122 3.3.6:
*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 65569274efb8..05d105832bdb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -99,6 +99,14 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
iph->tot_len = htons(skb->len);
ip_send_check(iph);
+
+ /* if egress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_out(sk, skb);
+ if (unlikely(!skb))
+ return 0;
+
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
dst_output);
@@ -490,7 +498,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+#if IS_ENABLED(CONFIG_IP_VS)
to->ipvs_property = from->ipvs_property;
#endif
skb_copy_secmark(to, from);
@@ -1574,8 +1582,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
}
oif = arg->bound_dev_if;
- if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
- oif = skb->skb_iif;
+ oif = oif ? : skb->skb_iif;
flowi4_init_output(&fl4, oif,
IP4_REPLY_MARK(net, skb->mark),
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 71a52f4d4cff..af4919792b6a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -284,9 +284,12 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
ipc->ttl = val;
break;
case IP_TOS:
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)))
+ val = *(int *)CMSG_DATA(cmsg);
+ else if (cmsg->cmsg_len == CMSG_LEN(sizeof(u8)))
+ val = *(u8 *)CMSG_DATA(cmsg);
+ else
return -EINVAL;
- val = *(int *)CMSG_DATA(cmsg);
if (val < 0 || val > 255)
return -EINVAL;
ipc->tos = val;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 95649ebd2874..5719d6ba0824 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -55,6 +55,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
+#include <net/dst_metadata.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -546,6 +547,81 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
return 0;
}
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ u32 headroom = sizeof(struct iphdr);
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ const struct iphdr *inner_iph;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ __be16 df = 0;
+ u8 tos, ttl;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto tx_error;
+ key = &tun_info->key;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
+ tos = key->tos;
+ if (tos == 1) {
+ if (skb->protocol == htons(ETH_P_IP))
+ tos = inner_iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
+ }
+ init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
+ RT_TOS(tos), tunnel->parms.link);
+ if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
+ goto tx_error;
+ rt = ip_route_output_key(tunnel->net, &fl4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+ if (rt->dst.dev == dev) {
+ ip_rt_put(rt);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+ tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
+ ttl = key->ttl;
+ if (ttl == 0) {
+ if (skb->protocol == htons(ETH_P_IP))
+ ttl = inner_iph->ttl;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
+ else
+ ttl = ip4_dst_hoplimit(&rt->dst);
+ }
+ if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+ df = htons(IP_DF);
+ else if (skb->protocol == htons(ETH_P_IP))
+ df = inner_iph->frag_off & htons(IP_DF);
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ if (headroom > dev->needed_headroom)
+ dev->needed_headroom = headroom;
+
+ if (skb_cow_head(skb, dev->needed_headroom)) {
+ ip_rt_put(rt);
+ goto tx_dropped;
+ }
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
+ key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
+ return;
+tx_error:
+ dev->stats.tx_errors++;
+ goto kfree;
+tx_dropped:
+ dev->stats.tx_dropped++;
+kfree:
+ kfree_skb(skb);
+}
+EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
+
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 0f227db0e9ac..777bc1883870 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -69,7 +69,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
s