From 05e22e8395058745bd0312bc488b522197852aff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Jun 2020 12:12:34 -0700 Subject: tcp: remove indirect calls for icsk->icsk_af_ops->queue_xmit Mitigate RETPOLINE costs in __tcp_transmit_skb() by using INDIRECT_CALL_INET() wrapper. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 6 ++++++ net/ipv4/tcp_output.c | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 090d3097ee15..d946356187ed 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -539,6 +539,12 @@ no_route: } EXPORT_SYMBOL(__ip_queue_xmit); +int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl) +{ + return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos); +} +EXPORT_SYMBOL(ip_queue_xmit); + static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) { to->pkt_type = from->pkt_type; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a50e1990a845..be1bd37185d8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1064,6 +1064,9 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } +INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); + /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. @@ -1235,7 +1238,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tcp_add_tx_delay(skb, tp); - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); + err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit, + inet6_csk_xmit, ip_queue_xmit, + sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); -- cgit v1.2.3 From dd2e0b86fc4ee146ac8f3275833d0187efeb950a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Jun 2020 12:12:35 -0700 Subject: tcp: remove indirect calls for icsk->icsk_af_ops->send_check Mitigate RETPOLINE costs in __tcp_transmit_skb() by using INDIRECT_CALL_INET() wrapper. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index be1bd37185d8..04b70fe31fa2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,6 +1066,7 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); +INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). 
This is used by both the initial @@ -1210,7 +1211,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, } #endif - icsk->icsk_af_ops->send_check(sk, skb); + INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check, + tcp_v6_send_check, tcp_v4_send_check, + sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); -- cgit v1.2.3 From 5521d95e076238f1615cf1cdb135f318ef798b49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Jun 2020 15:31:14 -0700 Subject: net: move tcp gro declarations to net/tcp.h This patch removes following (C=1 W=1) warnings for CONFIG_RETPOLINE=y : net/ipv4/tcp_offload.c:306:16: warning: symbol 'tcp4_gro_receive' was not declared. Should it be static? net/ipv4/tcp_offload.c:306:17: warning: no previous prototype for 'tcp4_gro_receive' [-Wmissing-prototypes] net/ipv4/tcp_offload.c:319:29: warning: symbol 'tcp4_gro_complete' was not declared. Should it be static? net/ipv4/tcp_offload.c:319:29: warning: no previous prototype for 'tcp4_gro_complete' [-Wmissing-prototypes] CHECK net/ipv6/tcpv6_offload.c net/ipv6/tcpv6_offload.c:16:16: warning: symbol 'tcp6_gro_receive' was not declared. Should it be static? net/ipv6/tcpv6_offload.c:29:29: warning: symbol 'tcp6_gro_complete' was not declared. Should it be static? CC net/ipv6/tcpv6_offload.o net/ipv6/tcpv6_offload.c:16:17: warning: no previous prototype for 'tcp6_gro_receive' [-Wmissing-prototypes] 16 | struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) | ^~~~~~~~~~~~~~~~ net/ipv6/tcpv6_offload.c:29:29: warning: no previous prototype for 'tcp6_gro_complete' [-Wmissing-prototypes] 29 | INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) | ^~~~~~~~~~~~~~~~~ Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/ipv4/af_inet.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 02aa5cb3a4fd..d8dbff1dd1fa 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1432,8 +1432,6 @@ static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, return inet_gso_segment(skb, features); } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *, - struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, struct sk_buff *)); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) @@ -1608,7 +1606,6 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) return -EINVAL; } -INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); int inet_gro_complete(struct sk_buff *skb, int nhoff) { -- cgit v1.2.3 From 6db693285cd109e566c553694002dea769612b24 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Jun 2020 15:31:15 -0700 Subject: udp: move gro declarations to net/udp.h This removes following warnings : CC net/ipv4/udp_offload.o net/ipv4/udp_offload.c:504:17: warning: no previous prototype for 'udp4_gro_receive' [-Wmissing-prototypes] 504 | struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) | ^~~~~~~~~~~~~~~~ net/ipv4/udp_offload.c:584:29: warning: no previous prototype for 'udp4_gro_complete' [-Wmissing-prototypes] 584 | INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) | ^~~~~~~~~~~~~~~~~ CHECK net/ipv6/udp_offload.c net/ipv6/udp_offload.c:115:16: warning: symbol 'udp6_gro_receive' was not declared. Should it be static? net/ipv6/udp_offload.c:148:29: warning: symbol 'udp6_gro_complete' was not declared. Should it be static? CC net/ipv6/udp_offload.o net/ipv6/udp_offload.c:115:17: warning: no previous prototype for 'udp6_gro_receive' [-Wmissing-prototypes] 115 | struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) | ^~~~~~~~~~~~~~~~ net/ipv6/udp_offload.c:148:29: warning: no previous prototype for 'udp6_gro_complete' [-Wmissing-prototypes] 148 | INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) | ^~~~~~~~~~~~~~~~~ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d8dbff1dd1fa..ea6ed6d487ed 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1432,8 +1432,6 @@ static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, return inet_gso_segment(skb, features); } -INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, - struct sk_buff *)); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; @@ -1606,7 +1604,6 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) return -EINVAL; } -INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); int inet_gro_complete(struct sk_buff *skb, int nhoff) { __be16 newlen = htons(skb->len - nhoff); -- cgit v1.2.3 From aad4a0a9513af962137c4842463d11ed491eec37 Mon Sep 17 00:00:00 2001 From: Dmitry Yakunin Date: Sat, 20 Jun 2020 18:30:51 +0300 Subject: tcp: Expose tcp_sock_set_keepidle_locked This is preparation for usage in bpf_setsockopt. 
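(For context, the bpf_setsockopt() usage this prepares for would look roughly like the sketch below. This is an illustration only, not part of this patch: the constants are redefined locally as is common for BPF programs, and TCP_KEEPIDLE support in bpf_setsockopt() is what the follow-up patches wire up.)

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SOL_TCP      6	/* values from <netinet/{in,tcp}.h>, redefined for the BPF build */
#define TCP_KEEPIDLE 4

SEC("sockops")
int set_keepidle(struct bpf_sock_ops *skops)
{
	int val = 60;	/* seconds of idle time before keepalive probes */

	/* Act once per connection, when it becomes established. */
	if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
		bpf_setsockopt(skops, SOL_TCP, TCP_KEEPIDLE, &val, sizeof(val));
	return 1;
}

char _license[] SEC("license") = "GPL";
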
v2: - remove redundant EXPORT_SYMBOL (Alexei Starovoitov) Signed-off-by: Dmitry Yakunin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200620153052.9439-2-zeil@yandex-team.ru --- net/ipv4/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 810cc164f795..de36c91d32ea 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2957,7 +2957,7 @@ void tcp_sock_set_user_timeout(struct sock *sk, u32 val) } EXPORT_SYMBOL(tcp_sock_set_user_timeout); -static int __tcp_sock_set_keepidle(struct sock *sk, int val) +int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); @@ -2984,7 +2984,7 @@ int tcp_sock_set_keepidle(struct sock *sk, int val) int err; lock_sock(sk); - err = __tcp_sock_set_keepidle(sk, val); + err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } @@ -3183,7 +3183,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_KEEPIDLE: - err = __tcp_sock_set_keepidle(sk, val); + err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL)
-- cgit v1.2.3
From b08d4d3b6c0460306e8a0608413b201705200d33 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:04 -0700 Subject: net: bpf: Add bpf_seq_afinfo in tcp_iter_state A new field bpf_seq_afinfo is added to tcp_iter_state to provide bpf tcp iterator afinfo. There are two reasons why we did this. First, the current way to get afinfo from PDE_DATA does not work for the bpf iterator as its seq_file inode does not conform to /proc/net/{tcp,tcp6} inode structures. More specifically, an anonymous bpf iterator will use an anonymous inode which is shared in the system and we cannot change the inode private data structure at all. Second, the bpf iterator for tcp/tcp6 wants to traverse all tcp and tcp6 sockets in one pass, and the bpf program can control whether it wants to skip one sk_family or not. Having a different afinfo with family AF_UNSPEC makes this easier to understand in the code. This patch does not change /proc/net/{tcp,tcp6} behavior as the bpf_seq_afinfo will be NULL for these two proc files. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230804.3987829-1-yhs@fb.com --- net/ipv4/tcp_ipv4.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ad6435ba6d72..9cb65ee4ec63 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2211,13 +2211,18 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk = cur; + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + if (!sk) { get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; @@ -2235,7 +2240,8 @@ get_sk: sk_nulls_for_each_from(sk, node) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == afinfo->family) + if (afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) return sk; } spin_unlock(&ilb->lock); @@ -2272,11 +2278,16 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); void *rc = NULL; + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; @@ -2289,7 +2300,8 @@ static void *established_get_first(struct seq_file *seq) spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if (sk->sk_family != afinfo->family || + if ((afinfo->family != AF_UNSPEC && + sk->sk_family != afinfo->family) || !net_eq(sock_net(sk), net)) { continue; } @@ -2304,19 +2316,25 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct tcp_seq_afinfo *afinfo; struct sock *sk = cur; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); + if (st->bpf_seq_afinfo) + afinfo = st->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + ++st->num; ++st->offset; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { - if (sk->sk_family == afinfo->family && + if ((afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) && net_eq(sock_net(sk), net)) return sk; } -- cgit v1.2.3 From 52d87d5f6418ba1b8b449ed5eea1532664896851 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:05 -0700 Subject: net: bpf: Implement bpf iterator for tcp The bpf iterator for tcp is implemented. Both tcp4 and tcp6 sockets will be traversed. It is up to bpf program to filter for tcp4 or tcp6 only, or both families of sockets. 
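(Illustration, modeled on the bpf_iter selftests rather than taken from this patch: a program attached to the new "tcp" target that keeps only tcp4 sockets could look like the following; the AF_INET define and header set are assumptions of this sketch.)

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define AF_INET 2	/* not emitted into vmlinux.h */

SEC("iter/tcp")
int dump_tcp4(struct bpf_iter__tcp *ctx)
{
	struct sock_common *sk_common = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;

	if (!sk_common)
		return 0;
	/* Both families are walked; keep only tcp4 sockets here. */
	if (sk_common->skc_family != AF_INET)
		return 0;
	BPF_SEQ_PRINTF(seq, "family %d state %d uid %u\n",
		       sk_common->skc_family, sk_common->skc_state, ctx->uid);
	return 0;
}

char _license[] SEC("license") = "GPL";
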
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230805.3987959-1-yhs@fb.com --- net/ipv4/tcp_ipv4.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9cb65ee4ec63..ea0df9fd7618 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2613,6 +2613,74 @@ out: return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__tcp { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct sock_common *, sk_common); + uid_t uid __aligned(8); +}; + +static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct sock_common *sk_common, uid_t uid) +{ + struct bpf_iter__tcp ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.sk_common = sk_common; + ctx.uid = uid; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + if (sk->sk_state == TCP_TIME_WAIT) { + uid = 0; + } else if (sk->sk_state == TCP_NEW_SYN_RECV) { + const struct request_sock *req = v; + + uid = from_kuid_munged(seq_user_ns(seq), + sock_i_uid(req->rsk_listener)); + } else { + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + } + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return tcp_prog_seq_show(prog, &meta, v, uid); +} + +static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)tcp_prog_seq_show(prog, &meta, v, 0); + } + + tcp_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_tcp_seq_ops = { + .show = bpf_iter_tcp_seq_show, + .start = tcp_seq_start, + .next = tcp_seq_next, + .stop = bpf_iter_tcp_seq_stop, +}; +#endif + static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, .start = tcp_seq_start, @@ -2844,8 +2912,63 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { .exit_batch = tcp_sk_exit_batch, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, + struct sock_common *sk_common, uid_t uid) + +static int bpf_iter_init_tcp(void *priv_data) +{ + struct tcp_iter_state *st = priv_data; + struct tcp_seq_afinfo *afinfo; + int ret; + + afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); + if (!afinfo) + return -ENOMEM; + + afinfo->family = AF_UNSPEC; + st->bpf_seq_afinfo = afinfo; + ret = bpf_iter_init_seq_net(priv_data); + if (ret) + kfree(afinfo); + return ret; +} + +static void bpf_iter_fini_tcp(void *priv_data) +{ + struct tcp_iter_state *st = priv_data; + + kfree(st->bpf_seq_afinfo); + bpf_iter_fini_seq_net(priv_data); +} + +static const struct bpf_iter_reg tcp_reg_info = { + .target = "tcp", + .seq_ops = &bpf_iter_tcp_seq_ops, + .init_seq_private = bpf_iter_init_tcp, + .fini_seq_private = bpf_iter_fini_tcp, + .seq_priv_size = sizeof(struct tcp_iter_state), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__tcp, sk_common), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static void __init bpf_iter_register(void) +{ + if (bpf_iter_reg_target(&tcp_reg_info)) + pr_warn("Warning: could not register bpf iterator tcp\n"); +} + +#endif + void __init 
tcp_v4_init(void) { if (register_pernet_subsys(&tcp_sk_ops)) panic("Failed to create the TCP control socket.\n"); + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif }
-- cgit v1.2.3
From 9e8ca27afab6c92477b459f6a5d2af0cd3197c20 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:12 -0700 Subject: net: bpf: Add bpf_seq_afinfo in udp_iter_state Similar to tcp_iter_state, a new field bpf_seq_afinfo is added to udp_iter_state to provide bpf udp iterator afinfo. This does not change /proc/net/{udp, udp6} behavior. But it enables the bpf iterator to avoid getting afinfo from PDE_DATA and iterate through all udp and udp6 sockets in one pass. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230812.3988347-1-yhs@fb.com --- net/ipv4/udp.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1b7ebbcae497..90355301b266 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2826,10 +2826,15 @@ EXPORT_SYMBOL(udp_prot); static struct sock *udp_get_first(struct seq_file *seq, int start) { struct sock *sk; - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + for (state->bucket = start; state->bucket <= afinfo->udp_table->mask; ++state->bucket) { struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket]; @@ -2841,7 +2846,8 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) sk_for_each(sk, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == afinfo->family) + if (afinfo->family == AF_UNSPEC || + sk->sk_family == afinfo->family) goto found; } spin_unlock_bh(&hslot->lock); @@ -2853,13 +2859,20 @@ found: static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + do { sk = sk_next(sk); - } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != afinfo->family)); + } while (sk && (!net_eq(sock_net(sk), net) || + (afinfo->family != AF_UNSPEC && + sk->sk_family != afinfo->family))); if (!sk) { if (state->bucket <= afinfo->udp_table->mask) @@ -2904,9 +2917,14 @@ EXPORT_SYMBOL(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { - struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file)); + struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; + if (state->bpf_seq_afinfo) + afinfo = state->bpf_seq_afinfo; + else + afinfo = PDE_DATA(file_inode(seq->file)); + if (state->bucket <= afinfo->udp_table->mask) spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock); }
-- cgit v1.2.3
From 5788b3a07fc5863606c3b92fa7b1ffe125e6eb4c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jun 2020 16:08:13 -0700 Subject: net: bpf: Implement bpf iterator for udp The bpf iterator for udp is implemented. Both udp4 and udp6 sockets will be traversed. It is up to bpf program to filter for udp4 or udp6 only, or both families of sockets. 
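(For completeness, a hedged userspace sketch of consuming either the tcp or udp iterator with libbpf; error handling is abbreviated and the helper name dump_iter() is hypothetical.)

#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Attach an already-loaded iter/tcp or iter/udp program and stream
 * the seq_file output it produces to stdout. */
static int dump_iter(struct bpf_program *prog)
{
	struct bpf_link *link;
	char buf[4096];
	ssize_t len;
	int iter_fd;

	link = bpf_program__attach_iter(prog, NULL);
	if (libbpf_get_error(link))
		return -1;

	iter_fd = bpf_iter_create(bpf_link__fd(link));	/* anonymous iterator fd */
	if (iter_fd >= 0) {
		while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, len, stdout);
		close(iter_fd);
	}

	bpf_link__destroy(link);
	return 0;
}
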
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200623230813.3988404-1-yhs@fb.com --- net/ipv4/udp.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 90355301b266..31530129f137 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2968,6 +2968,67 @@ int udp4_seq_show(struct seq_file *seq, void *v) return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__udp { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct udp_sock *, udp_sk); + uid_t uid __aligned(8); + int bucket __aligned(8); +}; + +static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct udp_sock *udp_sk, uid_t uid, int bucket) +{ + struct bpf_iter__udp ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.udp_sk = udp_sk; + ctx.uid = uid; + ctx.bucket = bucket; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) +{ + struct udp_iter_state *state = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return udp_prog_seq_show(prog, &meta, v, uid, state->bucket); +} + +static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)udp_prog_seq_show(prog, &meta, v, 0, 0); + } + + udp_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_udp_seq_ops = { + .start = udp_seq_start, + .next = udp_seq_next, + .stop = bpf_iter_udp_seq_stop, + .show = bpf_iter_udp_seq_show, +}; +#endif + const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, @@ -3085,6 +3146,57 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_sysctl_init, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, + struct udp_sock *udp_sk, uid_t uid, int bucket) + +static int bpf_iter_init_udp(void *priv_data) +{ + struct udp_iter_state *st = priv_data; + struct udp_seq_afinfo *afinfo; + int ret; + + afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); + if (!afinfo) + return -ENOMEM; + + afinfo->family = AF_UNSPEC; + afinfo->udp_table = &udp_table; + st->bpf_seq_afinfo = afinfo; + ret = bpf_iter_init_seq_net(priv_data); + if (ret) + kfree(afinfo); + return ret; +} + +static void bpf_iter_fini_udp(void *priv_data) +{ + struct udp_iter_state *st = priv_data; + + kfree(st->bpf_seq_afinfo); + bpf_iter_fini_seq_net(priv_data); +} + +static const struct bpf_iter_reg udp_reg_info = { + .target = "udp", + .seq_ops = &bpf_iter_udp_seq_ops, + .init_seq_private = bpf_iter_init_udp, + .fini_seq_private = bpf_iter_fini_udp, + .seq_priv_size = sizeof(struct udp_iter_state), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__udp, udp_sk), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static void __init bpf_iter_register(void) +{ + if (bpf_iter_reg_target(&udp_reg_info)) + pr_warn("Warning: could not register bpf iterator udp\n"); +} +#endif + void __init udp_init(void) { unsigned long limit; @@ -3110,4 +3222,8 @@ void __init udp_init(void) 
if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); + +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif }
-- cgit v1.2.3
From fdb7eb21ddd3cc07b8120b6f5cc0b279a0ed198e Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Fri, 26 Jun 2020 21:05:32 -0700 Subject: tcp: stamp SCM_TSTAMP_ACK later in tcp_clean_rtx_queue() Currently tp->delivered is updated with sacked packets but not cumulatively acked packets when SCM_TSTAMP_ACK is timestamped. This patch moves a tcp_ack_tstamp() call in tcp_clean_rtx_queue() to later in the loop so that when a skb is fully acked OPT_STATS of SCM_TSTAMP_ACK will include the current skb in the delivered count. When not fully acked, tcp_ack_tstamp() is a no-op and there is no change in behavior. Signed-off-by: Yousuk Seung Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f3a0eb139b76..2a683e785cca 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3078,8 +3078,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u8 sacked = scb->sacked; u32 acked_pcount; - tcp_ack_tstamp(sk, skb, prior_snd_una); - /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { if (tcp_skb_pcount(skb) == 1 || @@ -3143,6 +3141,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!fully_acked) break; + tcp_ack_tstamp(sk, skb, prior_snd_una); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL;
-- cgit v1.2.3
From c634e34f6ebfb75259e6ce467523fd3adf30d3d2 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Fri, 26 Jun 2020 21:05:33 -0700 Subject: tcp: add ece_ack flag to reno sack functions Pass a boolean flag that tells the ECE state of the current ack to the reno sack functions. This is a pure refactor for future patches to improve tracking of delivered counts. Signed-off-by: Yousuk Seung Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2a683e785cca..09bed29e3ef4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1893,7 +1893,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct sock *sk, int num_dupack) +static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) { if (num_dupack) { struct tcp_sock *tp = tcp_sk(sk); @@ -1911,7 +1911,7 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack) /* Account for ACK, ACKing some data in Reno Recovery phase. */ -static void tcp_remove_reno_sacks(struct sock *sk, int acked) +static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) { struct tcp_sock *tp = tcp_sk(sk); @@ -2697,7 +2697,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, * delivered. Lower inflight to clock out (re)tranmissions. 
*/ if (after(tp->snd_nxt, tp->high_seq) && num_dupack) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE); else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } @@ -2779,6 +2779,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int fast_rexmit = 0, flag = *ack_flag; + bool ece_ack = flag & FLAG_ECE; bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && tcp_force_fast_retransmit(sk)); @@ -2787,7 +2788,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, /* Now state machine starts. * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ - if (flag & FLAG_ECE) + if (ece_ack) tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ @@ -2828,7 +2829,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { if (tcp_is_reno(tp)) - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } else { if (tcp_try_undo_partial(sk, prior_snd_una)) return; @@ -2853,7 +2854,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); - tcp_add_reno_sack(sk, num_dupack); + tcp_add_reno_sack(sk, num_dupack, ece_ack); } if (icsk->icsk_ca_state <= TCP_CA_Disorder) @@ -2877,7 +2878,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, } /* Otherwise enter Recovery state */ - tcp_enter_recovery(sk, (flag & FLAG_ECE)); + tcp_enter_recovery(sk, ece_ack); fast_rexmit = 1; } @@ -3053,7 +3054,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, */ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, u32 prior_snd_una, - struct tcp_sacktag_state *sack) + struct tcp_sacktag_state *sack, bool ece_ack) { const struct inet_connection_sock *icsk = inet_csk(sk); u64 first_ackt, last_ackt; @@ -3191,7 +3192,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, } if (tcp_is_reno(tp)) { - tcp_remove_reno_sacks(sk, pkts_acked); + tcp_remove_reno_sacks(sk, pkts_acked, ece_ack); /* If any of the cumulatively ACKed segments was * retransmitted, non-SACK case cannot confirm that @@ -3685,7 +3686,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); + flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state, + flag & FLAG_ECE); tcp_rack_update_reo_wnd(sk, &rs); -- cgit v1.2.3 From f00394ce6054e54319aefa51e4c495f9ebeb8669 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Fri, 26 Jun 2020 21:05:34 -0700 Subject: tcp: count sacked packets in tcp_sacktag_state Add sack_delivered to tcp_sacktag_state and count the number of sacked and dsacked packets. This is pure refactor for future patches to improve tracking delivered counts. Signed-off-by: Yousuk Seung Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp_input.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 09bed29e3ef4..db61ea597e39 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1138,6 +1138,7 @@ struct tcp_sacktag_state { struct rate_sample *rate; int flag; unsigned int mss_now; + u32 sack_delivered; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1259,6 +1260,7 @@ static u8 tcp_sacktag_one(struct sock *sk, state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; tp->delivered += pcount; /* Out-of-order packets delivered */ + state->sack_delivered += pcount; /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (tp->lost_skb_hint && @@ -1685,6 +1687,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, if (found_dup_sack) { state->flag |= FLAG_DSACKING_ACK; tp->delivered++; /* A spurious retransmission is delivered */ + state->sack_delivered++; } /* Eliminate too old ACKs, but take into @@ -3586,6 +3589,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; + sack_state.sack_delivered = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node);
-- cgit v1.2.3
From 082d4fa980b07b1cc602305e9cc0815d19663ed3 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Fri, 26 Jun 2020 21:05:35 -0700 Subject: tcp: update delivered_ce with delivered Currently tp->delivered is updated in various places in tcp_ack() but tp->delivered_ce is updated once at the end. As a result, two counts in OPT_STATS of SCM_TSTAMP_ACK timestamps generated in tcp_ack() may not be in sync. This patch updates both counts at the same time in tcp_ack(). Signed-off-by: Yousuk Seung Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index db61ea597e39..8479b84f0a7f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -962,6 +962,15 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) } } +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tp->delivered_ce += delivered; +} + /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -1259,7 +1268,7 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; - tp->delivered += pcount; /* Out-of-order packets delivered */ + /* Out-of-order packets delivered */ state->sack_delivered += pcount; /* Lost marker hint past SACKed? 
Tweak RFC3517 cnt */ @@ -1686,7 +1695,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, num_sacks, prior_snd_una); if (found_dup_sack) { state->flag |= FLAG_DSACKING_ACK; - tp->delivered++; /* A spurious retransmission is delivered */ + /* A spurious retransmission is delivered */ state->sack_delivered++; } @@ -1907,7 +1916,7 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack) tcp_check_reno_reordering(sk, 0); delivered = tp->sacked_out - prior_sacked; if (delivered > 0) - tp->delivered += delivered; + tcp_count_delivered(tp, delivered, ece_ack); tcp_verify_left_out(tp); } } @@ -1920,7 +1929,8 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack) if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ - tp->delivered += max_t(int, acked - tp->sacked_out, 1); + tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1), + ece_ack); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -3116,7 +3126,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; } else if (tcp_is_sack(tp)) { - tp->delivered += acked_pcount; + tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, tcp_skb_timestamp_us(skb)); @@ -3562,10 +3572,9 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) { - tp->delivered_ce += delivered; + if (flag & FLAG_ECE) NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); - } + return delivered; } @@ -3665,6 +3674,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) ack_ev_flags |= CA_ACK_ECE; } + if (sack_state.sack_delivered) + tcp_count_delivered(tp, sack_state.sack_delivered, + flag & FLAG_ECE); + if (flag & FLAG_WIN_UPDATE) ack_ev_flags |= CA_ACK_WIN_UPDATE;
-- cgit v1.2.3
From f53b9b0bdc59c0823679f2e3214e0d538f5951b9 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Sun, 31 May 2020 22:26:23 +0200 Subject: netfilter: introduce support for reject at prerouting stage The REJECT statement can only be used in the INPUT, FORWARD and OUTPUT chains. This patch adds support for REJECT, both icmp and tcp reset, at the PREROUTING stage. The need for this patch comes from the requirement of some forwarding devices to reject traffic before the natting and routing decisions. The main use case is to be able to send a graceful termination to legitimate clients when, for whatever reason, the NATed endpoints are not available. This option allows clients to decide either to perform a reconnection or to manage the error on their side, instead of just dropping the connection and letting it die due to a timeout. The ipv4, ipv6 and inet families are supported in the nft infrastructure. 
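(A usage illustration, not part of this patch: with nft, rejecting connections to a drained backend before the dstnat hook at priority -100 could be expressed as follows; the table and chain names are arbitrary.)

nft add table inet filter
nft 'add chain inet filter early { type filter hook prerouting priority -150; }'
nft add rule inet filter early tcp dport 8080 reject with tcp reset
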
Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_reject_ipv4.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 2361fdac2c43..9dcfa4e461b6 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -96,6 +96,21 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, } EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); +static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) +{ + struct dst_entry *dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(struct flowi)); + fl.u.ip4.daddr = ip_hdr(skb_in)->saddr; + nf_ip_route(dev_net(skb_in->dev), &dst, &fl, false); + if (!dst) + return -1; + + skb_dst_set(skb_in, dst); + return 0; +} + /* Send RST reply */ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { @@ -109,6 +124,9 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) if (!oth) return; + if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(oldskb)) + return; + if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) return; @@ -175,6 +193,9 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; + if (hook == NF_INET_PRE_ROUTING && nf_reject_fill_skb_dst(skb_in)) + return; + if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) { icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); return; -- cgit v1.2.3 From ff91e9292fc5aafd9ee1dc44c03cff69a3b0f39f Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Tue, 30 Jun 2020 09:49:33 -0700 Subject: tcp: call tcp_ack_tstamp() when not fully acked When skb is coalesced tcp_ack_tstamp() still needs to be called when not fully acked in tcp_clean_rtx_queue(), otherwise SCM_TSTAMP_ACK timestamps may never be fired. Since the original patch series had dependent commits, this patch fixes the issue instead of reverting by restoring calls to tcp_ack_tstamp() when skb is not fully acked. Fixes: fdb7eb21ddd3 ("tcp: stamp SCM_TSTAMP_ACK later in tcp_clean_rtx_queue()") Signed-off-by: Yousuk Seung Signed-off-by: Willem de Bruijn Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8479b84f0a7f..12c26c9565b7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3172,8 +3172,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) tp->snd_up = tp->snd_una; - if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - flag |= FLAG_SACK_RENEGING; + if (skb) { + tcp_ack_tstamp(sk, skb, prior_snd_una); + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + flag |= FLAG_SACK_RENEGING; + } if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); -- cgit v1.2.3 From 7a6498ebcdc0fde4613a20ae405481d49166e8bb Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Mon, 6 Jul 2020 19:38:50 +0200 Subject: Replace HTTP links with HTTPS ones: IPv* Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. 
Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 8 ++++---- net/ipv4/cipso_ipv4.c | 4 ++-- net/ipv4/fib_trie.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- net/ipv4/tcp_highspeed.c | 2 +- net/ipv4/tcp_htcp.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_veno.c | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e64e59b536d3..60db5a6487cc 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -10,7 +10,7 @@ config IP_MULTICAST intend to participate in the MBONE, a high bandwidth network on top of the Internet which carries audio and video broadcasts. More information about the MBONE is on the WWW at - . For most people, it's safe to say N. + . For most people, it's safe to say N. config IP_ADVANCED_ROUTER bool "IP: advanced router" @@ -73,7 +73,7 @@ config IP_MULTIPLE_TABLES If you need more information, see the Linux Advanced Routing and Traffic Control documentation at - + If unsure, say N. @@ -280,7 +280,7 @@ config SYN_COOKIES continue to connect, even when your machine is under attack. There is no need for the legitimate users to change their TCP/IP software; SYN cookies work transparently to them. For technical information - about SYN cookies, check out . + about SYN cookies, check out . If you are SYN flooded, the source address reported by the kernel is likely to have been forged by the attacker; it is only reported as @@ -525,7 +525,7 @@ config TCP_CONG_HSTCP A modification to TCP's congestion control mechanism for use with large congestion windows. A table indicates how much to increase the congestion window by when an ACK is received. - For more detail see http://www.icir.org/floyd/hstcp.html + For more detail see https://www.icir.org/floyd/hstcp.html config TCP_CONG_HYBLA tristate "TCP-Hybla congestion control algorithm" diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index a23094b050f8..0f1b9065c0a6 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -10,9 +10,9 @@ * * The CIPSO draft specification can be found in the kernel's Documentation * directory as well as the following URL: - * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt + * https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt * The FIPS-188 specification can be found at the following URL: - * http://www.itl.nist.gov/fipspubs/fip188.htm + * https://www.itl.nist.gov/fipspubs/fip188.htm * * Author: Paul Moore */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 248f1c1959a6..dcb0802a47d5 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -13,7 +13,7 @@ * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - * http://www.csc.kth.se/~snilsson/software/dyntrie2/ + * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. 
Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index f8755a4ae9d4..a8b980ad11d4 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -3,7 +3,7 @@ * (C) 2003-2004 by Harald Welte * based on ideas of Fabio Olive Leite * - * Development of this code funded by SuSE Linux AG, http://www.suse.com/ + * Development of this code funded by SuSE Linux AG, https://www.suse.com/ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index bfdfbb972c57..349069d6cd0a 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -2,7 +2,7 @@ /* * Sally Floyd's High Speed TCP (RFC 3649) congestion control * - * See http://www.icir.org/floyd/hstcp.html + * See https://www.icir.org/floyd/hstcp.html * * John Heffner */ diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 88e1f011afe0..55adcfcf96fe 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -4,7 +4,7 @@ * R.N.Shorten, D.J.Leith: * "H-TCP: TCP for high-speed and long-distance networks" * Proc. PFLDnet, Argonne, 2004. - * http://www.hamilton.ie/net/htcp3.pdf + * https://www.hamilton.ie/net/htcp3.pdf */ #include diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 12c26c9565b7..dc77309ea15b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -518,7 +518,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. - * + * * * More detail on this code can be found at * , diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 50a9a6e2c4cd..cd50a61c9976 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -7,7 +7,7 @@ * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." * IEEE Journal on Selected Areas in Communication, * Feb. 2003. - * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf + * See https://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf */ #include -- cgit v1.2.3 From f5836749c9c04a10decd2742845ad4870965fdef Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 6 Jul 2020 16:01:25 -0700 Subject: bpf: Add BPF_CGROUP_INET_SOCK_RELEASE hook Sometimes it's handy to know when the socket gets freed. In particular, we'd like to try to use a smarter allocation of ports for bpf_bind and explore the possibility of limiting the number of SOCK_DGRAM sockets the process can have. Implement BPF_CGROUP_INET_SOCK_RELEASE hook that triggers on inet socket release. It triggers only for userspace sockets (not in-kernel ones) and therefore has the same semantics as the existing BPF_CGROUP_INET_SOCK_CREATE. 
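(Illustrative sketch only: a minimal program for the new hook could look like this. The "cgroup/sock_release" section name follows the libbpf convention added alongside this series and is an assumption here; the return value does not veto anything on release, so 1 is returned by convention.)

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

static __u64 releases;	/* global counter, readable from userspace via the skeleton */

SEC("cgroup/sock_release")
int count_sock_release(struct bpf_sock *ctx)
{
	__sync_fetch_and_add(&releases, 1);
	return 1;
}

char _license[] SEC("license") = "GPL";
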
Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200706230128.4073544-2-sdf@google.com --- net/ipv4/af_inet.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index ea6ed6d487ed..ff141d630bdf 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -411,6 +411,9 @@ int inet_release(struct socket *sock) if (sk) { long timeout; + if (!sk->sk_kern_sock) + BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); + /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); -- cgit v1.2.3 From 6df2db5d37ba3df8c80d90c15f1e20480be43f75 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 6 Jul 2020 20:01:30 +0800 Subject: tunnel4: add cb_handler to struct xfrm_tunnel This patch is to register a callback function tunnel4_rcv_cb with is_ipip set in a xfrm_input_afinfo object for tunnel4 and tunnel64. It will be called by xfrm_rcv_cb() from xfrm_input() when family is AF_INET and proto is IPPROTO_IPIP or IPPROTO_IPV6. v1->v2: - Fix a sparse warning caused by the missing "__rcu", as Jakub noticed. - Handle the err returned by xfrm_input_register_afinfo() in tunnel4_init/fini(), as Sabrina noticed. v2->v3: - Add "#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)" to fix the build error when xfrm is disabled, reported by kbuild test robot. Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv4/tunnel4.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index c4b2ccbeba04..e44aaf41a138 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -110,6 +110,33 @@ drop: return 0; } +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) +{ + struct xfrm_tunnel __rcu *head; + struct xfrm_tunnel *handler; + int ret; + + head = (proto == IPPROTO_IPIP) ? tunnel4_handlers : tunnel64_handlers; + + for_each_tunnel_rcu(head, handler) { + if (handler->cb_handler) { + ret = handler->cb_handler(skb, err); + if (ret <= 0) + return ret; + } + } + + return 0; +} + +static const struct xfrm_input_afinfo tunnel4_input_afinfo = { + .family = AF_INET, + .is_ipip = true, + .callback = tunnel4_rcv_cb, +}; +#endif + #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { @@ -230,6 +257,18 @@ static int __init tunnel4_init(void) #endif goto err; } +#endif +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { + inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); +#if IS_ENABLED(CONFIG_IPV6) + inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); +#endif +#if IS_ENABLED(CONFIG_MPLS) + inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); +#endif + goto err; + } #endif return 0; @@ -240,6 +279,10 @@ err: static void __exit tunnel4_fini(void) { +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) + pr_err("tunnel4 close: can't remove input afinfo\n"); +#endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); -- cgit v1.2.3 From 87e66b9682d7067eb7db08040dae36b608a4d971 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 6 Jul 2020 20:01:32 +0800 Subject: ip_vti: support IPIP tunnel processing with .cb_handler With tunnel4_input_afinfo added, IPIP tunnel processing in ip_vti can be easily done with .cb_handler. 
So replace the processing by calling ip_tunnel_rcv() with it. v1->v2: - no change. v2->v3: - enable it only when CONFIG_INET_XFRM_TUNNEL is defined, to fix the build error, reported by kbuild test robot. Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 51 +++++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1d9c8cff5ac3..68177f065117 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -91,32 +91,6 @@ static int vti_rcv_proto(struct sk_buff *skb) return vti_rcv(skb, 0, false); } -static int vti_rcv_tunnel(struct sk_buff *skb) -{ - struct ip_tunnel_net *itn = net_generic(dev_net(skb->dev), vti_net_id); - const struct iphdr *iph = ip_hdr(skb); - struct ip_tunnel *tunnel; - - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, - iph->saddr, iph->daddr, 0); - if (tunnel) { - struct tnl_ptk_info tpi = { - .proto = htons(ETH_P_IP), - }; - - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto, false)) - goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, false); - } - - return -EINVAL; -drop: - kfree_skb(skb); - return 0; -} - static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -495,11 +469,22 @@ static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .priority = 100, }; -static struct xfrm_tunnel ipip_handler __read_mostly = { +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +static int vti_rcv_tunnel(struct sk_buff *skb) +{ + XFRM_SPI_SKB_CB(skb)->family = AF_INET; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); + + return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false); +} + +static struct xfrm_tunnel vti_ipip_handler __read_mostly = { .handler = vti_rcv_tunnel, + .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; +#endif static int __net_init vti_init_net(struct net *net) { @@ -669,10 +654,12 @@ static int __init vti_init(void) if (err < 0) goto xfrm_proto_comp_failed; +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) msg = "ipip tunnel"; - err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_failed; +#endif msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); @@ -682,8 +669,10 @@ static int __init vti_init(void) return err; rtnl_link_failed: - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); xfrm_tunnel_failed: +#endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); @@ -699,7 +688,9 @@ pernet_dev_failed: static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); +#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); +#endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
-- cgit v1.2.3
From e6ce64570f2451684b4f9bcbaee6c40c4a7dff82 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 6 Jul 2020 20:01:33 +0800 Subject: ip_vti: support IPIP6 tunnel processing For IPIP6 tunnel processing, the functions called will be the same as those for the IPIP tunnel. 
So reuse it and register it with family == AF_INET6. Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv4/ip_vti.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 68177f065117..c0b97b8f6fbd 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -658,7 +658,12 @@ static int __init vti_init(void) msg = "ipip tunnel"; err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); if (err < 0) - goto xfrm_tunnel_failed; + goto xfrm_tunnel_ipip_failed; +#if IS_ENABLED(CONFIG_IPV6) + err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET6); + if (err < 0) + goto xfrm_tunnel_ipip6_failed; +#endif #endif msg = "netlink interface"; @@ -670,8 +675,12 @@ rtnl_link_failed: #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +#if IS_ENABLED(CONFIG_IPV6) + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET6); +xfrm_tunnel_ipip6_failed: +#endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); -xfrm_tunnel_failed: +xfrm_tunnel_ipip_failed: #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: @@ -689,6 +698,9 @@ static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) +#if IS_ENABLED(CONFIG_IPV6) + xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET6); +#endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
-- cgit v1.2.3
From d5a7a5057387d79b91a6e2fd78a76ccd53f91e6c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 6 Jul 2020 20:01:36 +0800 Subject: ipcomp: assign if_id to child tunnel from parent tunnel The child tunnel if_id will be used for xfrm interface's lookup when processing the IP(6)IP(6) packets in the next patches. Signed-off-by: Xin Long Signed-off-by: Steffen Klassert --- net/ipv4/ipcomp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 59bfa3825810..b42683212c65 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -72,6 +72,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->props.flags = x->props.flags; t->props.extra_flags = x->props.extra_flags; memcpy(&t->mark, &x->mark, sizeof(t->mark)); + t->if_id = x->if_id; if (xfrm_init_state(t)) goto error;
-- cgit v1.2.3
From 3f935c75eb52dd968351dba824adf466fb9c9429 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 9 Jul 2020 15:12:39 +0200 Subject: inet_diag: support for wider protocol numbers After commit bf9765145b85 ("sock: Make sk_protocol a 16-bit value") the current size of 'sdiag_protocol' is not sufficient to represent the possible protocol values. This change introduces a new inet diag request attribute to let user space specify the relevant protocol number using u32 values. The attribute is parsed by the inet diag core on get/dump command and the extended protocol value, if available, is preferred to 'sdiag_protocol' to lookup the diag handler. The parsed attributes are exposed to all the diag handlers via the cb->data. Note that inet_diag_dump_one_icsk() is left unmodified, as it will not be used by protocols using the extended attribute. Suggested-by: David S. Miller Co-developed-by: Christoph Paasch Signed-off-by: Christoph Paasch Acked-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. 
Miller --- net/ipv4/inet_diag.c | 65 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 125f4f8a36b4..4a98dd736270 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -52,6 +52,11 @@ static DEFINE_MUTEX(inet_diag_table_mutex); static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { + if (proto < 0 || proto >= IPPROTO_MAX) { + mutex_lock(&inet_diag_table_mutex); + return ERR_PTR(-ENOENT); + } + if (!inet_diag_table[proto]) sock_load_diag_module(AF_INET, proto); @@ -181,6 +186,28 @@ errout: } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); +static void inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr **req_nlas) +{ + struct nlattr *nla; + int remaining; + + nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { + int type = nla_type(nla); + + if (type < __INET_DIAG_REQ_MAX) + req_nlas[type] = nla; + } +} + +static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, + const struct inet_diag_dump_data *data) +{ + if (data->req_nlas[INET_DIAG_REQ_PROTOCOL]) + return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]); + return req->sdiag_protocol; +} + #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, @@ -198,7 +225,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, void *info = NULL; cb_data = cb->data; - handler = inet_diag_table[req->sdiag_protocol]; + handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)]; BUG_ON(!handler); nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -539,20 +566,25 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, + int hdrlen, const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; - int err; + struct inet_diag_dump_data dump_data; + int err, protocol; - handler = inet_diag_lock_handler(req->sdiag_protocol); + memset(&dump_data, 0, sizeof(dump_data)); + inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); + protocol = inet_diag_get_protocol(req, &dump_data); + + handler = inet_diag_lock_handler(protocol); if (IS_ERR(handler)) { err = PTR_ERR(handler); } else if (cmd == SOCK_DIAG_BY_FAMILY) { - struct inet_diag_dump_data empty_dump_data = {}; struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, - .data = &empty_dump_data, + .data = &dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { @@ -1103,13 +1135,16 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { + struct inet_diag_dump_data *cb_data = cb->data; const struct inet_diag_handler *handler; u32 prev_min_dump_alloc; - int err = 0; + int protocol, err = 0; + + protocol = inet_diag_get_protocol(r, cb_data); again: prev_min_dump_alloc = cb->min_dump_alloc; - handler = inet_diag_lock_handler(r->sdiag_protocol); + handler = inet_diag_lock_handler(protocol); if (!IS_ERR(handler)) handler->dump(skb, cb, r); else @@