From c410bf01933e5e09d142c66c3df9ad470a7eec13 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 11 May 2020 14:54:34 +0100
Subject: rxrpc: Fix the excessive initial retransmission timeout

rxrpc currently uses a fixed 4s retransmission timeout until the RTT is
sufficiently sampled.  This can cause problems with some fileservers with
calls to the cache manager in the afs filesystem being dropped from the
fileserver because a packet goes missing and the retransmission timeout is
greater than the call expiry timeout.

Fix this by:

 (1) Copying the RTT/RTO calculation code from Linux's TCP implementation
     and altering it to fit rxrpc.

 (2) Altering the various users of the RTT to make use of the new SRTT
     value.

 (3) Replacing the use of rxrpc_resend_timeout to use the calculated RTO
     value instead (which is needed in jiffies), along with a backoff.

Notes:

 (1) rxrpc provides RTT samples by matching the serial numbers on outgoing
     DATA packets that have the RXRPC_REQUEST_ACK set and PING ACK packets
     against the reference serial number in incoming REQUESTED ACK and
     PING-RESPONSE ACK packets.

 (2) Each packet that is transmitted on an rxrpc connection gets a new
     per-connection serial number, even for retransmissions, so an ACK can
     be cross-referenced to a specific trigger packet.  This allows RTT
     information to be drawn from retransmitted DATA packets also.

 (3) rxrpc maintains the RTT/RTO state on the rxrpc_peer record rather than
     on an rxrpc_call because many RPC calls won't live long enough to
     generate more than one sample.

 (4) The calculated SRTT value is in units of 8ths of a microsecond rather
     than nanoseconds.

The (S)RTT and RTO values are displayed in /proc/net/rxrpc/peers.

Fixes: 17926a79320a ([AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both"")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/Makefile      |   1 +
 net/rxrpc/ar-internal.h |  25 +++++--
 net/rxrpc/call_accept.c |   2 +-
 net/rxrpc/call_event.c  |  22 ++----
 net/rxrpc/input.c       |   6 +-
 net/rxrpc/misc.c        |   5 --
 net/rxrpc/output.c      |   9 +--
 net/rxrpc/peer_event.c  |  46 ------------
 net/rxrpc/peer_object.c |  12 +--
 net/rxrpc/proc.c        |   8 +-
 net/rxrpc/rtt.c         | 195 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/rxrpc/sendmsg.c     |  26 +++----
 net/rxrpc/sysctl.c      |   9 ---
 13 files changed, 248 insertions(+), 118 deletions(-)
 create mode 100644 net/rxrpc/rtt.c

(limited to 'net')

diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index 6ffb7e9887ce..ddd0f95713a9 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -25,6 +25,7 @@ rxrpc-y := \
 	peer_event.o \
 	peer_object.o \
 	recvmsg.o \
+	rtt.o \
 	security.o \
 	sendmsg.o \
 	skbuff.o \
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 3eb1ab40ca5c..9fe264bec70c 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -7,6 +7,7 @@
 
 #include <linux/atomic.h>
 #include <linux/seqlock.h>
+#include <linux/win_minmax.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/sock.h>
@@ -311,11 +312,14 @@ struct rxrpc_peer {
 #define RXRPC_RTT_CACHE_SIZE 32
 	spinlock_t		rtt_input_lock;	/* RTT lock for input routine */
 	ktime_t			rtt_last_req;	/* Time of last RTT request */
-	u64			rtt;		/* Current RTT estimate (in nS) */
-	u64			rtt_sum;	/* Sum of cache contents */
-	u64			rtt_cache[RXRPC_RTT_CACHE_SIZE]; /* Determined RTT cache */
-	u8			rtt_cursor;	/* next entry at which to insert */
-	u8			rtt_usage;	/* amount of cache actually used */
+	unsigned int		rtt_count;	/* Number of samples we've got */
+
+	u32			srtt_us;	/* smoothed round trip time << 3 in usecs */
+	u32			mdev_us;	/* medium deviation			*/
+	u32			mdev_max_us;	/* maximal mdev for the last rtt period	*/
+	u32			rttvar_us;	/* smoothed mdev_max			*/
+	u32			rto_j;		/* Retransmission timeout in jiffies */
+	u8			backoff;	/* Backoff timeout */
 
 	u8			cong_cwnd;	/* Congestion window size */
 };
@@ -1041,7 +1045,6 @@ extern unsigned long rxrpc_idle_ack_delay;
 extern unsigned int rxrpc_rx_window_size;
 extern unsigned int rxrpc_rx_mtu;
 extern unsigned int rxrpc_rx_jumbo_max;
-extern unsigned long rxrpc_resend_timeout;
 
 extern const s8 rxrpc_ack_priority[];
 
@@ -1069,8 +1072,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *);
  * peer_event.c
  */
 void rxrpc_error_report(struct sock *);
-void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
-			rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
 void rxrpc_peer_keepalive_worker(struct work_struct *);
 
 /*
@@ -1102,6 +1103,14 @@ extern const struct seq_operations rxrpc_peer_seq_ops;
 void rxrpc_notify_socket(struct rxrpc_call *);
 int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int);
 
+/*
+ * rtt.c
+ */
+void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
+			rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
+unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *, bool);
+void rxrpc_peer_init_rtt(struct rxrpc_peer *);
+
 /*
  * rxkad.c
  */
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 70e44abf106c..b7611cc159e5 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -248,7 +248,7 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb)
 	struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
 	ktime_t now = skb->tstamp;
 
-	if (call->peer->rtt_usage < 3 ||
+	if (call->peer->rtt_count < 3 ||
 	    ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now))
 		rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial,
 				  true, true,
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index cedbbb3a7c2e..2a65ac41055f 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -111,8 +111,8 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
 	} else {
 		unsigned long now = jiffies, ack_at;
 
-		if (call->peer->rtt_usage > 0)
-			ack_at = nsecs_to_jiffies(call->peer->rtt);
+		if (call->peer->srtt_us != 0)
+			ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3);
 		else
 			ack_at = expiry;
 
@@ -157,24 +157,18 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)
 static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 {
 	struct sk_buff *skb;
-	unsigned long resend_at;
+	unsigned long resend_at, rto_j;
 	rxrpc_seq_t cursor, seq, top;
-	ktime_t now, max_age, oldest, ack_ts, timeout, min_timeo;
+	ktime_t now, max_age, oldest, ack_ts;
 	int ix;
 	u8 annotation, anno_type, retrans = 0, unacked = 0;
 
 	_enter("{%d,%d}", call->tx_hard_ack, call->tx_top);
 
-	if (call->peer->rtt_usage > 1)
-		timeout = ns_to_ktime(call->peer->rtt * 3 / 2);
-	else
-		timeout = ms_to_ktime(rxrpc_resend_timeout);
-	min_timeo = ns_to_ktime((1000000000 / HZ) * 4);
-	if (ktime_before(timeout, min_timeo))
-		timeout = min_timeo;
+	rto_j = call->peer->rto_j;
 
 	now = ktime_get_real();
-	max_age = ktime_sub(now, timeout);
+	max_age = ktime_sub(now, jiffies_to_usecs(rto_j));
 
 	spin_lock_bh(&call->lock);
 
@@ -219,7 +213,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 	}
 
 	resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest)));
-	resend_at += jiffies + rxrpc_resend_timeout;
+	resend_at += jiffies + rto_j;
 	WRITE_ONCE(call->resend_at, resend_at);
 
 	if (unacked)
@@ -234,7 +228,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 					rxrpc_timer_set_for_resend);
 		spin_unlock_bh(&call->lock);
 		ack_ts = ktime_sub(now, call->acks_latest_ts);
-		if (ktime_to_ns(ack_ts) < call->peer->rtt)
+		if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3))
 			goto out;
 		rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false,
 				  rxrpc_propose_ack_ping_for_lost_ack);
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 69e09d69c896..e438bfd3fdf5 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -91,11 +91,11 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
 		/* We analyse the number of packets that get ACK'd per RTT
 		 * period and increase the window if we managed to fill it.
 		 */
-		if (call->peer->rtt_usage == 0)
+		if (call->peer->rtt_count == 0)
 			goto out;
 		if (ktime_before(skb->tstamp,
-				 ktime_add_ns(call->cong_tstamp,
-					      call->peer->rtt)))
+				 ktime_add_us(call->cong_tstamp,
+					      call->peer->srtt_us >> 3)))
 			goto out_no_clear_ca;
 		change = rxrpc_cong_rtt_window_end;
 		call->cong_tstamp = skb->tstamp;
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 214405f75346..d4144fd86f84 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -63,11 +63,6 @@ unsigned int rxrpc_rx_mtu = 5692;
  */
 unsigned int rxrpc_rx_jumbo_max = 4;
 
-/*
- * Time till packet resend (in milliseconds).
- */
-unsigned long rxrpc_resend_timeout = 4 * HZ;
-
 const s8 rxrpc_ack_priority[] = {
 	[0]				= 0,
 	[RXRPC_ACK_DELAY]		= 1,
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 90e263c6aa69..f8b632a5c619 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -369,7 +369,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	    (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) ||
 	     retrans ||
 	     call->cong_mode == RXRPC_CALL_SLOW_START ||
-	     (call->peer->rtt_usage < 3 && sp->hdr.seq & 1) ||
+	     (call->peer->rtt_count < 3 && sp->hdr.seq & 1) ||
 	     ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
 			  ktime_get_real())))
 		whdr.flags |= RXRPC_REQUEST_ACK;
@@ -423,13 +423,10 @@ done:
 		if (whdr.flags & RXRPC_REQUEST_ACK) {
 			call->peer->rtt_last_req = skb->tstamp;
 			trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, serial);
-			if (call->peer->rtt_usage > 1) {
+			if (call->peer->rtt_count > 1) {
 				unsigned long nowj = jiffies, ack_lost_at;
 
-				ack_lost_at = nsecs_to_jiffies(2 * call->peer->rtt);
-				if (ack_lost_at < 1)
-					ack_lost_at = 1;
-
+				ack_lost_at = rxrpc_get_rto_backoff(call->peer, retrans);
 				ack_lost_at += nowj;
 				WRITE_ONCE(call->ack_lost_at, ack_lost_at);
 				rxrpc_reduce_call_timer(call, ack_lost_at, nowj,
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 923b263c401b..b1449d971883 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -295,52 +295,6 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error,
 	}
 }
 
-/*
- * Add RTT information to cache.  This is called in softirq mode and has
- * exclusive access to the peer RTT data.
- */
-void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
-			rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
-			ktime_t send_time, ktime_t resp_time)
-{
-	struct rxrpc_peer *peer = call->peer;
-	s64 rtt;
-	u64 sum = peer->rtt_sum, avg;
-	u8 cursor = peer->rtt_cursor, usage = peer->rtt_usage;
-
-	rtt = ktime_to_ns(ktime_sub(resp_time, send_time));
-	if (rtt < 0)
-		return;
-
-	spin_lock(&peer->rtt_input_lock);
-
-	/* Replace the oldest datum in the RTT buffer */
-	sum -= peer->rtt_cache[cursor];
-	sum += rtt;
-	peer->rtt_cache[cursor] = rtt;
-	peer->rtt_cursor = (cursor + 1) & (RXRPC_RTT_CACHE_SIZE - 1);
-	peer->rtt_sum = sum;
-	if (usage < RXRPC_RTT_CACHE_SIZE) {
-		usage++;
-		peer->rtt_usage = usage;
-	}
-
-	spin_unlock(&peer->rtt_input_lock);
-
-	/* Now recalculate the average */
-	if (usage == RXRPC_RTT_CACHE_SIZE) {
-		avg = sum / RXRPC_RTT_CACHE_SIZE;
-	} else {
-		avg = sum;
-		do_div(avg, usage);
-	}
-
-	/* Don't need to update this under lock */
-	peer->rtt = avg;
-	trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt,
-			   usage, avg);
-}
-
 /*
  * Perform keep-alive pings.
  */
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 452163eadb98..ca29976bb193 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -225,6 +225,8 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp)
 		spin_lock_init(&peer->rtt_input_lock);
 		peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
 
+		rxrpc_peer_init_rtt(peer);
+
 		if (RXRPC_TX_SMSS > 2190)
 			peer->cong_cwnd = 2;
 		else if (RXRPC_TX_SMSS > 1095)
@@ -497,14 +499,14 @@ void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call,
 EXPORT_SYMBOL(rxrpc_kernel_get_peer);
 
 /**
- * rxrpc_kernel_get_rtt - Get a call's peer RTT
+ * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT
  * @sock: The socket on which the call is in progress.
  * @call: The call to query
  *
- * Get the call's peer RTT.
+ * Get the call's peer smoothed RTT.
  */
-u64 rxrpc_kernel_get_rtt(struct socket *sock, struct rxrpc_call *call)
+u32 rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call)
 {
-	return call->peer->rtt;
+	return call->peer->srtt_us >> 3;
 }
-EXPORT_SYMBOL(rxrpc_kernel_get_rtt);
+EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index b9d053e42821..8b179e3c802a 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -222,7 +222,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
 		seq_puts(seq,
 			 "Proto Local                                          "
 			 " Remote                                         "
-			 " Use CW  MTU   LastUse          RTT Rc\n"
+			 " Use  CW   MTU LastUse      RTT      RTO\n"
 			 );
 		return 0;
 	}
@@ -236,15 +236,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
 	now = ktime_get_seconds();
 	seq_printf(seq,
 		   "UDP   %-47.47s %-47.47s %3u"
-		   " %3u %5u %6llus %12llu %2u\n",
+		   " %3u %5u %6llus %8u %8u\n",
 		   lbuff,
 		   rbuff,
 		   atomic_read(&peer->usage),
 		   peer->cong_cwnd,
 		   peer->mtu,
 		   now - peer->last_tx_at,
-		   peer->rtt,
-		   peer->rtt_cursor);
+		   peer->srtt_us >> 3,
+		   jiffies_to_usecs(peer->rto_j));
 
 	return 0;
 }
diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c
new file mode 100644
index 000000000000..928d8b34a3ee
--- /dev/null
+++ b/net/rxrpc/rtt.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+/* RTT/RTO calculation.
+ *
+ * Adapted from TCP for AF_RXRPC by David Howells (dhowells@redhat.com)
+ *
+ * https://tools.ietf.org/html/rfc6298
+ * https://tools.ietf.org/html/rfc1122#section-4.2.3.1
+ * http://ccr.sigcomm.org/archive/1995/jan95/ccr-9501-partridge87.pdf
+ */
+
+#include <linux/net.h>
+#include "ar-internal.h"
+
+#define RXRPC_RTO_MAX	((unsigned)(120 * HZ))
+#define RXRPC_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC6298 2.1 initial RTO value	*/
+#define rxrpc_jiffies32 ((u32)jiffies)		/* As rxrpc_jiffies32 */
+#define rxrpc_min_rtt_wlen 300			/* As sysctl_tcp_min_rtt_wlen */
+
+static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer)
+{
+	return 200;
+}
+
+static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer)
+{
+	return _usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us);
+}
+
+static u32 rxrpc_bound_rto(u32 rto)
+{
+	return min(rto, RXRPC_RTO_MAX);
+}
+
+/*
+ * Called to compute a smoothed rtt estimate. The data fed to this
+ * routine either comes from timestamps, or from segments that were
+ * known _not_ to have been retransmitted [see Karn/Partridge
+ * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
+ * piece by Van Jacobson.
+ * NOTE: the next three routines used to be one big routine.
+ * To save cycles in the RFC 1323 implementation it was better to break
+ * it up into three procedures. -- erics
+ */
+static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
+{
+	long m = sample_rtt_us; /* RTT */
+	u32 srtt = peer->srtt_us;
+
+	/*	The following amusing code comes from Jacobson's
+	 *	article in SIGCOMM '88.  Note that rtt and mdev
+	 *	are scaled versions of rtt and mean deviation.
+	 *	This is designed to be as fast as possible
+	 *	m stands for "measurement".
+	 *
+	 *	On a 1990 paper the rto value is changed to:
+	 *	RTO = rtt + 4 * mdev
+	 *
+	 * Funny. This algorithm seems to be very broken.
+	 * These formulae increase RTO, when it should be decreased, increase
+	 * too slowly, when it should be increased quickly, decrease too quickly
+	 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
+	 * does not matter how to _calculate_ it. Seems, it was trap
+	 * that VJ failed to avoid. 8)
+	 */
+	if (srtt != 0) {
+		m -= (srtt >> 3);	/* m is now error in rtt est */
+		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
+		if (m < 0) {
+			m = -m;		/* m is now abs(error) */
+			m -= (peer->mdev_us >> 2);   /* similar update on mdev */
+			/* This is similar to one of Eifel findings.
+			 * Eifel blocks mdev updates when rtt decreases.
+			 * This solution is a bit different: we use finer gain
+			 * for mdev in this case (alpha*beta).
+			 * Like Eifel it also prevents growth of rto,
+			 * but also it limits too fast rto decreases,
+			 * happening in pure Eifel.
+			 */
+			if (m > 0)
+				m >>= 3;
+		} else {
+			m -= (peer->mdev_us >> 2);   /* similar update on mdev */
+		}
+
+		peer->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
+		if (peer->mdev_us > peer->mdev_max_us) {
+			peer->mdev_max_us = peer->mdev_us;
+			if (peer->mdev_max_us > peer->rttvar_us)
+				peer->rttvar_us = peer->mdev_max_us;
+		}
+	} else {
+		/* no previous measure. */
+		srtt = m << 3;		/* take the measured time to be rtt */
+		peer->mdev_us = m << 1;	/* make sure rto = 3*rtt */
+		peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer));
+		peer->mdev_max_us = peer->rttvar_us;
+	}
+
+	peer->srtt_us = max(1U, srtt);
+}
+
+/*
+ * Calculate rto without backoff.  This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+static void rxrpc_set_rto(struct rxrpc_peer *peer)
+{
+	u32 rto;
+
+	/* 1. If rtt variance happened to be less 50msec, it is hallucination.
+	 *    It cannot be less due to utterly erratic ACK generation made
+	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
+	 *    to do with delayed acks, because at cwnd>2 true delack timeout
+	 *    is invisible. Actually, Linux-2.4 also generates erratic
+	 *    ACKs in some circumstances.
+	 */
+	rto = __rxrpc_set_rto(peer);
+
+	/* 2. Fixups made earlier cannot be right.
+	 *    If we do not estimate RTO correctly without them,
+	 *    all the algo is pure shit and should be replaced
+	 *    with correct one. It is exactly, which we pretend to do.
+	 */
+
+	/* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo
+	 * guarantees that rto is higher.
+	 */
+	peer->rto_j = rxrpc_bound_rto(rto);
+}
+
+static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us)
+{
+	if (rtt_us < 0)
+		return;
+
+	//rxrpc_update_rtt_min(peer, rtt_us);
+	rxrpc_rtt_estimator(peer, rtt_us);
+	rxrpc_set_rto(peer);
+
+	/* RFC6298: only reset backoff on valid RTT measurement. */
+	peer->backoff = 0;
+}
+
+/*
+ * Add RTT information to cache.  This is called in softirq mode and has
+ * exclusive access to the peer RTT data.
+ */
+void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+			rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
+			ktime_t send_time, ktime_t resp_time)
+{
+	struct rxrpc_peer *peer = call->peer;
+	s64 rtt_us;
+
+	rtt_us = ktime_to_us(ktime_sub(resp_time, send_time));
+	if (rtt_us < 0)
+		return;
+
+	spin_lock(&peer->rtt_input_lock);
+	rxrpc_ack_update_rtt(peer, rtt_us);
+	if (peer->rtt_count < 3)
+		peer->rtt_count++;
+	spin_unlock(&peer->rtt_input_lock);
+
+	trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial,
+			   peer->srtt_us >> 3, peer->rto_j);
+}
+
+/*
+ * Get the retransmission timeout to set in jiffies, backing it off each time
+ * we retransmit.
+ */
+unsigned long rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans)
+{
+	u64 timo_j;
+	u8 backoff = READ_ONCE(peer->backoff);
+
+	timo_j = peer->rto_j;
+	timo_j <<= backoff;
+	if (retrans && timo_j * 2 <= RXRPC_RTO_MAX)
+		WRITE_ONCE(peer->backoff, backoff + 1);
+
+	if (timo_j < 1)
+		timo_j = 1;
+
+	return timo_j;
+}
+
+void rxrpc_peer_init_rtt(struct rxrpc_peer *peer)
+{
+	peer->rto_j	= RXRPC_TIMEOUT_INIT;
+	peer->mdev_us	= jiffies_to_usecs(RXRPC_TIMEOUT_INIT);
+	peer->backoff	= 0;
+	//minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U);
+}
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 0fcf157aa09f..5e9c43d4a314 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -66,15 +66,14 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
 					    struct rxrpc_call *call)
 {
 	rxrpc_seq_t tx_start, tx_win;
-	signed long rtt2, timeout;
-	u64 rtt;
+	signed long rtt, timeout;
 
-	rtt = READ_ONCE(call->peer->rtt);
-	rtt2 = nsecs_to_jiffies64(rtt) * 2;
-	if (rtt2 < 2)
-		rtt2 = 2;
+	rtt = READ_ONCE(call->peer->srtt_us) >> 3;
+	rtt = usecs_to_jiffies(rtt) * 2;
+	if (rtt < 2)
+		rtt = 2;
 
-	timeout = rtt2;
+	timeout = rtt;
 	tx_start = READ_ONCE(call->tx_hard_ack);
 
 	for (;;) {
@@ -92,7 +91,7 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
 			return -EINTR;
 
 		if (tx_win != tx_start) {
-			timeout = rtt2;
+			timeout = rtt;
 			tx_start = tx_win;
 		}
 
@@ -271,16 +270,9 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
 		_debug("need instant resend %d", ret);
 		rxrpc_instant_resend(call, ix);
 	} else {
-		unsigned long now = jiffies, resend_at;
+		unsigned long now = jiffies;
+		unsigned long resend_at = now + call->peer->rto_j;
 
-		if (call->peer->rtt_usage > 1)
-			resend_at = nsecs_to_jiffies(call->peer->rtt * 3 / 2);
-		else
-			resend_at = rxrpc_resend_timeout;
-		if (resend_at < 1)
-			resend_at = 1;
-
-		resend_at += now;
 		WRITE_ONCE(call->resend_at, resend_at);
 		rxrpc_reduce_call_timer(call, resend_at, now,
 					rxrpc_timer_set_for_send);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 2bbb38161851..18dade4e6f9a 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -71,15 +71,6 @@ static struct ctl_table rxrpc_sysctl_table[] = {
 		.extra1		= (void *)&one_jiffy,
 		.extra2		= (void *)&max_jiffies,
 	},
-	{
-		.procname	= "resend_timeout",
-		.data		= &rxrpc_resend_timeout,
-		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_ms_jiffies_minmax,
-		.extra1		= (void *)&one_jiffy,
-		.extra2		= (void *)&max_jiffies,
-	},
 
 	/* Non-time values */
 	{
-- 
cgit v1.2.3


From b6dd5acde3f165e364881c36de942c5b252e2a27 Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Date: Sat, 16 May 2020 13:15:15 +0530
Subject: ipv6: Fix suspicious RCU usage warning in ip6mr

This patch fixes the following warning:

=============================
WARNING: suspicious RCU usage
5.7.0-rc4-next-20200507-syzkaller #0 Not tainted
-----------------------------
net/ipv6/ip6mr.c:124 RCU-list traversed in non-reader section!!

ipmr_new_table() returns an existing table, but there is no table at
init. Therefore the condition: either holding rtnl or the list is empty
is used.

Fixes: d1db275dd3f6e ("ipv6: ip6mr: support multiple tables")
Reported-by: kernel test robot <lkp@intel.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 65a54d74acc1..1e223e26f079 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -98,7 +98,8 @@ static void ipmr_expire_process(struct timer_list *t);
 #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
 #define ip6mr_for_each_table(mrt, net) \
 	list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \
-				lockdep_rtnl_is_held())
+				lockdep_rtnl_is_held() || \
+				list_empty(&net->ipv6.mr6_tables))
 
 static struct mr_table *ip6mr_mr_table_iter(struct net *net,
 					    struct mr_table *mrt)
-- 
cgit v1.2.3


From 5e5502e012b8129e11be616acb0f9c34bc8f8adb Mon Sep 17 00:00:00 2001
From: DENG Qingfang <dqfext@gmail.com>
Date: Wed, 13 May 2020 23:10:16 +0800
Subject: net: dsa: mt7530: fix roaming from DSA user ports

When a client moves from a DSA user port to a software port in a bridge,
it cannot reach any other clients that connected to the DSA user ports.
That is because SA learning on the CPU port is disabled, so the switch
ignores the client's frames from the CPU port and still thinks it is at
the user port.

Fix it by enabling SA learning on the CPU port.

To prevent the switch from learning from flooding frames from the CPU
port, set skb->offload_fwd_mark to 1 for unicast and broadcast frames,
and let the switch flood them instead of trapping to the CPU port.
Multicast frames still need to be trapped to the CPU port for snooping,
so set the SA_DIS bit of the MTK tag to 1 when transmitting those frames
to disable SA learning.

Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch")
Signed-off-by: DENG Qingfang <dqfext@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/tag_mtk.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'net')

diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index b5705cba8318..d6619edd53e5 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -15,6 +15,7 @@
 #define MTK_HDR_XMIT_TAGGED_TPID_8100	1
 #define MTK_HDR_RECV_SOURCE_PORT_MASK	GENMASK(2, 0)
 #define MTK_HDR_XMIT_DP_BIT_MASK	GENMASK(5, 0)
+#define MTK_HDR_XMIT_SA_DIS		BIT(6)
 
 static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 				    struct net_device *dev)
@@ -22,6 +23,9 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 	struct dsa_port *dp = dsa_slave_to_port(dev);
 	u8 *mtk_tag;
 	bool is_vlan_skb = true;
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	bool is_multicast_skb = is_multicast_ether_addr(dest) &&
+				!is_broadcast_ether_addr(dest);
 
 	/* Build the special tag after the MAC Source Address. If VLAN header
 	 * is present, it's required that VLAN header and special tag is
@@ -47,6 +51,10 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
 		     MTK_HDR_XMIT_UNTAGGED;
 	mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK;
 
+	/* Disable SA learning for multicast frames */
+	if (unlikely(is_multicast_skb))
+		mtk_tag[1] |= MTK_HDR_XMIT_SA_DIS;
+
 	/* Tag control information is kept for 802.1Q */
 	if (!is_vlan_skb) {
 		mtk_tag[2] = 0;
@@ -61,6 +69,9 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 {
 	int port;
 	__be16 *phdr, hdr;
+	unsigned char *dest = eth_hdr(skb)->h_dest;
+	bool is_multicast_skb = is_multicast_ether_addr(dest) &&
+				!is_broadcast_ether_addr(dest);
 
 	if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN)))
 		return NULL;
@@ -86,6 +97,10 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!skb->dev)
 		return NULL;
 
+	/* Only unicast or broadcast frames are offloaded */
+	if (likely(!is_multicast_skb))
+		skb->offload_fwd_mark = 1;
+
 	return skb;
 }
 
-- 
cgit v1.2.3


From 84be69b869a5a496a6cfde9b3c29509207a1f1fa Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Sun, 17 May 2020 11:26:32 -0600
Subject: nexthop: Fix attribute checking for groups

For nexthop groups, attributes after NHA_GROUP_TYPE are invalid, but
nh_check_attr_group starts checking at NHA_GROUP. The group type defaults
to multipath and the NHA_GROUP_TYPE is currently optional so this has
slipped through so far. Fix the attribute checking to handle support of
new group types.

Fixes: 430a049190de ("nexthop: Add support for nexthop groups")
Signed-off-by: ASSOGBA Emery <assogba.emery@gmail.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/nexthop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index fdfca534d094..2a31c4af845e 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -433,7 +433,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
 		if (!valid_group_nh(nh, len, extack))
 			return -EINVAL;
 	}
-	for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+	for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) {
 		if (!tb[i])
 			continue;
 
-- 
cgit v1.2.3


From a6211caa634da39d861a47437ffcda8b38ef421b Mon Sep 17 00:00:00 2001
From: Yuqi Jin <jinyuqi@huawei.com>
Date: Sat, 16 May 2020 11:46:49 +0800
Subject: net: revert "net: get rid of an signed integer overflow in
 ip_idents_reserve()"

Commit adb03115f459 ("net: get rid of an signed integer overflow in ip_idents_reserve()")
used atomic_cmpxchg to replace "atomic_add_return" inside the function
"ip_idents_reserve". The reason was to avoid UBSAN warning.
However, this change has caused performance degrade and in GCC-8,
fno-strict-overflow is now mapped to -fwrapv -fwrapv-pointer
and signed integer overflow is now undefined by default at all
optimization levels[1]. Moreover, it was a bug in UBSAN vs -fwrapv
/-fno-strict-overflow, so Let's revert it safely.

[1] https://gcc.gnu.org/gcc-8/changes.html

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Eric Dumazet <edumazet@google.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: Arvind Sankar <nivedita@alum.mit.edu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jiong Wang <jiongwang@huawei.com>
Signed-off-by: Yuqi Jin <jinyuqi@huawei.com>
Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fa829f31a3f5..b73f540fa19b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -491,18 +491,16 @@ u32 ip_idents_reserve(u32 hash, int segs)
 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 	u32 old = READ_ONCE(*p_tstamp);
 	u32 now = (u32)jiffies;
-	u32 new, delta = 0;
+	u32 delta = 0;
 
 	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 		delta = prandom_u32_max(now - old);
 
-	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
-	do {
-		old = (u32)atomic_read(p_id);
-		new = old + delta + segs;
-	} while (atomic_cmpxchg(p_id, old, new) != old);
-
-	return new - segs;
+	/* If UBSAN reports an error there, please make sure your compiler
+	 * supports -fno-strict-overflow before reporting it that was a bug
+	 * in UBSAN, and it has been fixed in GCC-8.
+	 */
+	return atomic_add_return(segs + delta, p_id) - segs;
 }
 EXPORT_SYMBOL(ip_idents_reserve);
 
-- 
cgit v1.2.3


From 12555a2d97e5784eeb105ca9b1b533d4c95f1115 Mon Sep 17 00:00:00 2001
From: Todd Malsbary <todd.malsbary@linux.intel.com>
Date: Tue, 19 May 2020 09:45:34 -0700
Subject: mptcp: use rightmost 64 bits in ADD_ADDR HMAC

This changes the HMAC used in the ADD_ADDR option from the leftmost 64
bits to the rightmost 64 bits as described in RFC 8684, section 3.4.1.

This issue was discovered while adding support to packetdrill for the
ADD_ADDR v1 option.

Fixes: 3df523ab582c ("mptcp: Add ADD_ADDR handling")
Signed-off-by: Todd Malsbary <todd.malsbary@linux.intel.com>
Acked-by: Christoph Paasch <cpaasch@apple.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/options.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 45497af23906..b88fae233a62 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -545,7 +545,7 @@ static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
 
 	mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
 
-	return get_unaligned_be64(hmac);
+	return get_unaligned_be64(&hmac[MPTCP_ADDR_HMAC_LEN - sizeof(u64)]);
 }
 
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -562,7 +562,7 @@ static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
 
 	mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
 
-	return get_unaligned_be64(hmac);
+	return get_unaligned_be64(&hmac[MPTCP_ADDR_HMAC_LEN - sizeof(u64)]);
 }
 #endif
 
-- 
cgit v1.2.3


From 88d7fcfa3b1fe670f0412b95be785aafca63352b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 18 May 2020 17:13:34 -0700
Subject: net: inet_csk: Fix so_reuseport bind-address cache in tb->fast*

The commit 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk")
added a bind-address cache in tb->fast*.  The tb->fast* caches the address
of a sk which has successfully been binded with SO_REUSEPORT ON.  The idea
is to avoid the expensive conflict search in inet_csk_bind_conflict().

There is an issue with wildcard matching where sk_reuseport_match() should
have returned false but it is currently returning true.  It ends up
hiding bind conflict.  For example,

bind("[::1]:443"); /* without SO_REUSEPORT. Succeed. */
bind("[::2]:443"); /* with    SO_REUSEPORT. Succeed. */
bind("[::]:443");  /* with    SO_REUSEPORT. Still Succeed where it shouldn't */

The last bind("[::]:443") with SO_REUSEPORT on should have failed because
it should have a conflict with the very first bind("[::1]:443") which
has SO_REUSEPORT off.  However, the address "[::2]" is cached in
tb->fast* in the second bind. In the last bind, the sk_reuseport_match()
returns true because the binding sk's wildcard addr "[::]" matches with
the "[::2]" cached in tb->fast*.

The correct bind conflict is reported by removing the second
bind such that tb->fast* cache is not involved and forces the
bind("[::]:443") to go through the inet_csk_bind_conflict():

bind("[::1]:443"); /* without SO_REUSEPORT. Succeed. */
bind("[::]:443");  /* with    SO_REUSEPORT. -EADDRINUSE */

The expected behavior for sk_reuseport_match() is, it should only allow
the "cached" tb->fast* address to be used as a wildcard match but not
the address of the binding sk.  To do that, the current
"bool match_wildcard" arg is split into
"bool match_sk1_wildcard" and "bool match_sk2_wildcard".

This change only affects the sk_reuseport_match() which is only
used by inet_csk (e.g. TCP).
The other use cases are calling inet_rcv_saddr_equal() and
this patch makes it pass the same "match_wildcard" arg twice to
the "ipv[46]_rcv_saddr_equal(..., match_wildcard, match_wildcard)".

Cc: Josef Bacik <jbacik@fb.com>
Fixes: 637bc8bbe6c0 ("inet: reset tb->fastreuseport when adding a reuseport sk")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_connection_sock.c | 43 +++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 5f34eb951627..65c29f2bd89f 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,17 +24,19 @@
 #include <net/addrconf.h>
 
 #if IS_ENABLED(CONFIG_IPV6)
-/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
- *                          only, and any IPv4 addresses if not IPv6 only
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
- *                          and 0.0.0.0 equals to 0.0.0.0 only
+/* match_sk*_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses
+ *				if IPv6 only, and any IPv4 addresses
+ *				if not IPv6 only
+ * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
+ *				IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ *				and 0.0.0.0 equals to 0.0.0.0 only
  */
 static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
 				 const struct in6_addr *sk2_rcv_saddr6,
 				 __be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
 				 bool sk1_ipv6only, bool sk2_ipv6only,
-				 bool match_wildcard)
+				 bool match_sk1_wildcard,
+				 bool match_sk2_wildcard)
 {
 	int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
 	int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
@@ -44,8 +46,8 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
 		if (!sk2_ipv6only) {
 			if (sk1_rcv_saddr == sk2_rcv_saddr)
 				return true;
-			if (!sk1_rcv_saddr || !sk2_rcv_saddr)
-				return match_wildcard;
+			return (match_sk1_wildcard && !sk1_rcv_saddr) ||
+				(match_sk2_wildcard && !sk2_rcv_saddr);
 		}
 		return false;
 	}
@@ -53,11 +55,11 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
 	if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
 		return true;
 
-	if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+	if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard &&
 	    !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
 		return true;
 
-	if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+	if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard &&
 	    !(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
 		return true;
 
@@ -69,18 +71,19 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
 }
 #endif
 
-/* match_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
- * match_wildcard == false: addresses must be exactly the same, i.e.
- *                          0.0.0.0 only equals to 0.0.0.0
+/* match_sk*_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
+ * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
+ *				0.0.0.0 only equals to 0.0.0.0
  */
 static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
-				 bool sk2_ipv6only, bool match_wildcard)
+				 bool sk2_ipv6only, bool match_sk1_wildcard,
+				 bool match_sk2_wildcard)
 {
 	if (!sk2_ipv6only) {
 		if (sk1_rcv_saddr == sk2_rcv_saddr)
 			return true;
-		if (!sk1_rcv_saddr || !sk2_rcv_saddr)
-			return match_wildcard;
+		return (match_sk1_wildcard && !sk1_rcv_saddr) ||
+			(match_sk2_wildcard && !sk2_rcv_saddr);
 	}
 	return false;
 }
@@ -96,10 +99,12 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 					    sk2->sk_rcv_saddr,
 					    ipv6_only_sock(sk),
 					    ipv6_only_sock(sk2),
+					    match_wildcard,
 					    match_wildcard);
 #endif
 	return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
-				    ipv6_only_sock(sk2), match_wildcard);
+				    ipv6_only_sock(sk2), match_wildcard,
+				    match_wildcard);
 }
 EXPORT_SYMBOL(inet_rcv_saddr_equal);
 
@@ -285,10 +290,10 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
 					    tb->fast_rcv_saddr,
 					    sk->sk_rcv_saddr,
 					    tb->fast_ipv6_only,
-					    ipv6_only_sock(sk), true);
+					    ipv6_only_sock(sk), true, false);
 #endif
 	return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
-				    ipv6_only_sock(sk), true);
+				    ipv6_only_sock(sk), true, false);
 }
 
 /* Obtain a reference to a local port for the given sock,
-- 
cgit v1.2.3


From c0bbbdc32febd4f034ecbf3ea17865785b2c0652 Mon Sep 17 00:00:00 2001
From: Boris Sukholitko <boris.sukholitko@broadcom.com>
Date: Tue, 19 May 2020 10:32:37 +0300
Subject: __netif_receive_skb_core: pass skb by reference

__netif_receive_skb_core may change the skb pointer passed into it (e.g.
in rx_handler). The original skb may be freed as a result of this
operation.

The callers of __netif_receive_skb_core may further process original skb
by using pt_prev pointer returned by __netif_receive_skb_core thus
leading to unpleasant effects.

The solution is to pass skb by reference into __netif_receive_skb_core.

v2: Added Fixes tag and comment regarding ppt_prev and skb invariant.

Fixes: 88eb1944e18c ("net: core: propagate SKB lists through packet_type lookup")
Signed-off-by: Boris Sukholitko <boris.sukholitko@broadcom.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 6d327b7aa813..2d8aceee4284 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4988,11 +4988,12 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
 	return 0;
 }
 
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 				    struct packet_type **ppt_prev)
 {
 	struct packet_type *ptype, *pt_prev;
 	rx_handler_func_t *rx_handler;
+	struct sk_buff *skb = *pskb;
 	struct net_device *orig_dev;
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
@@ -5023,8 +5024,10 @@ another_round:
 		ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
 		preempt_enable();
 
-		if (ret2 != XDP_PASS)
-			return NET_RX_DROP;
+		if (ret2 != XDP_PASS) {
+			ret = NET_RX_DROP;
+			goto out;
+		}
 		skb_reset_mac_len(skb);
 	}
 
@@ -5174,6 +5177,13 @@ drop:
 	}
 
 out:
+	/* The invariant here is that if *ppt_prev is not NULL
+	 * then skb should also be non-NULL.
+	 *
+	 * Apparently *ppt_prev assignment above holds this invariant due to
+	 * skb dereferencing near it.
+	 */
+	*pskb = skb;
 	return ret;
 }
 
@@ -5183,7 +5193,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
 	struct packet_type *pt_prev = NULL;
 	int ret;
 
-	ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+	ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 	if (pt_prev)
 		ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
 					 skb->dev, pt_prev, orig_dev);
@@ -5261,7 +5271,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
 		struct packet_type *pt_prev = NULL;
 
 		skb_list_del_init(skb);
-		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+		__netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
 		if (!pt_prev)
 			continue;
 		if (pt_curr != pt_prev || od_curr != orig_dev) {
-- 
cgit v1.2.3


From 20a785aa52c82246055a089e55df9dac47d67da1 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Tue, 19 May 2020 16:04:05 -0400
Subject: sctp: Don't add the shutdown timer if its already been added

This BUG halt was reported a while back, but the patch somehow got
missed:

PID: 2879   TASK: c16adaa0  CPU: 1   COMMAND: "sctpn"
 #0 [f418dd28] crash_kexec at c04a7d8c
 #1 [f418dd7c] oops_end at c0863e02
 #2 [f418dd90] do_invalid_op at c040aaca
 #3 [f418de28] error_code (via invalid_op) at c08631a5
    EAX: f34baac0  EBX: 00000090  ECX: f418deb0  EDX: f5542950  EBP: 00000000
    DS:  007b      ESI: f34ba800  ES:  007b      EDI: f418dea0  GS:  00e0
    CS:  0060      EIP: c046fa5e  ERR: ffffffff  EFLAGS: 00010286
 #4 [f418de5c] add_timer at c046fa5e
 #5 [f418de68] sctp_do_sm at f8db8c77 [sctp]
 #6 [f418df30] sctp_primitive_SHUTDOWN at f8dcc1b5 [sctp]
 #7 [f418df48] inet_shutdown at c080baf9
 #8 [f418df5c] sys_shutdown at c079eedf
 #9 [f418df70] sys_socketcall at c079fe88
    EAX: ffffffda  EBX: 0000000d  ECX: bfceea90  EDX: 0937af98
    DS:  007b      ESI: 0000000c  ES:  007b      EDI: b7150ae4
    SS:  007b      ESP: bfceea7c  EBP: bfceeaa8  GS:  0033
    CS:  0073      EIP: b775c424  ERR: 00000066  EFLAGS: 00000282

It appears that the side effect that starts the shutdown timer was processed
multiple times, which can happen as multiple paths can trigger it.  This of
course leads to the BUG halt in add_timer getting called.

Fix seems pretty straightforward, just check before the timer is added if its
already been started.  If it has mod the timer instead to min(current
expiration, new expiration)

Its been tested but not confirmed to fix the problem, as the issue has only
occured in production environments where test kernels are enjoined from being
installed.  It appears to be a sane fix to me though.  Also, recentely,
Jere found a reproducer posted on list to confirm that this resolves the
issues

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: jere.leppanen@nokia.com
CC: marcelo.leitner@gmail.com
CC: netdev@vger.kernel.org
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_sideeffect.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 2bc29463e1dc..9f36fe911d08 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1523,9 +1523,17 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type,
 			timeout = asoc->timeouts[cmd->obj.to];
 			BUG_ON(!timeout);
 
-			timer->expires = jiffies + timeout;
-			sctp_association_hold(asoc);
-			add_timer(timer);
+			/*
+			 * SCTP has a hard time with timer starts.  Because we process
+			 * timer starts as side effects, it can be hard to tell if we
+			 * have already started a timer or not, which leads to BUG
+			 * halts when we call add_timer. So here, instead of just starting
+			 * a timer, if the timer is already started, and just mod
+			 * the timer with the shorter of the two expiration times
+			 */
+			if (!timer_pending(timer))
+				sctp_association_hold(asoc);
+			timer_reduce(timer, jiffies + timeout);
 			break;
 
 		case SCTP_CMD_TIMER_RESTART:
-- 
cgit v1.2.3


From d1f129470e6cb79b8b97fecd12689f6eb49e27fe Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 28 Apr 2020 22:06:54 +0100
Subject: rxrpc: Trace discarded ACKs

Add a tracepoint to track received ACKs that are discarded due to being
outside of the Tx window.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/input.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index e438bfd3fdf5..2f22f082a66c 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -866,8 +866,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 
 	/* Discard any out-of-order or duplicate ACKs (outside lock). */
 	if (before(first_soft_ack, call->ackr_first_seq) ||
-	    before(prev_pkt, call->ackr_prev_seq))
+	    before(prev_pkt, call->ackr_prev_seq)) {
+		trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
+					   first_soft_ack, call->ackr_first_seq,
+					   prev_pkt, call->ackr_prev_seq);
 		return;
+	}
 
 	buf.info.rxMTU = 0;
 	ioffset = offset + nr_acks + 3;
@@ -879,8 +883,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 
 	/* Discard any out-of-order or duplicate ACKs (inside lock). */
 	if (before(first_soft_ack, call->ackr_first_seq) ||
-	    before(prev_pkt, call->ackr_prev_seq))
+	    before(prev_pkt, call->ackr_prev_seq)) {
+		trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
+					   first_soft_ack, call->ackr_first_seq,
+					   prev_pkt, call->ackr_prev_seq);
 		goto out;
+	}
 	call->acks_latest_ts = skb->tstamp;
 
 	call->ackr_first_seq = first_soft_ack;
-- 
cgit v1.2.3


From 441fdee1eaf050ef0040bde0d7af075c1c6a6d8b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 29 Apr 2020 23:48:43 +0100
Subject: rxrpc: Fix ack discard

The Rx protocol has a "previousPacket" field in it that is not handled in
the same way by all protocol implementations.  Sometimes it contains the
serial number of the last DATA packet received, sometimes the sequence
number of the last DATA packet received and sometimes the highest sequence
number so far received.

AF_RXRPC is using this to weed out ACKs that are out of date (it's possible
for ACK packets to get reordered on the wire), but this does not work with
OpenAFS which will just stick the sequence number of the last packet seen
into previousPacket.

The issue being seen is that big AFS FS.StoreData RPC (eg. of ~256MiB) are
timing out when partly sent.  A trace was captured, with an additional
tracepoint to show ACKs being discarded in rxrpc_input_ack().  Here's an
excerpt showing the problem.

 52873.203230: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 0002449c q=00024499 fl=09

A DATA packet with sequence number 00024499 has been transmitted (the "q="
field).

 ...
 52873.243296: rxrpc_rx_ack: c=000004ae 00012a2b DLY r=00024499 f=00024497 p=00024496 n=0
 52873.243376: rxrpc_rx_ack: c=000004ae 00012a2c IDL r=0002449b f=00024499 p=00024498 n=0
 52873.243383: rxrpc_rx_ack: c=000004ae 00012a2d OOS r=0002449d f=00024499 p=0002449a n=2

The Out-Of-Sequence ACK indicates that the server didn't see DATA sequence
number 00024499, but did see seq 0002449a (previousPacket, shown as "p=",
skipped the number, but firstPacket, "f=", which shows the bottom of the
window is set at that point).

 52873.252663: rxrpc_retransmit: c=000004ae q=24499 a=02 xp=14581537
 52873.252664: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244bc q=00024499 fl=0b *RETRANS*

The packet has been retransmitted.  Retransmission recurs until the peer
says it got the packet.

 52873.271013: rxrpc_rx_ack: c=000004ae 00012a31 OOS r=000244a1 f=00024499 p=0002449e n=6

More OOS ACKs indicate that the other packets that are already in the
transmission pipeline are being received.  The specific-ACK list is up to 6
ACKs and NAKs.

 ...
 52873.284792: rxrpc_rx_ack: c=000004ae 00012a49 OOS r=000244b9 f=00024499 p=000244b6 n=30
 52873.284802: rxrpc_retransmit: c=000004ae q=24499 a=0a xp=63505500
 52873.284804: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244c2 q=00024499 fl=0b *RETRANS*
 52873.287468: rxrpc_rx_ack: c=000004ae 00012a4a OOS r=000244ba f=00024499 p=000244b7 n=31
 52873.287478: rxrpc_rx_ack: c=000004ae 00012a4b OOS r=000244bb f=00024499 p=000244b8 n=32

At this point, the server's receive window is full (n=32) with presumably 1
NAK'd packet and 31 ACK'd packets.  We can't transmit any more packets.

 52873.287488: rxrpc_retransmit: c=000004ae q=24499 a=0a xp=61327980
 52873.287489: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244c3 q=00024499 fl=0b *RETRANS*
 52873.293850: rxrpc_rx_ack: c=000004ae 00012a4c DLY r=000244bc f=000244a0 p=00024499 n=25

And now we've received an ACK indicating that a DATA retransmission was
received.  7 packets have been processed (the occupied part of the window
moved, as indicated by f= and n=).

 52873.293853: rxrpc_rx_discard_ack: c=000004ae r=00012a4c 000244a0<00024499 00024499<000244b8

However, the DLY ACK gets discarded because its previousPacket has gone
backwards (from p=000244b8, in the ACK at 52873.287478 to p=00024499 in the
ACK at 52873.293850).

We then end up in a continuous cycle of retransmit/discard.  kafs fails to
update its window because it's discarding the ACKs and can't transmit an
extra packet that would clear the issue because the window is full.
OpenAFS doesn't change the previousPacket value in the ACKs because no new
DATA packets are received with a different previousPacket number.

Fix this by altering the discard check to only discard an ACK based on
previousPacket if there was no advance in the firstPacket.  This allows us
to transmit a new packet which will cause previousPacket to advance in the
next ACK.

The check, however, needs to allow for the possibility that previousPacket
may actually have had the serial number placed in it instead - in which
case it will go outside the window and we should ignore it.

Fixes: 1a2391c30c0b ("rxrpc: Fix detection of out of order acks")
Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/input.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 2f22f082a66c..3be4177baf70 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -802,6 +802,30 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks,
 	}
 }
 
+/*
+ * Return true if the ACK is valid - ie. it doesn't appear to have regressed
+ * with respect to the ack state conveyed by preceding ACKs.
+ */
+static bool rxrpc_is_ack_valid(struct rxrpc_call *call,
+			       rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt)
+{
+	rxrpc_seq_t base = READ_ONCE(call->ackr_first_seq);
+
+	if (after(first_pkt, base))
+		return true; /* The window advanced */
+
+	if (before(first_pkt, base))
+		return false; /* firstPacket regressed */
+
+	if (after_eq(prev_pkt, call->ackr_prev_seq))
+		return true; /* previousPacket hasn't regressed. */
+
+	/* Some rx implementations put a serial number in previousPacket. */
+	if (after_eq(prev_pkt, base + call->tx_winsize))
+		return false;
+	return true;
+}
+
 /*
  * Process an ACK packet.
  *
@@ -865,8 +889,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 	}
 
 	/* Discard any out-of-order or duplicate ACKs (outside lock). */
-	if (before(first_soft_ack, call->ackr_first_seq) ||
-	    before(prev_pkt, call->ackr_prev_seq)) {
+	if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
 		trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
 					   first_soft_ack, call->ackr_first_seq,
 					   prev_pkt, call->ackr_prev_seq);
@@ -882,8 +905,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
 	spin_lock(&call->input_lock);
 
 	/* Discard any out-of-order or duplicate ACKs (inside lock). */
-	if (before(first_soft_ack, call->ackr_first_seq) ||
-	    before(prev_pkt, call->ackr_prev_seq)) {
+	if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
 		trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
 					   first_soft_ack, call->ackr_first_seq,
 					   prev_pkt, call->ackr_prev_seq);
-- 
cgit v1.2.3


From 687775cec056b38a4c8f3291e0dd7a9145f7b667 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 19 May 2020 18:24:43 -0700
Subject: ax25: fix setsockopt(SO_BINDTODEVICE)

syzbot was able to trigger this trace [1], probably by using
a zero optlen.

While we are at it, cap optlen to IFNAMSIZ - 1 instead of IFNAMSIZ.

[1]
BUG: KMSAN: uninit-value in strnlen+0xf9/0x170 lib/string.c:569
CPU: 0 PID: 8807 Comm: syz-executor483 Not tainted 5.7.0-rc4-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x220 lib/dump_stack.c:118
 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121
 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215
 strnlen+0xf9/0x170 lib/string.c:569
 dev_name_hash net/core/dev.c:207 [inline]
 netdev_name_node_lookup net/core/dev.c:277 [inline]
 __dev_get_by_name+0x75/0x2b0 net/core/dev.c:778
 ax25_setsockopt+0xfa3/0x1170 net/ax25/af_ax25.c:654
 __compat_sys_setsockopt+0x4ed/0x910 net/compat.c:403
 __do_compat_sys_setsockopt net/compat.c:413 [inline]
 __se_compat_sys_setsockopt+0xdd/0x100 net/compat.c:410
 __ia32_compat_sys_setsockopt+0x62/0x80 net/compat.c:410
 do_syscall_32_irqs_on arch/x86/entry/common.c:339 [inline]
 do_fast_syscall_32+0x3bf/0x6d0 arch/x86/entry/common.c:398
 entry_SYSENTER_compat+0x68/0x77 arch/x86/entry/entry_64_compat.S:139
RIP: 0023:0xf7f57dd9
Code: 90 e8 0b 00 00 00 f3 90 0f ae e8 eb f9 8d 74 26 00 89 3c 24 c3 90 90 90 90 90 90 90 90 90 90 90 90 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90
RSP: 002b:00000000ffae8c1c EFLAGS: 00000217 ORIG_RAX: 000000000000016e
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000000101
RDX: 0000000000000019 RSI: 0000000020000000 RDI: 0000000000000004
RBP: 0000000000000012 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000

Local variable ----devname@ax25_setsockopt created at:
 ax25_setsockopt+0xe6/0x1170 net/ax25/af_ax25.c:536
 ax25_setsockopt+0xe6/0x1170 net/ax25/af_ax25.c:536

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ax25/af_ax25.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index ff57ea89c27e..fd91cd34f25e 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -635,8 +635,10 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
 		break;
 
 	case SO_BINDTODEVICE:
-		if (optlen > IFNAMSIZ)
-			optlen = IFNAMSIZ;
+		if (optlen > IFNAMSIZ - 1)
+			optlen = IFNAMSIZ - 1;
+
+		memset(devname, 0, sizeof(devname));
 
 		if (copy_from_user(devname, optval, optlen)) {
 			res = -EFAULT;
-- 
cgit v1.2.3


From d69100b8eee27c2d60ee52df76e0b80a8d492d34 Mon Sep 17 00:00:00 2001
From: Stephen Worley <sworley@cumulusnetworks.com>
Date: Tue, 19 May 2020 21:57:12 -0400
Subject: net: nlmsg_cancel() if put fails for nhmsg

Fixes data remnant seen when we fail to reserve space for a
nexthop group during a larger dump.

If we fail the reservation, we goto nla_put_failure and
cancel the message.

Reproduce with the following iproute2 commands:
=====================
ip link add dummy1 type dummy
ip link add dummy2 type dummy
ip link add dummy3 type dummy
ip link add dummy4 type dummy
ip link add dummy5 type dummy
ip link add dummy6 type dummy
ip link add dummy7 type dummy
ip link add dummy8 type dummy
ip link add dummy9 type dummy
ip link add dummy10 type dummy
ip link add dummy11 type dummy
ip link add dummy12 type dummy
ip link add dummy13 type dummy
ip link add dummy14 type dummy
ip link add dummy15 type dummy
ip link add dummy16 type dummy
ip link add dummy17 type dummy
ip link add dummy18 type dummy
ip link add dummy19 type dummy
ip link add dummy20 type dummy
ip link add dummy21 type dummy
ip link add dummy22 type dummy
ip link add dummy23 type dummy
ip link add dummy24 type dummy
ip link add dummy25 type dummy
ip link add dummy26 type dummy
ip link add dummy27 type dummy
ip link add dummy28 type dummy
ip link add dummy29 type dummy
ip link add dummy30 type dummy
ip link add dummy31 type dummy
ip link add dummy32 type dummy

ip link set dummy1 up
ip link set dummy2 up
ip link set dummy3 up
ip link set dummy4 up
ip link set dummy5 up
ip link set dummy6 up
ip link set dummy7 up
ip link set dummy8 up
ip link set dummy9 up
ip link set dummy10 up
ip link set dummy11 up
ip link set dummy12 up
ip link set dummy13 up
ip link set dummy14 up
ip link set dummy15 up
ip link set dummy16 up
ip link set dummy17 up
ip link set dummy18 up
ip link set dummy19 up
ip link set dummy20 up
ip link set dummy21 up
ip link set dummy22 up
ip link set dummy23 up
ip link set dummy24 up
ip link set dummy25 up
ip link set dummy26 up
ip link set dummy27 up
ip link set dummy28 up
ip link set dummy29 up
ip link set dummy30 up
ip link set dummy31 up
ip link set dummy32 up

ip link set dummy33 up
ip link set dummy34 up

ip link set vrf-red up
ip link set vrf-blue up

ip link set dummyVRFred up
ip link set dummyVRFblue up

ip ro add 1.1.1.1/32 dev dummy1
ip ro add 1.1.1.2/32 dev dummy2
ip ro add 1.1.1.3/32 dev dummy3
ip ro add 1.1.1.4/32 dev dummy4
ip ro add 1.1.1.5/32 dev dummy5
ip ro add 1.1.1.6/32 dev dummy6
ip ro add 1.1.1.7/32 dev dummy7
ip ro add 1.1.1.8/32 dev dummy8
ip ro add 1.1.1.9/32 dev dummy9
ip ro add 1.1.1.10/32 dev dummy10
ip ro add 1.1.1.11/32 dev dummy11
ip ro add 1.1.1.12/32 dev dummy12
ip ro add 1.1.1.13/32 dev dummy13
ip ro add 1.1.1.14/32 dev dummy14
ip ro add 1.1.1.15/32 dev dummy15
ip ro add 1.1.1.16/32 dev dummy16
ip ro add 1.1.1.17/32 dev dummy17
ip ro add 1.1.1.18/32 dev dummy18
ip ro add 1.1.1.19/32 dev dummy19
ip ro add 1.1.1.20/32 dev dummy20
ip ro add 1.1.1.21/32 dev dummy21
ip ro add 1.1.1.22/32 dev dummy22
ip ro add 1.1.1.23/32 dev dummy23
ip ro add 1.1.1.24/32 dev dummy24
ip ro add 1.1.1.25/32 dev dummy25
ip ro add 1.1.1.26/32 dev dummy26
ip ro add 1.1.1.27/32 dev dummy27
ip ro add 1.1.1.28/32 dev dummy28
ip ro add 1.1.1.29/32 dev dummy29
ip ro add 1.1.1.30/32 dev dummy30
ip ro add 1.1.1.31/32 dev dummy31
ip ro add 1.1.1.32/32 dev dummy32

ip next add id 1 via 1.1.1.1 dev dummy1
ip next add id 2 via 1.1.1.2 dev dummy2
ip next add id 3 via 1.1.1.3 dev dummy3
ip next add id 4 via 1.1.1.4 dev dummy4
ip next add id 5 via 1.1.1.5 dev dummy5
ip next add id 6 via 1.1.1.6 dev dummy6
ip next add id 7 via 1.1.1.7 dev dummy7
ip next add id 8 via 1.1.1.8 dev dummy8
ip next add id 9 via 1.1.1.9 dev dummy9
ip next add id 10 via 1.1.1.10 dev dummy10
ip next add id 11 via 1.1.1.11 dev dummy11
ip next add id 12 via 1.1.1.12 dev dummy12
ip next add id 13 via 1.1.1.13 dev dummy13
ip next add id 14 via 1.1.1.14 dev dummy14
ip next add id 15 via 1.1.1.15 dev dummy15
ip next add id 16 via 1.1.1.16 dev dummy16
ip next add id 17 via 1.1.1.17 dev dummy17
ip next add id 18 via 1.1.1.18 dev dummy18
ip next add id 19 via 1.1.1.19 dev dummy19
ip next add id 20 via 1.1.1.20 dev dummy20
ip next add id 21 via 1.1.1.21 dev dummy21
ip next add id 22 via 1.1.1.22 dev dummy22
ip next add id 23 via 1.1.1.23 dev dummy23
ip next add id 24 via 1.1.1.24 dev dummy24
ip next add id 25 via 1.1.1.25 dev dummy25
ip next add id 26 via 1.1.1.26 dev dummy26
ip next add id 27 via 1.1.1.27 dev dummy27
ip next add id 28 via 1.1.1.28 dev dummy28
ip next add id 29 via 1.1.1.29 dev dummy29
ip next add id 30 via 1.1.1.30 dev dummy30
ip next add id 31 via 1.1.1.31 dev dummy31
ip next add id 32 via 1.1.1.32 dev dummy32

i=100

while [ $i -le 200 ]
do
ip next add id $i group 1/2/3/4/5/6/7/8/9/10/11/12/13/14/15/16/17/18/19

	echo $i

	((i++))

done

ip next add id 999 group 1/2/3/4/5/6

ip next ls

========================

Fixes: ab84be7e54fc ("net: Initial nexthop code")
Signed-off-by: Stephen Worley <sworley@cumulusnetworks.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/nexthop.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 2a31c4af845e..715e14475220 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -276,6 +276,7 @@ out:
 	return 0;
 
 nla_put_failure:
+	nlmsg_cancel(skb, nlh);
 	return -EMSGSIZE;
 }
 
-- 
cgit v1.2.3


From 7c87e32d2e380228ada79d20ac5b7674718ef097 Mon Sep 17 00:00:00 2001
From: Michal Kubecek <mkubecek@suse.cz>
Date: Sun, 10 May 2020 21:04:09 +0200
Subject: ethtool: count header size in reply size estimate

As ethnl_request_ops::reply_size handlers do not include common header
size into calculated/estimated reply size, it needs to be added in
ethnl_default_doit() and ethnl_default_notify() before allocating the
message. On the other hand, strset_reply_size() should not add common
header size.

Fixes: 728480f12442 ("ethtool: default handlers for GET requests")
Reported-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ethtool/netlink.c | 4 ++--
 net/ethtool/strset.c  | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 0c772318c023..ed5357210193 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -342,7 +342,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
 	ret = ops->reply_size(req_info, reply_data);
 	if (ret < 0)
 		goto err_cleanup;
-	reply_len = ret;
+	reply_len = ret + ethnl_reply_header_size();
 	ret = -ENOMEM;
 	rskb = ethnl_reply_init(reply_len, req_info->dev, ops->reply_cmd,
 				ops->hdr_attr, info, &reply_payload);
@@ -588,7 +588,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd,
 	ret = ops->reply_size(req_info, reply_data);
 	if (ret < 0)
 		goto err_cleanup;
-	reply_len = ret;
+	reply_len = ret + ethnl_reply_header_size();
 	ret = -ENOMEM;
 	skb = genlmsg_new(reply_len, GFP_KERNEL);
 	if (!skb)
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index 95eae5c68a52..0eed4e4909ab 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -324,7 +324,6 @@ static int strset_reply_size(const struct ethnl_req_info *req_base,
 	int len = 0;
 	int ret;
 
-	len += ethnl_reply_header_size();
 	for (i = 0; i < ETH_SS_COUNT; i++) {
 		const struct strset_info *set_info = &data->sets[i];
 
-- 
cgit v1.2.3


From d28ea1fbbf437054ef339afec241019f2c4e2bb6 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Tue, 19 May 2020 23:44:16 +0530
Subject: net: qrtr: Fix passing invalid reference to qrtr_local_enqueue()

Once the traversal of the list is completed with list_for_each_entry(),
the iterator (node) will point to an invalid object. So passing this to
qrtr_local_enqueue() which is outside of the iterator block is erroneous
eventhough the object is not used.

So fix this by passing NULL to qrtr_local_enqueue().

Fixes: bdabad3e363d ("net: Add Qualcomm IPC router")
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Julia Lawall <julia.lawall@lip6.fr>
Signe