From 0ce294d88457bccd7f9991f883fec80022a1ddbd Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 7 Dec 2017 11:33:30 -0800
Subject: tcp: correctly test congestion state in RACK

RACK does not test the loss recovery state correctly to compute
the reordering window. It assumes if lost_out is zero then TCP is
not in loss recovery. But it can be zero during recovery before
calling tcp_rack_detect_loss(): when an ACK acknowledges all
packets marked lost before receiving this ACK, but has not yet
to discover new ones by tcp_rack_detect_loss(). The fix is to
simply test the congestion state directly.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_recovery.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index d3ea89020c69..3143664902e9 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -55,7 +55,8 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 	 * to queuing or delayed ACKs.
 	 */
 	reo_wnd = 1000;
-	if ((tp->rack.reord || !tp->lost_out) && min_rtt != ~0U) {
+	if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
+	    min_rtt != ~0U) {
 		reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
 		reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
 	}
-- 
cgit v1.2.3


From cd1fc85b4399d47e3d6626301741ba8c38cd475a Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 7 Dec 2017 11:33:31 -0800
Subject: tcp: always evaluate losses in RACK upon undo

When sender detects spurious retransmission, all packets
marked lost are remarked to be in-flight. However some may
be considered lost based on its timestamps in RACK. This patch
forces RACK to re-evaluate, which may be skipped previously if
the ACK does not advance RACK timestamp.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 075c559570e6..9550cc42de2d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2329,6 +2329,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
 	}
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 	tp->undo_marker = 0;
+	tp->rack.advanced = 1; /* Force RACK to re-exam losses */
 }
 
 static inline bool tcp_may_undo(const struct tcp_sock *tp)
-- 
cgit v1.2.3


From 428aec5e69fa17d223e1495f395833c50770f7ae Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 7 Dec 2017 11:33:32 -0800
Subject: tcp: fix off-by-one bug in RACK

RACK should mark a packet lost when remaining wait time is zero.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_recovery.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 3143664902e9..0c182303e62e 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -80,12 +80,12 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 		 */
 		remaining = tp->rack.rtt_us + reo_wnd -
 			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
-		if (remaining < 0) {
+		if (remaining <= 0) {
 			tcp_rack_mark_skb_lost(sk, skb);
 			list_del_init(&skb->tcp_tsorted_anchor);
 		} else {
-			/* Record maximum wait time (+1 to avoid 0) */
-			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
+			/* Record maximum wait time */
+			*reo_timeout = max_t(u32, *reo_timeout, remaining);
 		}
 	}
 }
-- 
cgit v1.2.3


From 6065fd0d179b96ddc488c76542349bcb148a95fd Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Thu, 7 Dec 2017 11:33:33 -0800
Subject: tcp: evaluate packet losses upon RTT change

RACK skips an ACK unless it advances the most recently delivered
TX timestamp (rack.mstamp). Since RACK also uses the most recent
RTT to decide if a packet is lost, RACK should still run the
loss detection whenever the most recent RTT changes. For example,
an ACK that does not advance the timestamp but triggers the cwnd
undo due to reordering, would then use the most recent (higher)
RTT measurement to detect further losses.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Neal Cardwell <ncardwell@google.com>
Reviewed-by: Priyaranjan Jha <priyarjha@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_recovery.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 0c182303e62e..3a81720ac0c4 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -117,13 +117,8 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 {
 	u32 rtt_us;
 
-	if (tp->rack.mstamp &&
-	    !tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
-				 end_seq, tp->rack.end_seq))
-		return;
-
 	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
-	if (sacked & TCPCB_RETRANS) {
+	if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
 		/* If the sacked packet was retransmitted, it's ambiguous
 		 * whether the retransmission or the original (or the prior
 		 * retransmission) was sacked.
@@ -134,13 +129,15 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 		 * so it's at least one RTT (i.e., retransmission is at least
 		 * an RTT later).
 		 */
-		if (rtt_us < tcp_min_rtt(tp))
-			return;
+		return;
 	}
-	tp->rack.rtt_us = rtt_us;
-	tp->rack.mstamp = xmit_time;
-	tp->rack.end_seq = end_seq;
 	tp->rack.advanced = 1;
+	tp->rack.rtt_us = rtt_us;
+	if (tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
+				end_seq, tp->rack.end_seq)) {
+		tp->rack.mstamp = xmit_time;
+		tp->rack.end_seq = end_seq;
+	}
 }
 
 /* We have waited long enough to accommodate reordering. Mark the expired
-- 
cgit v1.2.3