Merge branch 'tcp-eor'

Martin KaFai Lau says: ==================== tcp: Make use of MSG_EOR in tcp_sendmsg v4: ~ Do not set eor bit in do_tcp_sendpages() since there is no way to pass MSG_EOR from the userland now. ~ Avoid rmw by testing MSG_EOR first in tcp_sendmsg(). ~ Move TCP_SKB_CB(skb)->eor test to a new helper tcp_skb_can_collapse_to() (suggested by Soheil). ~ Add some packetdrill tests. v3: ~ Separate EOR marking from the SKBTX_ANY_TSTAMP logic. ~ Move the eor bit test back to the loop in tcp_sendmsg and tcp_sendpage because there could be >1 threads doing sendmsg. ~ Thanks to Eric Dumazet's suggestions on v2. ~ The TCP timestamp bug fixes are separated into other threads. v2: ~ Rework based on the recent work "add TX timestamping via cmsg" by Soheil Hassas Yeganeh <soheil.kdev@gmail.com> ~ This version takes the MSG_EOR bit as a signal of end-of-response-message and leave the selective timestamping job to the cmsg ~ Changes based on the v1 feedback (like avoid unlikely check in a loop and adding tcp_sendpage support) ~ The first 3 patches are bug fixes. The fixes in this series depend on the newly introduced txstamp_ack in net-next. I will make relevant patches against net after getting some feedback. ~ The test results are based on the recently posted net fix: "tcp: Fix SOF_TIMESTAMPING_TX_ACK when handling dup acks" One potential use case is to use MSG_EOR with SOF_TIMESTAMPING_TX_ACK to get a more accurate TCP ack timestamping on application protocol with multiple outgoing response messages (e.g. HTTP2). One of our use case is at the webserver. The webserver tracks the HTTP2 response latency by measuring when the webserver sends the first byte to the socket till the TCP ACK of the last byte is received. In the cases where we don't have client side measurement, measuring from the server side is the only option. In the cases we have the client side measurement, the server side data can also be used to justify/cross-check-with the client side data. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
author: David S. Miller <davem@davemloft.net> 2016-04-28 16:14:20 -0400
committer: David S. Miller <davem@davemloft.net> 2016-04-28 16:14:20 -0400
commit: f345c9a5726d10cc3613d17ae75dcb1ab0986ba1 (patch)
tree: 45ae88e3dff70f289650dd4cc1bc97672a82af70
parent: 2a9e8438a29c00432ae14eaceb088b965f8ac290 (diff)
parent: a166140e810e74682f3ca248ef3879177b5c1315 (diff)
4 files changed, 29 insertions, 3 deletions
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 992f317c1abe..24ec80483805 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -761,7 +761,8 @@ struct tcp_skb_cb {
 
 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
 	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
-			unused:7;
+			eor:1,		/* Is skb MSG_EOR marked? */
+			unused:6;
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
 	union {
 		struct inet_skb_parm	h4;
@@ -808,6 +809,11 @@ static inline int tcp_skb_mss(const struct sk_buff *skb)
 	return TCP_SKB_CB(skb)->tcp_gso_size;
 }
 
+static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
+{
+	return likely(!TCP_SKB_CB(skb)->eor);
+}
+
 /* Events passed to congestion control interface */
 enum tcp_ca_event {
 	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 91993782a947..cb4d1cabb42c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -909,7 +909,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		int copy, i;
 		bool can_coalesce;
 
-		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+		    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
@@ -1157,7 +1158,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 			copy = max - skb->len;
 		}
 
-		if (copy <= 0) {
+		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 new_segment:
 			/* Allocate new segment. If the interface is SG,
 			 * allocate skb fitting to single page.
@@ -1251,6 +1252,8 @@ new_segment:
 		copied += copy;
 		if (!msg_data_left(msg)) {
 			tcp_tx_timestamp(sk, sockc.tsflags, skb);
+			if (unlikely(flags & MSG_EOR))
+				TCP_SKB_CB(skb)->eor = 1;
 			goto out;
 		}
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 70c370b93762..1fb19c91e091 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1303,6 +1303,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
 	}
 
 	TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 		TCP_SKB_CB(prev)->end_seq++;
 
@@ -1368,6 +1369,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
 		goto fallback;
 
+	if (!tcp_skb_can_collapse_to(prev))
+		goto fallback;
+
 	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
 		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b3a31b4df57c..1a487ff95d4c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1134,6 +1134,12 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
 	}
 }
 
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
+	TCP_SKB_CB(skb)->eor = 0;
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
@@ -1179,6 +1185,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
 	TCP_SKB_CB(buff)->tcp_flags = flags;
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+	tcp_skb_fragment_eor(skb, buff);
 
 	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
 		/* Copy and checksum data tail into the new buffer. */
@@ -1739,6 +1746,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 	/* This packet was never sent out yet, so no SACK bits. */
 	TCP_SKB_CB(buff)->sacked = 0;
 
+	tcp_skb_fragment_eor(skb, buff);
+
 	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
 	tcp_fragment_tstamp(skb, buff);
@@ -2499,6 +2508,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	 * packet counting does not break.
 	 */
 	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
 
 	/* changed transmit queue under us so clear hints */
 	tcp_clear_retrans_hints_partial(tp);
@@ -2550,6 +2560,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 		if (!tcp_can_collapse(sk, skb))
 			break;
 
+		if (!tcp_skb_can_collapse_to(to))
+			break;
+
 		space -= skb->len;
 
 		if (first) {
author	David S. Miller <davem@davemloft.net>	2016-04-28 16:14:20 -0400
committer	David S. Miller <davem@davemloft.net>	2016-04-28 16:14:20 -0400
commit	f345c9a5726d10cc3613d17ae75dcb1ab0986ba1 (patch)
tree	45ae88e3dff70f289650dd4cc1bc97672a82af70
parent	2a9e8438a29c00432ae14eaceb088b965f8ac290 (diff)
parent	a166140e810e74682f3ca248ef3879177b5c1315 (diff)