summaryrefslogtreecommitdiffstats
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
authorPaolo Abeni <pabeni@redhat.com>2020-11-19 11:46:03 -0800
committerJakub Kicinski <kuba@kernel.org>2020-11-20 15:33:25 -0800
commitea4ca586b16ff2eb6157fe13969eb72d2403a3a1 (patch)
tree8d17cb274f9432777f1f69fe0ee729b714cc345a /net/mptcp/protocol.c
parentfa3fe2b150316b294f2c662653501273ff25bba8 (diff)
mptcp: refine MPTCP-level ack scheduling
Send timely MPTCP-level ack is somewhat difficult when the insertion into the msk receive level is performed by the worker. It needs TCP-level dup-ack to notify the MPTCP-level ack_seq increase, as both the TCP-level ack seq and the rcv window are unchanged. We can actually avoid processing incoming data with the worker, and let the subflow or recevmsg() send ack as needed. When recvmsg() moves the skbs inside the msk receive queue, the msk space is still unchanged, so tcp_cleanup_rbuf() could end-up skipping TCP-level ack generation. Anyway, when __mptcp_move_skbs() is invoked, a known amount of bytes is going to be consumed soon: we update rcv wnd computation taking them in account. Additionally we need to explicitly trigger tcp_cleanup_rbuf() when recvmsg() consumes a significant amount of the receive buffer. Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c105
1 files changed, 51 insertions, 54 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e1606e072218..4b7794835fea 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -407,16 +407,42 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
-static void mptcp_send_ack(struct mptcp_sock *msk)
+static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
+{
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
+ if (subflow->request_join && !subflow->fully_established)
+ return false;
+
+ /* only send if our side has not closed yet */
+ return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
+}
+
+static void mptcp_send_ack(struct mptcp_sock *msk, bool force)
{
struct mptcp_subflow_context *subflow;
+ struct sock *pick = NULL;
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- lock_sock(ssk);
- tcp_send_ack(ssk);
- release_sock(ssk);
+ if (force) {
+ lock_sock(ssk);
+ tcp_send_ack(ssk);
+ release_sock(ssk);
+ continue;
+ }
+
+ /* if the hintes ssk is still active, use it */
+ pick = ssk;
+ if (ssk == msk->ack_hint)
+ break;
+ }
+ if (!force && pick) {
+ lock_sock(pick);
+ tcp_cleanup_rbuf(pick, 1);
+ release_sock(pick);
}
}
@@ -468,7 +494,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
ret = true;
mptcp_set_timeout(sk, NULL);
- mptcp_send_ack(msk);
+ mptcp_send_ack(msk, true);
mptcp_close_wake_up(sk);
}
return ret;
@@ -483,7 +509,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
unsigned int moved = 0;
bool more_data_avail;
struct tcp_sock *tp;
- u32 old_copied_seq;
bool done = false;
int sk_rbuf;
@@ -500,7 +525,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
pr_debug("msk=%p ssk=%p", msk, ssk);
tp = tcp_sk(ssk);
- old_copied_seq = tp->copied_seq;
do {
u32 map_remaining, offset;
u32 seq = tp->copied_seq;
@@ -564,11 +588,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
break;
}
} while (more_data_avail);
+ msk->ack_hint = ssk;
*bytes += moved;
- if (tp->copied_seq != old_copied_seq)
- tcp_cleanup_rbuf(ssk, 1);
-
return done;
}
@@ -672,19 +694,8 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf)
goto wake;
- if (move_skbs_to_msk(msk, ssk))
- goto wake;
+ move_skbs_to_msk(msk, ssk);
- /* mptcp socket is owned, release_cb should retry */
- if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
- &sk->sk_tsq_flags)) {
- sock_hold(sk);
-
- /* need to try again, its possible release_cb() has already
- * been called after the test_and_set_bit() above.
- */
- move_skbs_to_msk(msk, ssk);
- }
wake:
if (wake)
sk->sk_data_ready(sk);
@@ -1095,18 +1106,6 @@ static void mptcp_nospace(struct mptcp_sock *msk)
mptcp_clean_una((struct sock *)msk);
}
-static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
-{
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
- if (subflow->request_join && !subflow->fully_established)
- return false;
-
- /* only send if our side has not closed yet */
- return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
-}
-
#define MPTCP_SEND_BURST_SIZE ((1 << 16) - \
sizeof(struct tcphdr) - \
MAX_TCP_OPTION_SPACE - \
@@ -1534,7 +1533,7 @@ new_measure:
msk->rcvq_space.time = mstamp;
}
-static bool __mptcp_move_skbs(struct mptcp_sock *msk)
+static bool __mptcp_move_skbs(struct mptcp_sock *msk, unsigned int rcv)
{
unsigned int moved = 0;
bool done;
@@ -1553,12 +1552,16 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
slowpath = lock_sock_fast(ssk);
done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved);
+ if (moved && rcv) {
+ WRITE_ONCE(msk->rmem_pending, min(rcv, moved));
+ tcp_cleanup_rbuf(ssk, 1);
+ WRITE_ONCE(msk->rmem_pending, 0);
+ }
unlock_sock_fast(ssk, slowpath);
} while (!done);
if (mptcp_ofo_queue(msk) || moved > 0) {
- if (!mptcp_check_data_fin((struct sock *)msk))
- mptcp_send_ack(msk);
+ mptcp_check_data_fin((struct sock *)msk);
return true;
}
return false;
@@ -1582,8 +1585,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
__mptcp_flush_join_list(msk);
- while (len > (size_t)copied) {
- int bytes_read;
+ for (;;) {
+ int bytes_read, old_space;
bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
if (unlikely(bytes_read < 0)) {
@@ -1595,9 +1598,14 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
copied += bytes_read;
if (skb_queue_empty(&sk->sk_receive_queue) &&
- __mptcp_move_skbs(msk))
+ __mptcp_move_skbs(msk, len - copied))
continue;
+ /* be sure to advertise window change */
+ old_space = READ_ONCE(msk->old_wspace);
+ if ((tcp_space(sk) - old_space) >= old_space)
+ mptcp_send_ack(msk, false);
+
/* only the master socket status is relevant here. The exit
* conditions mirror closely tcp_recvmsg()
*/
@@ -1650,7 +1658,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
/* .. race-breaker: ssk might have gotten new data
* after last __mptcp_move_skbs() returned false.
*/
- if (unlikely(__mptcp_move_skbs(msk)))
+ if (unlikely(__mptcp_move_skbs(msk, 0)))
set_bit(MPTCP_DATA_READY, &msk->flags);
} else if (unlikely(!test_bit(MPTCP_DATA_READY, &msk->flags))) {
/* data to read but mptcp_wait_data() cleared DATA_READY */
@@ -1881,7 +1889,6 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
__mptcp_close_subflow(msk);
- __mptcp_move_skbs(msk);
if (mptcp_send_head(sk))
mptcp_push_pending(sk, 0);
@@ -1965,6 +1972,7 @@ static int __mptcp_init_sock(struct sock *sk)
msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL;
+ msk->ack_hint = NULL;
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
@@ -2500,8 +2508,7 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP;
}
-#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
- TCPF_WRITE_TIMER_DEFERRED)
+#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
/* this is very alike tcp_release_cb() but we must handle differently a
* different set of events
@@ -2519,16 +2526,6 @@ static void mptcp_release_cb(struct sock *sk)
sock_release_ownership(sk);
- if (flags & TCPF_DELACK_TIMER_DEFERRED) {
- struct mptcp_sock *msk = mptcp_sk(sk);
- struct sock *ssk;
-
- ssk = mptcp_subflow_recv_lookup(msk);
- if (!ssk || sk->sk_state == TCP_CLOSE ||
- !schedule_work(&msk->work))
- __sock_put(sk);
- }
-
if (flags & TCPF_WRITE_TIMER_DEFERRED) {
mptcp_retransmit_handler(sk);
__sock_put(sk);