Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--	net/mptcp/protocol.c | 592
1 file changed, 537 insertions(+), 55 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index e959104832ef..1833bc1f4a43 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,6 +21,7 @@
 #endif
 #include <net/mptcp.h>
 #include "protocol.h"
+#include "mib.h"
 
 #define MPTCP_SAME_STATE TCP_MAX_STATES
 
@@ -37,6 +38,8 @@ struct mptcp_skb_cb {
 
 #define MPTCP_SKB_CB(__skb)	((struct mptcp_skb_cb *)&((__skb)->cb[0]))
 
+static struct percpu_counter mptcp_sockets_allocated;
+
 /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
  * completed yet or has failed, return the subflow socket.
  * Otherwise return NULL.
@@ -104,19 +107,6 @@ set_state:
 	return ssock;
 }
 
-static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
-{
-	struct mptcp_subflow_context *subflow;
-
-	sock_owned_by_me((const struct sock *)msk);
-
-	mptcp_for_each_subflow(msk, subflow) {
-		return mptcp_subflow_tcp_sock(subflow);
-	}
-
-	return NULL;
-}
-
 static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 			     struct sk_buff *skb,
 			     unsigned int offset, size_t copy_len)
@@ -254,6 +244,60 @@ wake:
 	sk->sk_data_ready(sk);
 }
 
+static void __mptcp_flush_join_list(struct mptcp_sock *msk)
+{
+	if (likely(list_empty(&msk->join_list)))
+		return;
+
+	spin_lock_bh(&msk->join_list_lock);
+	list_splice_tail_init(&msk->join_list, &msk->conn_list);
+	spin_unlock_bh(&msk->join_list_lock);
+}
+
+static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
+{
+	long tout = ssk && inet_csk(ssk)->icsk_pending ?
+				      inet_csk(ssk)->icsk_timeout - jiffies : 0;
+
+	if (tout <= 0)
+		tout = mptcp_sk(sk)->timer_ival;
+	mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
+}
+
+static bool mptcp_timer_pending(struct sock *sk)
+{
+	return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
+}
+
+static void mptcp_reset_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	unsigned long tout;
+
+	/* should never be called with mptcp level timer cleared */
+	tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
+	if (WARN_ON_ONCE(!tout))
+		tout = TCP_RTO_MIN;
+	sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
+}
+
+void mptcp_data_acked(struct sock *sk)
+{
+	mptcp_reset_timer(sk);
+
+	if (!sk_stream_is_writeable(sk) &&
+	    schedule_work(&mptcp_sk(sk)->work))
+		sock_hold(sk);
+}
+
+static void mptcp_stop_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+	mptcp_sk(sk)->timer_ival = 0;
+}
+
 static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
 {
 	if (!msk->cached_ext)
@@ -277,41 +321,149 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
 	return NULL;
 }
 
-static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
-					     const struct sk_buff *skb,
-					     const struct mptcp_ext *mpext)
+static bool mptcp_skb_can_collapse_to(u64 write_seq,
+				      const struct sk_buff *skb,
+				      const struct mptcp_ext *mpext)
 {
 	if (!tcp_skb_can_collapse_to(skb))
 		return false;
 
 	/* can collapse only if MPTCP level sequence is in order */
-	return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
+	return mpext && mpext->data_seq + mpext->data_len == write_seq;
+}
+
+static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
+				       const struct page_frag *pfrag,
+				       const struct mptcp_data_frag *df)
+{
+	return df && pfrag->page == df->page &&
+		df->data_seq + df->data_len == msk->write_seq;
+}
+
+static void dfrag_uncharge(struct sock *sk, int len)
+{
+	sk_mem_uncharge(sk, len);
+	sk_wmem_queued_add(sk, -len);
+}
+
+static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
+{
+	int len = dfrag->data_len + dfrag->overhead;
+
+	list_del(&dfrag->list);
+	dfrag_uncharge(sk, len);
+	put_page(dfrag->page);
+}
+
+static void mptcp_clean_una(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_data_frag *dtmp, *dfrag;
+	u64 snd_una = atomic64_read(&msk->snd_una);
+	bool cleaned = false;
+
+	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
+		if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
+			break;
+
+		dfrag_clear(sk, dfrag);
+		cleaned = true;
+	}
+
+	dfrag = mptcp_rtx_head(sk);
+	if (dfrag && after64(snd_una, dfrag->data_seq)) {
+		u64 delta = dfrag->data_seq + dfrag->data_len - snd_una;
+
+		dfrag->data_seq += delta;
+		dfrag->data_len -= delta;
+
+		dfrag_uncharge(sk, delta);
+		cleaned = true;
+	}
+
+	if (cleaned) {
+		sk_mem_reclaim_partial(sk);
+
+		/* Only wake up writers if a subflow is ready */
+		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
+			sk_stream_write_space(sk);
+	}
+}
+
+/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
+ * data
+ */
+static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+	if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
+					pfrag, sk->sk_allocation)))
+		return true;
+
+	sk->sk_prot->enter_memory_pressure(sk);
+	sk_stream_moderate_sndbuf(sk);
+	return false;
+}
+
+static struct mptcp_data_frag *
+mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
+		      int orig_offset)
+{
+	int offset = ALIGN(orig_offset, sizeof(long));
+	struct mptcp_data_frag *dfrag;
+
+	dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
+	dfrag->data_len = 0;
+	dfrag->data_seq = msk->write_seq;
+	dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
+	dfrag->offset = offset + sizeof(struct mptcp_data_frag);
+	dfrag->page = pfrag->page;
+
+	return dfrag;
 }
 
 static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
-			      struct msghdr *msg, long *timeo, int *pmss_now,
+			      struct msghdr *msg, struct mptcp_data_frag *dfrag,
+			      long *timeo, int *pmss_now,
 			      int *ps_goal)
 {
-	int mss_now, avail_size, size_goal, ret;
+	int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
+	bool dfrag_collapsed, can_collapse = false;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	struct mptcp_ext *mpext = NULL;
+	bool retransmission = !!dfrag;
 	struct sk_buff *skb, *tail;
-	bool can_collapse = false;
 	struct page_frag *pfrag;
+	struct page *page;
+	u64 *write_seq;
 	size_t psize;
 
 	/* use the mptcp page cache so that we can easily move the data
 	 * from one substream to another, but do per subflow memory accounting
+	 * Note: pfrag is used only !retransmission, but the compiler if
+	 * fooled into a warning if we don't init here
 	 */
 	pfrag = sk_page_frag(sk);
-	while (!sk_page_frag_refill(ssk, pfrag) ||
+	while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
 	       !mptcp_ext_cache_refill(msk)) {
 		ret = sk_stream_wait_memory(ssk, timeo);
 		if (ret)
 			return ret;
+
+		/* if sk_stream_wait_memory() sleeps snd_una can change
+		 * significantly, refresh the rtx queue
+		 */
+		mptcp_clean_una(sk);
+
 		if (unlikely(__mptcp_needs_tcp_fallback(msk)))
 			return 0;
 	}
 
+	if (!retransmission) {
+		write_seq = &msk->write_seq;
+		page = pfrag->page;
+	} else {
+		write_seq = &dfrag->data_seq;
+		page = dfrag->page;
+	}
+
 	/* compute copy limit */
 	mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
@@ -329,32 +481,74 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		 * SSN association set here
 		 */
 		can_collapse = (size_goal - skb->len > 0) &&
-			       mptcp_skb_can_collapse_to(msk, skb, mpext);
+			       mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
 		if (!can_collapse)
 			TCP_SKB_CB(skb)->eor = 1;
 		else
 			avail_size = size_goal - skb->len;
 	}
-	psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
-
-	/* Copy to page */
-	pr_debug("left=%zu", msg_data_left(msg));
-	psize = copy_page_from_iter(pfrag->page, pfrag->offset,
-				    min_t(size_t, msg_data_left(msg), psize),
-				    &msg->msg_iter);
-	pr_debug("left=%zu", msg_data_left(msg));
-	if (!psize)
-		return -EINVAL;
+
+	if (!retransmission) {
+		/* reuse tail pfrag, if possible, or carve a new one from the
+		 * page allocator
+		 */
+		dfrag = mptcp_rtx_tail(sk);
+		offset = pfrag->offset;
+		dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
+		if (!dfrag_collapsed) {
+			dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
+			offset = dfrag->offset;
+			frag_truesize = dfrag->overhead;
+		}
+		psize = min_t(size_t, pfrag->size - offset, avail_size);
+
+		/* Copy to page */
+		pr_debug("left=%zu", msg_data_left(msg));
+		psize = copy_page_from_iter(pfrag->page, offset,
+					    min_t(size_t, msg_data_left(msg),
+						  psize),
+					    &msg->msg_iter);
+		pr_debug("left=%zu", msg_data_left(msg));
+		if (!psize)
+			return -EINVAL;
+
+		if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
+			return -ENOMEM;
+	} else {
+		offset = dfrag->offset;
+		psize = min_t(size_t, dfrag->data_len, avail_size);
+	}
 
 	/* tell the TCP stack to delay the push so that we can safely
 	 * access the skb after the sendpages call
 	 */
-	ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
+	ret = do_tcp_sendpages(ssk, page, offset, psize,
 			       msg->msg_flags | MSG_SENDPAGE_NOTLAST);
 	if (ret <= 0)
 		return ret;
-	if (unlikely(ret < psize))
-		iov_iter_revert(&msg->msg_iter, psize - ret);
+
+	frag_truesize += ret;
+	if (!retransmission) {
+		if (unlikely(ret < psize))
+			iov_iter_revert(&msg->msg_iter, psize - ret);
+
+		/* send successful, keep track of sent data for mptcp-level
+		 * retransmission
+		 */
+		dfrag->data_len += ret;
+		if (!dfrag_collapsed) {
+			get_page(dfrag->page);
+			list_add_tail(&dfrag->list, &msk->rtx_queue);
+			sk_wmem_queued_add(sk, frag_truesize);
+		} else {
+			sk_wmem_queued_add(sk, ret);
+		}
+
+		/* charge data on mptcp rtx queue to the master socket
+		 * Note: we charge such data both to sk and ssk
+		 */
+		sk->sk_forward_alloc -= frag_truesize;
+	}
 
 	/* if the tail skb extension is still the cached one, collapsing
 	 * really happened. Note: we can't check for 'same skb' as the sk_buff
@@ -373,7 +567,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		msk->cached_ext = NULL;
 
 	memset(mpext, 0, sizeof(*mpext));
-	mpext->data_seq = msk->write_seq;
+	mpext->data_seq = *write_seq;
 	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
 	mpext->data_len = ret;
 	mpext->use_map = 1;
@@ -384,13 +578,51 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		 mpext->dsn64);
 
 out:
-	pfrag->offset += ret;
-	msk->write_seq += ret;
+	if (!retransmission)
+		pfrag->offset += frag_truesize;
+	*write_seq += ret;
 	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
 
 	return ret;
 }
 
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow;
+	struct sock *backup = NULL;
+
+	sock_owned_by_me((const struct sock *)msk);
+
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+		if (!sk_stream_memory_free(ssk)) {
+			struct socket *sock = ssk->sk_socket;
+
+			if (sock) {
+				clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+				smp_mb__after_atomic();
+
+				/* enables sk->write_space() callbacks */
+				set_bit(SOCK_NOSPACE, &sock->flags);
+			}
+
+			return NULL;
+		}
+
+		if (subflow->backup) {
+			if (!backup)
+				backup = ssk;
+
+			continue;
+		}
+
+		return ssk;
+	}
+
+	return backup;
+}
+
 static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
 {
 	struct socket *sock;
@@ -438,17 +670,29 @@ fallback:
 		return ret >= 0 ? ret + copied : (copied ? copied : ret);
 	}
 
-	ssk = mptcp_subflow_get(msk);
-	if (!ssk) {
-		release_sock(sk);
-		return -ENOTCONN;
+	mptcp_clean_una(sk);
+
+	__mptcp_flush_join_list(msk);
+	ssk = mptcp_subflow_get_send(msk);
+	while (!sk_stream_memory_free(sk) || !ssk) {
+		ret = sk_stream_wait_memory(sk, &timeo);
+		if (ret)
+			goto out;
+
+		mptcp_clean_una(sk);
+
+		ssk = mptcp_subflow_get_send(msk);
+		if (list_empty(&msk->conn_list)) {
+			ret = -ENOTCONN;
+			goto out;
+		}
 	}
 
 	pr_debug("conn_list->subflow=%p", ssk);
 
 	lock_sock(ssk);
 	while (msg_data_left(msg)) {
-		ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
+		ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
 					 &size_goal);
 		if (ret < 0)
 			break;
@@ -461,10 +705,15 @@ fallback:
 		copied += ret;
 	}
 
+	mptcp_set_timeout(sk, ssk);
 	if (copied) {
 		ret = copied;
 		tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
 			 size_goal);
+
+		/* start the timer, if it's not pending */
+		if (!mptcp_timer_pending(sk))
+			mptcp_reset_timer(sk);
 	}
 
 	ssk_check_wmem(msk, ssk);
@@ -572,6 +821,7 @@ fallback:
 	len = min_t(size_t, len, INT_MAX);
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
+	__mptcp_flush_join_list(msk);
 	while (len > (size_t)copied) {
 		int bytes_read;
 
@@ -651,6 +901,69 @@ out_err:
 	return copied;
 }
 
+static void mptcp_retransmit_handler(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	if (atomic64_read(&msk->snd_una) == msk->write_seq) {
+		mptcp_stop_timer(sk);
+	} else {
+		set_bit(MPTCP_WORK_RTX, &msk->flags);
+		if (schedule_work(&msk->work))
+			sock_hold(sk);
+	}
+}
+
+static void mptcp_retransmit_timer(struct timer_list *t)
+{
+	struct inet_connection_sock *icsk = from_timer(icsk, t,
+						       icsk_retransmit_timer);
+	struct sock *sk = &icsk->icsk_inet.sk;
+
+	bh_lock_sock(sk);
+	if (!sock_owned_by_user(sk)) {
+		mptcp_retransmit_handler(sk);
+	} else {
+		/* delegate our work to tcp_release_cb() */
+		if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
+				      &sk->sk_tsq_flags))
+			sock_hold(sk);
+	}
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/* Find an idle subflow. Return NULL if there is unacked data at tcp
+ * level.
+ *
+ * A backup subflow is returned only if that is the only kind available.
+ */
+static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
+{
+	struct mptcp_subflow_context *subflow;
+	struct sock *backup = NULL;
+
+	sock_owned_by_me((const struct sock *)msk);
+
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+		/* still data outstanding at TCP level? Don't retransmit. */
+		if (!tcp_write_queue_empty(ssk))
+			return NULL;
+
+		if (subflow->backup) {
+			if (!backup)
+				backup = ssk;
+			continue;
+		}
+
+		return ssk;
+	}
+
+	return backup;
+}
+
 /* subflow sockets can be either outgoing (connect) or incoming
  * (accept).
  *
@@ -684,10 +997,63 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
 static void mptcp_worker(struct work_struct *work)
 {
 	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
-	struct sock *sk = &msk->sk.icsk_inet.sk;
+	struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
+	int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
+	struct mptcp_data_frag *dfrag;
+	u64 orig_write_seq;
+	size_t copied = 0;
+	struct msghdr msg;
+	long timeo = 0;
 
 	lock_sock(sk);
+	mptcp_clean_una(sk);
+	__mptcp_flush_join_list(msk);
 	__mptcp_move_skbs(msk);
+
+	if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
+		goto unlock;
+
+	dfrag = mptcp_rtx_head(sk);
+	if (!dfrag)
+		goto unlock;
+
+	ssk = mptcp_subflow_get_retrans(msk);
+	if (!ssk)
+		goto reset_unlock;
+
+	lock_sock(ssk);
+
+	msg.msg_flags = MSG_DONTWAIT;
+	orig_len = dfrag->data_len;
+	orig_offset = dfrag->offset;
+	orig_write_seq = dfrag->data_seq;
+	while (dfrag->data_len > 0) {
+		ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now,
					 &size_goal);
+		if (ret < 0)
+			break;
+
+		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+		copied += ret;
+		dfrag->data_len -= ret;
+		dfrag->offset += ret;
+	}
+	if (copied)
+		tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
+			 size_goal);
+
+	dfrag->data_seq = orig_write_seq;
+	dfrag->offset = orig_offset;
+	dfrag->data_len = orig_len;
+
+	mptcp_set_timeout(sk, ssk);
+	release_sock(ssk);
+
+reset_unlock:
+	if (!mptcp_timer_pending(sk))
+		mptcp_reset_timer(sk);
+
+unlock:
 	release_sock(sk);
 	sock_put(sk);
 }
@@ -696,22 +1062,55 @@ static int __mptcp_init_sock(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
 
+	spin_lock_init(&msk->join_list_lock);
+	INIT_LIST_HEAD(&msk->conn_list);
+	INIT_LIST_HEAD(&msk->join_list);
+	INIT_LIST_HEAD(&msk->rtx_queue);
 	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
 	INIT_WORK(&msk->work, mptcp_worker);
 
 	msk->first = NULL;
 	inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
 
+	mptcp_pm_data_init(msk);
+
+	/* re-use the csk retrans timer for MPTCP-level retrans */
+	timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
+
 	return 0;
 }
 
 static int mptcp_init_sock(struct sock *sk)
 {
-	if (!mptcp_is_enabled(sock_net(sk)))
+	struct net *net = sock_net(sk);
+	int ret;
+
+	if (!mptcp_is_enabled(net))
 		return -ENOPROTOOPT;
 
-	return __mptcp_init_sock(sk);
+	if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
+		return -ENOMEM;
+
+	ret = __mptcp_init_sock(sk);
+	if (ret)
+		return ret;
+
+	sk_sockets_allocated_inc(sk);
+	sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
+
+	return 0;
+}
+
+static void __mptcp_clear_xmit(struct sock *sk)
+{
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_data_frag *dtmp, *dfrag;
+
+	sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+
+	list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
+		dfrag_clear(sk, dfrag);
 }
 
 static void mptcp_cancel_work(struct sock *sk)
 {
@@ -767,10 +1166,14 @@ static void mptcp_close(struct sock *sk, long timeout)
 	mptcp_token_destroy(msk->token);
 	inet_sk_state_store(sk, TCP_CLOSE);
 
+	__mptcp_flush_join_list(msk);
+
 	list_splice_init(&msk->conn_list, &conn_list);
 
 	data_fin_tx_seq = msk->write_seq;
 
+	__mptcp_clear_xmit(sk);
+
 	release_sock(sk);
 
 	list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
@@ -782,6 +1185,7 @@ static void mptcp_close(struct sock *sk, long timeout)
 	}
 
 	mptcp_cancel_work(sk);
+	mptcp_pm_close(msk);
 
 	__skb_queue_purge(&sk->sk_receive_queue);
 
@@ -811,6 +1215,15 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
 	inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
 }
 
+static int mptcp_disconnect(struct sock *sk, int flags)
+{
+	lock_sock(sk);
+	__mptcp_clear_xmit(sk);
+	release_sock(sk);
+	mptcp_cancel_work(sk);
+	return tcp_disconnect(sk, flags);
+}
+
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
 {
@@ -854,6 +1267,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
 	}
 
 	msk->write_seq = subflow_req->idsn + 1;
+	atomic64_set(&msk->snd_una, msk->write_seq);
 	if (subflow_req->remote_key_valid) {
 		msk->can_ack = true;
 		msk->remote_key = subflow_req->remote_key;
@@ -920,7 +1334,12 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
 		list_add(&subflow->node, &msk->conn_list);
 
 		bh_unlock_sock(new_mptcp_sock);
+
+		__MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
 		local_bh_enable();
+	} else {
+		MPTCP_INC_STATS(sock_net(sk),
+				MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
 	}
 
 	return newsk;
@@ -932,6 +1351,8 @@ static void mptcp_destroy(struct sock *sk)
 
 	if (msk->cached_ext)
 		__skb_ext_put(msk->cached_ext);
+
+	sk_sockets_allocated_dec(sk);
 }
 
 static int mptcp_setsockopt(struct sock *sk, int level, int optname,
@@ -984,7 +1405,8 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
 	return -EOPNOTSUPP;
 }
 
-#define MPTCP_DEFERRED_ALL TCPF_DELACK_TIMER_DEFERRED
+#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
+			    TCPF_WRITE_TIMER_DEFERRED)
 
 /* this is very alike tcp_release_cb() but we must handle differently a
  * different set of events
@@ -1000,6 +1422,8 @@ static void mptcp_release_cb(struct sock *sk)
 		nflags = flags & ~MPTCP_DEFERRED_ALL;
 	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
 
+	sock_release_ownership(sk);
+
 	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
 		struct mptcp_sock *msk = mptcp_sk(sk);
 		struct sock *ssk;
@@ -1008,6 +1432,11 @@ static void mptcp_release_cb(struct sock *sk)
 		if (!ssk || !schedule_work(&msk->work))
 			__sock_put(sk);
 	}
+
+	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
+		mptcp_retransmit_handler(sk);
+		__sock_put(sk);
+	}
 }
 
 static int mptcp_get_port(struct sock *sk, unsigned short snum)
@@ -1031,13 +1460,15 @@ void mptcp_finish_connect(struct sock *ssk)
 	u64 ack_seq;
 
 	subflow = mptcp_subflow_ctx(ssk);
-
-	if (!subflow->mp_capable)
-		return;
-
 	sk = subflow->conn;
 	msk = mptcp_sk(sk);
 
+	if (!subflow->mp_capable) {
+		MPTCP_INC_STATS(sock_net(sk),
+				MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+		return;
+	}
+
 	pr_debug("msk=%p, token=%u", sk, subflow->token);
 
 	mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
@@ -1055,10 +1486,9 @@ void mptcp_finish_connect(struct sock *ssk)
 	WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
 	WRITE_ONCE(msk->ack_seq, ack_seq);
 	WRITE_ONCE(msk->can_ack, 1);
-	if (inet_sk_state_load(sk) != TCP_ESTABLISHED) {
-		inet_sk_state_store(sk, TCP_ESTABLISHED);
-		sk->sk_state_change(sk);
-	}
+	atomic64_set(&msk->snd_una, msk->write_seq);
+
+	mptcp_pm_new_connection(msk, 0);
 }
 
 static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
@@ -1070,6 +1500,46 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
+bool mptcp_finish_join(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+	struct sock *parent = (void *)msk;
+	struct socket *parent_sock;
+	bool ret;
+
+	pr_debug("msk=%p, subflow=%p", msk, subflow);
+
+	/* mptcp socket already closing? */
+	if (inet_sk_state_load(parent) != TCP_ESTABLISHED)
+		return false;
+
+	if (!msk->pm.server_side)
+		return true;
+
+	/* passive connection, attach to msk socket */
+	parent_sock = READ_ONCE(parent->sk_socket);
+	if (parent_sock && !sk->sk_socket)
+		mptcp_sock_graft(sk, parent_sock);
+
+	ret = mptcp_pm_allow_new_subflow(msk);
+	if (ret) {
+		/* active connections are already on conn_list */
+		spin_lock_bh(&msk->join_list_lock);
+		if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
+			list_add_tail(&subflow->node, &msk->join_list);
+		spin_unlock_bh(&msk->join_list_lock);
+	}
+	return ret;
+}
+
+bool mptcp_sk_is_subflow(const struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+	return subflow->mp_join == 1;
+}
+
 static bool mptcp_memory_free(const struct sock *sk, int wake)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1081,6 +1551,7 @@ static struct proto mptcp_prot = {
 	.name		= "MPTCP",
 	.owner		= THIS_MODULE,
 	.init		= mptcp_init_sock,
+	.disconnect	= mptcp_disconnect,
 	.close		= mptcp_close,
 	.accept		= mptcp_accept,
 	.setsockopt	= mptcp_setsockopt,
@@ -1093,7 +1564,12 @@ static struct proto mptcp_prot = {
 	.hash		= inet_hash,
 	.unhash		= inet_unhash,
 	.get_port	= mptcp_get_port,
+	.sockets_allocated	= &mptcp_sockets_allocated,
+	.memory_allocated	= &tcp_memory_allocated,
+	.memory_pressure	= &tcp_memory_pressure,
 	.stream_memory_free	= mptcp_memory_free,
+	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
+	.sysctl_mem	= sysctl_tcp_mem,
 	.obj_size	= sizeof(struct mptcp_sock),
 	.no_autobind	= true,
 };
@@ -1249,6 +1725,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 		/* set ssk->sk_socket of accept()ed flows to mptcp socket.
		 * This is needed so NOSPACE flag can be set from tcp stack.
		 */
+		__mptcp_flush_join_list(msk);
		list_for_each_entry(subflow, &msk->conn_list, node) {
			struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
@@ -1330,6 +1807,7 @@ static int mptcp_shutdown(struct socket *sock, int how)
 		sock->state = SS_CONNECTED;
 	}
 
+	__mptcp_flush_join_list(msk);
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
 
@@ -1380,7 +1858,11 @@ void mptcp_proto_init(void)
 {
 	mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
 
+	if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
+		panic("Failed to allocate MPTCP pcpu counter\n");
+
 	mptcp_subflow_init();
+	mptcp_pm_init();
 
 	if (proto_register(&mptcp_prot, 1) != 0)
 		panic("Failed to register MPTCP proto.\n");