From b410f04eb5b482b5efc4eee90de81ad35d3d923b Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Fri, 4 Dec 2020 16:48:14 +0800 Subject: ipv4: fix error return code in rtm_to_fib_config() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: d15662682db2 ("ipv4: Allow ipv6 gateway with ipv4 routes") Reported-by: Hulk Robot Signed-off-by: Zhang Changzhong Reviewed-by: David Ahern Link: https://lore.kernel.org/r/1607071695-33740-1-git-send-email-zhangchangzhong@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b87140a1fa28..cdf6ec5aa45d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -825,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - goto errout; + return -EINVAL; } return 0; -- cgit v1.2.3 From 10c678bd0a035ac2c64a9b26b222f20556227a53 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 7 Dec 2020 15:55:40 +0800 Subject: udp: fix the proto value passed to ip_protocol_deliver_rcu for the segments Guillaume noticed that: for segments udp_queue_rcv_one_skb() returns the proto, and it should pass "ret" unmodified to ip_protocol_deliver_rcu(). Otherwize, with a negtive value passed, it will underflow inet_protos. This can be reproduced with IPIP FOU: # ip fou add port 5555 ipproto 4 # ethtool -K eth1 rx-gro-list on Fixes: cf329aa42b66 ("udp: cope with UDP GRO packet misdirection") Reported-by: Guillaume Nault Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 09f0a23d1a01..9eeebd4a0054 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2173,7 +2173,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) __skb_pull(skb, skb_transport_offset(skb)); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) - ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret); + ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); } return 0; } -- cgit v1.2.3 From cc00bcaa589914096edef7fb87ca5cee4a166b5c Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 25 Nov 2020 11:27:22 -0700 Subject: netfilter: x_tables: Switch synchronization to RCU When running concurrent iptables rules replacement with data, the per CPU sequence count is checked after the assignment of the new information. The sequence count is used to synchronize with the packet path without the use of any explicit locking. If there are any packets in the packet path using the table information, the sequence count is incremented to an odd value and is incremented to an even after the packet process completion. The new table value assignment is followed by a write memory barrier so every CPU should see the latest value. If the packet path has started with the old table information, the sequence counter will be odd and the iptables replacement will wait till the sequence count is even prior to freeing the old table info. However, this assumes that the new table information assignment and the memory barrier is actually executed prior to the counter check in the replacement thread. If CPU decides to execute the assignment later as there is no user of the table information prior to the sequence check, the packet path in another CPU may use the old table information. The replacement thread would then free the table information under it leading to a use after free in the packet processing context- Unable to handle kernel NULL pointer dereference at virtual address 000000000000008e pc : ip6t_do_table+0x5d0/0x89c lr : ip6t_do_table+0x5b8/0x89c ip6t_do_table+0x5d0/0x89c ip6table_filter_hook+0x24/0x30 nf_hook_slow+0x84/0x120 ip6_input+0x74/0xe0 ip6_rcv_finish+0x7c/0x128 ipv6_rcv+0xac/0xe4 __netif_receive_skb+0x84/0x17c process_backlog+0x15c/0x1b8 napi_poll+0x88/0x284 net_rx_action+0xbc/0x23c __do_softirq+0x20c/0x48c This could be fixed by forcing instruction order after the new table information assignment or by switching to RCU for the synchronization. Fixes: 80055dab5de0 ("netfilter: x_tables: make xt_replace_table wait until old rules are not used anymore") Reported-by: Sean Tranchetti Reported-by: kernel test robot Suggested-by: Florian Westphal Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 14 +++++++------- net/ipv4/netfilter/ip_tables.c | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d1e04d2b5170..563b62b76a5f 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -203,7 +203,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. */ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; @@ -649,7 +649,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -673,7 +673,7 @@ static int copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct arpt_entry *e; struct xt_counters *counters; - struct xt_table_info *private = table->private; + struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; void *loc_cpu_entry; @@ -807,7 +807,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, NFPROTO_ARP, name); if (!IS_ERR(t)) { struct arpt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -860,7 +860,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, @@ -1017,7 +1017,7 @@ static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1330,7 +1330,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index f15bc21d7301..6e2851f8d3a3 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -258,7 +258,7 @@ ipt_do_table(struct sk_buff *skb, WARN_ON(!(table->valid_hooks & (1 << hook))); local_bh_disable(); addend = xt_write_recseq_begin(); - private = READ_ONCE(table->private); /* Address dependency. */ + private = rcu_access_pointer(table->private); cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; @@ -791,7 +791,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -815,7 +815,7 @@ copy_entries_to_user(unsigned int total_size, unsigned int off, num; const struct ipt_entry *e; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); int ret = 0; const void *loc_cpu_entry; @@ -964,7 +964,7 @@ static int get_info(struct net *net, void __user *user, const int *len) t = xt_request_find_table_lock(net, AF_INET, name); if (!IS_ERR(t)) { struct ipt_getinfo info; - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); #ifdef CONFIG_COMPAT struct xt_table_info tmp; @@ -1018,7 +1018,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); @@ -1173,7 +1173,7 @@ do_add_counters(struct net *net, sockptr_t arg, unsigned int len) } local_bh_disable(); - private = t->private; + private = xt_table_get_private_protected(t); if (private->number != tmp.num_counters) { ret = -EINVAL; goto unlock_up_free; @@ -1543,7 +1543,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, void __user *userptr) { struct xt_counters *counters; - const struct xt_table_info *private = table->private; + const struct xt_table_info *private = xt_table_get_private_protected(table); void __user *pos; unsigned int size; int ret = 0; -- cgit v1.2.3 From 72d05c00d7ecda85df29abd046da7e41cc071c17 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Dec 2020 08:21:31 -0800 Subject: tcp: select sane initial rcvq_space.space for big MSS Before commit a337531b942b ("tcp: up initial rmem to 128KB and SYN rwin to around 64KB") small tcp_rmem[1] values were overridden by tcp_fixup_rcvbuf() to accommodate various MSS. This is no longer the case, and Hazem Mohamed Abuelfotoh reported that DRS would not work for MTU 9000 endpoints receiving regular (1500 bytes) frames. Root cause is that tcp_init_buffer_space() uses tp->rcv_wnd for upper limit of rcvq_space.space computation, while it can select later a smaller value for tp->rcv_ssthresh and tp->window_clamp. ss -temoi on receiver would show : skmem:(r0,rb131072,t0,tb46080,f0,w0,o0,bl0,d0) rcv_space:62496 rcv_ssthresh:56596 This means that TCP can not increase its window in tcp_grow_window(), and that DRS can never kick. Fix this by making sure that rcvq_space.space is not bigger than number of bytes that can be held in TCP receive queue. People unable/unwilling to change their kernel can work around this issue by selecting a bigger tcp_rmem[1] value as in : echo "4096 196608 6291456" >/proc/sys/net/ipv4/tcp_rmem Based on an initial report and patch from Hazem Mohamed Abuelfotoh https://lore.kernel.org/netdev/20201204180622.14285-1-abuehaze@amazon.com/ Fixes: a337531b942b ("tcp: up initial rmem to 128KB and SYN rwin to around 64KB") Fixes: 041a14d26715 ("tcp: start receiver buffer autotuning sooner") Reported-by: Hazem Mohamed Abuelfotoh Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 389d1b340248..ef4bdb038a4b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -510,7 +510,6 @@ static void tcp_init_buffer_space(struct sock *sk) if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) tcp_sndbuf_expand(sk); - tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); tcp_mstamp_refresh(tp); tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; @@ -534,6 +533,8 @@ static void tcp_init_buffer_space(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); tp->snd_cwnd_stamp = tcp_jiffies32; + tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd, + (u32)TCP_INIT_CWND * tp->advmss); } /* 4. Recalculate window clamp after socket hit its memory bounds. */ -- cgit v1.2.3 From 8ef44b6fe49d2b8d03ba9aa69063612b474f963b Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 8 Dec 2020 09:55:08 -0800 Subject: tcp: Retain ECT bits for tos reflection For DCTCP, we have to retain the ECT bits set by the congestion control algorithm on the socket when reflecting syn TOS in syn-ack, in order to make ECN work properly. Fixes: ac8f1710c12b ("tcp: reflect tos value received in SYN to the socket") Reported-by: Alexander Duyck Signed-off-by: Wei Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8391aa29e7a4..595dcc3afac5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -984,7 +984,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? - tcp_rsk(req)->syn_tos & ~INET_ECN_MASK : + (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | + (inet_sk(sk)->tos & INET_ECN_MASK) : inet_sk(sk)->tos; if (!INET_ECN_is_capable(tos) && @@ -1541,7 +1542,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = prandom_u32(); - /* Set ToS of the new socket based upon the value of incoming SYN. */ + /* Set ToS of the new socket based upon the value of incoming SYN. + * ECT bits are set later in tcp_init_transfer(). + */ if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; -- cgit v1.2.3 From 299bcb55ecd1412f6df606e9dc0912d55610029e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 8 Dec 2020 22:57:59 -0500 Subject: tcp: fix cwnd-limited bug for TSO deferral where we send nothing When cwnd is not a multiple of the TSO skb size of N*MSS, we can get into persistent scenarios where we have the following sequence: (1) ACK for full-sized skb of N*MSS arrives -> tcp_write_xmit() transmit full-sized skb with N*MSS -> move pacing release time forward -> exit tcp_write_xmit() because pacing time is in the future (2) TSQ callback or TCP internal pacing timer fires -> try to transmit next skb, but TSO deferral finds remainder of available cwnd is not big enough to trigger an immediate send now, so we defer sending until the next ACK. (3) repeat... So we can get into a case where we never mark ourselves as cwnd-limited for many seconds at a time, even with bulk/infinite-backlog senders, because: o In case (1) above, every time in tcp_write_xmit() we have enough cwnd to send a full-sized skb, we are not fully using the cwnd (because cwnd is not a multiple of the TSO skb size). So every time we send data, we are not cwnd limited, and so in the cwnd-limited tracking code in tcp_cwnd_validate() we mark ourselves as not cwnd-limited. o In case (2) above, every time in tcp_write_xmit() that we try to transmit the "remainder" of the cwnd but defer, we set the local variable is_cwnd_limited to true, but we do not send any packets, so sent_pkts is zero, so we don't call the cwnd-limited logic to update tp->is_cwnd_limited. Fixes: ca8a22634381 ("tcp: make cwnd-limited checks measurement-based, and gentler") Reported-by: Ingemar Johansson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20201209035759.1225145-1-ncardwell.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bf48cd73e967..99011768c264 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1880,7 +1880,8 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * window, and remember whether we were cwnd-limited then. */ if (!before(tp->snd_una, tp->max_packets_seq) || - tp->packets_out > tp->max_packets_out) { + tp->packets_out > tp->max_packets_out || + is_cwnd_limited) { tp->max_packets_out = tp->packets_out; tp->max_packets_seq = tp->snd_nxt; tp->is_cwnd_limited = is_cwnd_limited; @@ -2702,6 +2703,10 @@ repair: else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); + if (likely(sent_pkts || is_cwnd_limited)) + tcp_cwnd_validate(sk, is_cwnd_limited); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -2709,8 +2714,6 @@ repair: /* Send one loss probe per tail loss episode. */ if (push_one != 2) tcp_schedule_loss_probe(sk, false); - is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); - tcp_cwnd_validate(sk, is_cwnd_limited); return false; } return !tp->packets_out && !tcp_write_queue_empty(sk); -- cgit v1.2.3