From 161b838e25c6f83495e27e3f546b893622d442bf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 14 Dec 2020 23:40:15 +0000 Subject: netfilter: nftables: fix incorrect increment of loop counter The intention of the err_expr cleanup path is to iterate over the allocated expr_array objects and free them, starting from i - 1 and working down to the start of the array. Currently the loop counter is being incremented instead of decremented and also the index i is being used instead of k, repeatedly destroying the same expr_array element. Fix this by decrementing k and using k as the index into expr_array. Addresses-Coverity: ("Infinite loop") Fixes: 8cfd9b0f8515 ("netfilter: nftables: generalize set expressions support") Signed-off-by: Colin Ian King Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 8d5aa0ac45f4..4186b1e52d58 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -5254,8 +5254,8 @@ static int nft_set_elem_expr_clone(const struct nft_ctx *ctx, return 0; err_expr: - for (k = i - 1; k >= 0; k++) - nft_expr_destroy(ctx, expr_array[i]); + for (k = i - 1; k >= 0; k--) + nft_expr_destroy(ctx, expr_array[k]); return -ENOMEM; } -- cgit v1.2.3 From 443d6e86f821a165fae3fc3fc13086d27ac140b1 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 16 Dec 2020 21:38:02 -0700 Subject: netfilter: x_tables: Update remaining dereference to RCU This fixes the dereference to fetch the RCU pointer when holding the appropriate xtables lock. Reported-by: kernel test robot Fixes: cc00bcaa5899 ("netfilter: x_tables: Switch synchronization to RCU") Signed-off-by: Subash Abhinov Kasiviswanathan Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 563b62b76a5f..c576a63d09db 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1379,7 +1379,7 @@ static int compat_get_entries(struct net *net, xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6e2851f8d3a3..e8f6f9d86237 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1589,7 +1589,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c4f532f4d311..0d453fa9e327 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1598,7 +1598,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, xt_compat_lock(AF_INET6); t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR(t)) { - const struct xt_table_info *private = t->private; + const struct xt_table_info *private = xt_table_get_private_protected(t); struct xt_table_info info; ret = compat_table_info(private, &info); if (!ret && get.size == info.size) -- cgit v1.2.3 From 2b33d6ffa9e38f344418976b06057e2fc2aa9e2a Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 17 Dec 2020 11:53:40 +0300 Subject: netfilter: ipset: fixes possible oops in mtype_resize currently mtype_resize() can cause oops t = ip_set_alloc(htable_size(htable_bits)); if (!t) { ret = -ENOMEM; goto out; } t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); Increased htable_bits can force htable_size() to return 0. In own turn ip_set_alloc(0) returns not 0 but ZERO_SIZE_PTR, so follwoing access to t->hregion should trigger an OOPS. Signed-off-by: Vasily Averin Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 5f1208ad049e..1e7dddf824ae 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -640,7 +640,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t dsize = set->dsize; + size_t hsize, dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -664,14 +664,12 @@ mtype_resize(struct ip_set *set, bool retried) retry: ret = 0; htable_bits++; - if (!htable_bits) { - /* In case we have plenty of memory :-) */ - pr_warn("Cannot increase the hashsize of set %s further\n", - set->name); - ret = -IPSET_ERR_HASH_FULL; - goto out; - } - t = ip_set_alloc(htable_size(htable_bits)); + if (!htable_bits) + goto hbwarn; + hsize = htable_size(htable_bits); + if (!hsize) + goto hbwarn; + t = ip_set_alloc(hsize); if (!t) { ret = -ENOMEM; goto out; @@ -813,6 +811,12 @@ cleanup: if (ret == -EAGAIN) goto retry; goto out; + +hbwarn: + /* In case we have plenty of memory :-) */ + pr_warn("Cannot increase the hashsize of set %s further\n", set->name); + ret = -IPSET_ERR_HASH_FULL; + goto out; } /* Get the current number of elements and ext_size in the set */ -- cgit v1.2.3 From 5c8193f568ae16f3242abad6518dc2ca6c8eef86 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 17 Dec 2020 17:53:18 +0300 Subject: netfilter: ipset: fix shift-out-of-bounds in htable_bits() htable_bits() can call jhash_size(32) and trigger shift-out-of-bounds UBSAN: shift-out-of-bounds in net/netfilter/ipset/ip_set_hash_gen.h:151:6 shift exponent 32 is too large for 32-bit type 'unsigned int' CPU: 0 PID: 8498 Comm: syz-executor519 Not tainted 5.10.0-rc7-next-20201208-syzkaller #0 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395 htable_bits net/netfilter/ipset/ip_set_hash_gen.h:151 [inline] hash_mac_create.cold+0x58/0x9b net/netfilter/ipset/ip_set_hash_gen.h:1524 ip_set_create+0x610/0x1380 net/netfilter/ipset/ip_set_core.c:1115 nfnetlink_rcv_msg+0xecc/0x1180 net/netfilter/nfnetlink.c:252 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 nfnetlink_rcv+0x1ac/0x420 net/netfilter/nfnetlink.c:600 netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1330 netlink_sendmsg+0x907/0xe40 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2345 ___sys_sendmsg+0xf3/0x170 net/socket.c:2399 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2432 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This patch replaces htable_bits() by simple fls(hashsize - 1) call: it alone returns valid nbits both for round and non-round hashsizes. It is normal to set any nbits here because it is validated inside following htable_size() call which returns 0 for nbits>31. Fixes: 1feab10d7e6d("netfilter: ipset: Unified hash type generation") Reported-by: syzbot+d66bfadebca46cf61a2b@syzkaller.appspotmail.com Signed-off-by: Vasily Averin Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_hash_gen.h | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 1e7dddf824ae..6186358eac7c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -141,20 +141,6 @@ htable_size(u8 hbits) return hsize * sizeof(struct hbucket *) + sizeof(struct htable); } -/* Compute htable_bits from the user input parameter hashsize */ -static u8 -htable_bits(u32 hashsize) -{ - /* Assume that hashsize == 2^htable_bits */ - u8 bits = fls(hashsize - 1); - - if (jhash_size(bits) != hashsize) - /* Round up to the first 2^n value */ - bits = fls(hashsize); - - return bits; -} - #ifdef IP_SET_HASH_WITH_NETS #if IPSET_NET_COUNT > 1 #define __CIDR(cidr, i) (cidr[i]) @@ -1525,7 +1511,11 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, if (!h) return -ENOMEM; - hbits = htable_bits(hashsize); + /* Compute htable_bits from the user input parameter hashsize. + * Assume that hashsize == 2^htable_bits, + * otherwise round up to the first 2^n value. + */ + hbits = fls(hashsize - 1); hsize = htable_size(hbits); if (hsize == 0) { kfree(h); -- cgit v1.2.3 From 8bee683384087a6275c9183a483435225f7bb209 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 14 Dec 2020 09:51:27 +0100 Subject: xsk: Fix memory leak for failed bind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a possible memory leak when a bind of an AF_XDP socket fails. When the fill and completion rings are created, they are tied to the socket. But when the buffer pool is later created at bind time, the ownership of these two rings are transferred to the buffer pool as they might be shared between sockets (and the buffer pool cannot be created until we know what we are binding to). So, before the buffer pool is created, these two rings are cleaned up with the socket, and after they have been transferred they are cleaned up together with the buffer pool. The problem is that ownership was transferred before it was absolutely certain that the buffer pool could be created and initialized correctly and when one of these errors occurred, the fill and completion rings did neither belong to the socket nor the pool and where therefore leaked. Solve this by moving the ownership transfer to the point where the buffer pool has been completely set up and there is no way it can fail. Fixes: 7361f9c3d719 ("xsk: Move fill and completion rings to buffer pool") Reported-by: syzbot+cfa88ddd0655afa88763@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20201214085127.3960-1-magnus.karlsson@gmail.com --- net/xdp/xsk.c | 4 ++++ net/xdp/xsk_buff_pool.c | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ac4a317038f1..c6532d77fde7 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -878,6 +878,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } } + /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */ + xs->fq_tmp = NULL; + xs->cq_tmp = NULL; + xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 67a4494d63b6..818b75060922 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -75,8 +75,6 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->fq = xs->fq_tmp; pool->cq = xs->cq_tmp; - xs->fq_tmp = NULL; - xs->cq_tmp = NULL; for (i = 0; i < pool->free_heads_cnt; i++) { xskb = &pool->heads[i]; -- cgit v1.2.3 From f09ced4053bc0a2094a12b60b646114c966ef4c6 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 18 Dec 2020 14:45:24 +0100 Subject: xsk: Fix race in SKB mode transmit with shared cq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a race when multiple sockets are simultaneously calling sendto() when the completion ring is shared in the SKB case. This is the case when you share the same netdev and queue id through the XDP_SHARED_UMEM bind flag. The problem is that multiple processes can be in xsk_generic_xmit() and call the backpressure mechanism in xskq_prod_reserve(xs->pool->cq). As this is a shared resource in this specific scenario, a race might occur since the rings are single-producer single-consumer. Fix this by moving the tx_completion_lock from the socket to the pool as the pool is shared between the sockets that share the completion ring. (The pool is not shared when this is not the case.) And then protect the accesses to xskq_prod_reserve() with this lock. The tx_completion_lock is renamed cq_lock to better reflect that it protects accesses to the potentially shared completion ring. Fixes: 35fcde7f8deb ("xsk: support for Tx") Reported-by: Xuan Zhuo Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20201218134525.13119-2-magnus.karlsson@gmail.com --- net/xdp/xsk.c | 9 ++++++--- net/xdp/xsk_buff_pool.c | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c6532d77fde7..d531f9cd0de6 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -423,9 +423,9 @@ static void xsk_destruct_skb(struct sk_buff *skb) struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; - spin_lock_irqsave(&xs->tx_completion_lock, flags); + spin_lock_irqsave(&xs->pool->cq_lock, flags); xskq_prod_submit_addr(xs->pool->cq, addr); - spin_unlock_irqrestore(&xs->tx_completion_lock, flags); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); sock_wfree(skb); } @@ -437,6 +437,7 @@ static int xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; + unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -468,10 +469,13 @@ static int xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ + spin_lock_irqsave(&xs->pool->cq_lock, flags); if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) { + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); kfree_skb(skb); goto out; } + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb->dev = xs->dev; skb->priority = sk->sk_priority; @@ -1303,7 +1307,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs->state = XSK_READY; mutex_init(&xs->mutex); spin_lock_init(&xs->rx_lock); - spin_lock_init(&xs->tx_completion_lock); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 818b75060922..20598eea658c 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -71,6 +71,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); + spin_lock_init(&pool->cq_lock); refcount_set(&pool->users, 1); pool->fq = xs->fq_tmp; -- cgit v1.2.3 From b1b95cb5c0a9694d47d5f845ba97e226cfda957d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 18 Dec 2020 14:45:25 +0100 Subject: xsk: Rollback reservation at NETDEV_TX_BUSY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rollback the reservation in the completion ring when we get a NETDEV_TX_BUSY. When this error is received from the driver, we are supposed to let the user application retry the transmit again. And in order to do this, we need to roll back the failed send so it can be retried. Unfortunately, we did not cancel the reservation we had made in the completion ring. By not doing this, we actually make the completion ring one entry smaller per NETDEV_TX_BUSY error we get, and after enough of these errors the completion ring will be of size zero and transmit will stop working. Fix this by cancelling the reservation when we get a NETDEV_TX_BUSY error. Fixes: 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY") Reported-by: Xuan Zhuo Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20201218134525.13119-3-magnus.karlsson@gmail.com --- net/xdp/xsk.c | 3 +++ net/xdp/xsk_queue.h | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d531f9cd0de6..8037b04a9edd 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -487,6 +487,9 @@ static int xsk_generic_xmit(struct sock *sk) if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ skb->destructor = sock_wfree; + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel(xs->pool->cq); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); /* Free skb without triggering the perf drop trace */ consume_skb(skb); err = -EAGAIN; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 4a9663aa7afe..2823b7c3302d 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -334,6 +334,11 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q) return xskq_prod_nb_free(q, 1) ? false : true; } +static inline void xskq_prod_cancel(struct xsk_queue *q) +{ + q->cached_prod--; +} + static inline int xskq_prod_reserve(struct xsk_queue *q) { if (xskq_prod_is_full(q)) -- cgit v1.2.3 From abdcd06c4dedbcabaec68c433c7f53f33307811f Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Wed, 16 Dec 2020 09:28:04 +0200 Subject: net: af_packet: fix procfs header for 64-bit pointers On 64-bit systems the packet procfs header field names following 'sk' are not aligned correctly: sk RefCnt Type Proto Iface R Rmem User Inode 00000000605d2c64 3 3 0003 7 1 450880 0 16643 00000000080e9b80 2 2 0000 0 0 0 0 17404 00000000b23b8a00 2 2 0000 0 0 0 0 17421 ... With this change field names are correctly aligned: sk RefCnt Type Proto Iface R Rmem User Inode 000000005c3b1d97 3 3 0003 7 1 21568 0 16178 000000007be55bb7 3 3 fbce 8 1 0 0 16250 00000000be62127d 3 3 fbcd 8 1 0 0 16254 ... Signed-off-by: Baruch Siach Link: https://lore.kernel.org/r/54917251d8433735d9a24e935a6cb8eb88b4058a.1608103684.git.baruch@tkos.co.il Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index de8e8dbbdeb8..6bbc7a448593 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4595,7 +4595,9 @@ static void packet_seq_stop(struct seq_file *seq, void *v) static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) - seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); + seq_printf(seq, + "%*sRefCnt Type Proto Iface R Rmem User Inode\n", + IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); -- cgit v1.2.3 From 698285da79f5b0b099db15a37ac661ac408c80eb Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 17 Dec 2020 22:29:46 +0100 Subject: net/sched: sch_taprio: ensure to reset/destroy all child qdiscs taprio_graft() can insert a NULL element in the array of child qdiscs. As a consquence, taprio_reset() might not reset child qdiscs completely, and taprio_destroy() might leak resources. Fix it by ensuring that loops that iterate over q->qdiscs[] don't end when they find the first NULL item. Fixes: 44d4775ca518 ("net/sched: sch_taprio: reset child qdiscs before freeing them") Fixes: 5a781ccbd19e ("tc: Add support for configuring the taprio scheduler") Suggested-by: Jakub Kicinski Signed-off-by: Davide Caratti Link: https://lore.kernel.org/r/13edef6778fef03adc751582562fba4a13e06d6a.1608240532.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_taprio.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index c74817ec9964..6f775275826a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1605,8 +1605,9 @@ static void taprio_reset(struct Qdisc *sch) hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { - for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) - qdisc_reset(q->qdiscs[i]); + for (i = 0; i < dev->num_tx_queues; i++) + if (q->qdiscs[i]) + qdisc_reset(q->qdiscs[i]); } sch->qstats.backlog = 0; sch->q.qlen = 0; @@ -1626,7 +1627,7 @@ static void taprio_destroy(struct Qdisc *sch) taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { - for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) + for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); -- cgit v1.2.3 From 826f328e2b7e8854dd42ea44e6519cd75018e7b1 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 22 Dec 2020 22:49:44 +0100 Subject: net: dcb: Validate netlink message in DCB handler DCB uses the same handler function for both RTM_GETDCB and RTM_SETDCB messages. dcb_doit() bounces RTM_SETDCB mesasges if the user does not have the CAP_NET_ADMIN capability. However, the operation to be performed is not decided from the DCB message type, but from the DCB command. Thus DCB_CMD_*_GET commands are used for reading DCB objects, the corresponding SET and DEL commands are used for manipulation. The assumption is that set-like commands will be sent via an RTM_SETDCB message, and get-like ones via RTM_GETDCB. However, this assumption is not enforced. It is therefore possible to manipulate DCB objects without CAP_NET_ADMIN capability by sending the corresponding command in an RTM_GETDCB message. That is a bug. Fix it by validating the type of the request message against the type used for the response. Fixes: 2f90b8657ec9 ("ixgbe: this patch adds support for DCB to the kernel and ixgbe driver") Signed-off-by: Petr Machata Link: https://lore.kernel.org/r/a2a9b88418f3a58ef211b718f2970128ef9e3793.1608673640.git.me@pmachata.org Signed-off-by: Jakub Kicinski --- net/dcb/dcbnl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 084e159a12ba..7d49b6fd6cef 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1765,6 +1765,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; + if (fn->type != nlh->nlmsg_type) + return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; -- cgit v1.2.3 From 427c940558560bff2583d07fc119a21094675982 Mon Sep 17 00:00:00 2001 From: John Wang Date: Wed, 23 Dec 2020 13:55:23 +0800 Subject: net/ncsi: Use real net-device for response handler When aggregating ncsi interfaces and dedicated interfaces to bond interfaces, the ncsi response handler will use the wrong net device to find ncsi_dev, so that the ncsi interface will not work properly. Here, we use the original net device to fix it. Fixes: 138635cc27c9 ("net/ncsi: NCSI response packet handler") Signed-off-by: John Wang Link: https://lore.kernel.org/r/20201223055523.2069-1-wangzhiqiang.bj@bytedance.com Signed-off-by: Jakub Kicinski --- net/ncsi/ncsi-rsp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 5b1f4ec66dd9..888ccc2d4e34 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1120,7 +1120,7 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, int payload, i, ret; /* Find the NCSI device */ - nd = ncsi_find_dev(dev); + nd = ncsi_find_dev(orig_dev); ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; if (!ndp) return -ENODEV; -- cgit v1.2.3 From 6cb56218ad9e580e519dcd23bfb3db08d8692e5a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Dec 2020 23:23:56 +0100 Subject: netfilter: xt_RATEEST: reject non-null terminated string from userspace syzbot reports: detected buffer overflow in strlen [..] Call Trace: strlen include/linux/string.h:325 [inline] strlcpy include/linux/string.h:348 [inline] xt_rateest_tg_checkentry+0x2a5/0x6b0 net/netfilter/xt_RATEEST.c:143 strlcpy assumes src is a c-string. Check info->name before its used. Reported-by: syzbot+e86f7c428c8c50db65b4@syzkaller.appspotmail.com Fixes: 5859034d7eb8793 ("[NETFILTER]: x_tables: add RATEEST target") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_RATEEST.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 37253d399c6b..0d5c422f8745 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -115,6 +115,9 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) } cfg; int ret; + if (strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) + return -ENAMETOOLONG; + net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); -- cgit v1.2.3 From 95cd4bca7b1f4a25810f3ddfc5e767fb46931789 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 27 Dec 2020 12:33:44 +0100 Subject: netfilter: nft_dynset: report EOPNOTSUPP on missing set feature If userspace requests a feature which is not available the original set definition, then bail out with EOPNOTSUPP. If userspace sends unsupported dynset flags (new feature not supported by this kernel), then report EOPNOTSUPP to userspace. EINVAL should be only used to report malformed netlink messages from userspace. Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_dynset.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 983a1d5ca3ab..f35df221a633 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -177,7 +177,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); if (flags & ~NFT_DYNSET_F_INV) - return -EINVAL; + return -EOPNOTSUPP; if (flags & NFT_DYNSET_F_INV) priv->invert = true; } @@ -210,7 +210,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { if (!(set->flags & NFT_SET_TIMEOUT)) - return -EINVAL; + return -EOPNOTSUPP; err = nf_msecs_to_jiffies64(tb[NFTA_DYNSET_TIMEOUT], &timeout); if (err) @@ -224,7 +224,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_SREG_DATA] != NULL) { if (!(set->flags & NFT_SET_MAP)) - return -EINVAL; + return -EOPNOTSUPP; if (set->dtype == NFT_DATA_VERDICT) return -EOPNOTSUPP; -- cgit v1.2.3 From b4e70d8dd9ea6bd5d5fb3122586f652326ca09cd Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 27 Dec 2020 12:35:43 +0100 Subject: netfilter: nftables: add set expression flags The set flag NFT_SET_EXPR provides a hint to the kernel that userspace supports for multiple expressions per set element. In the same direction, NFT_DYNSET_F_EXPR specifies that dynset expression defines multiple expressions per set element. This allows new userspace software with old kernels to bail out with EOPNOTSUPP. This update is similar to ef516e8625dd ("netfilter: nf_tables: reintroduce the NFT_SET_CONCAT flag"). The NFT_SET_EXPR flag needs to be set on when the NFTA_SET_EXPRESSIONS attribute is specified. The NFT_SET_EXPR flag is not set on with NFTA_SET_EXPR to retain backward compatibility in old userspace binaries. Fixes: 48b0ae046ee9 ("netfilter: nftables: netlink support for several set element expressions") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 +++++- net/netfilter/nft_dynset.c | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4186b1e52d58..15c467f1a9dd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4162,7 +4162,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | - NFT_SET_OBJECT | NFT_SET_CONCAT)) + NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR)) return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == @@ -4304,6 +4304,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nlattr *tmp; int left; + if (!(flags & NFT_SET_EXPR)) { + err = -EINVAL; + goto err_set_alloc_name; + } i = 0; nla_for_each_nested(tmp, nla[NFTA_SET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index f35df221a633..0b053f75cd60 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -19,6 +19,7 @@ struct nft_dynset { enum nft_registers sreg_key:8; enum nft_registers sreg_data:8; bool invert; + bool expr; u8 num_exprs; u64 timeout; struct nft_expr *expr_array[NFT_SET_EXPR_MAX]; @@ -175,11 +176,12 @@ static int nft_dynset_init(const struct nft_ctx *ctx, if (tb[NFTA_DYNSET_FLAGS]) { u32 flags = ntohl(nla_get_be32(tb[NFTA_DYNSET_FLAGS])); - - if (flags & ~NFT_DYNSET_F_INV) + if (flags & ~(NFT_DYNSET_F_INV | NFT_DYNSET_F_EXPR)) return -EOPNOTSUPP; if (flags & NFT_DYNSET_F_INV) priv->invert = true; + if (flags & NFT_DYNSET_F_EXPR) + priv->expr = true; } set = nft_set_lookup_global(ctx->net, ctx->table, @@ -261,6 +263,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, struct nlattr *tmp; int left; + if (!priv->expr) + return -EINVAL; + i = 0; nla_for_each_nested(tmp, tb[NFTA_DYNSET_EXPRESSIONS], left) { if (i == NFT_SET_EXPR_MAX) { -- cgit v1.2.3 From 1ad58225dba3f2f598d2c6daed4323f24547168f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 23 Dec 2020 22:23:20 +0100 Subject: net-sysfs: take the rtnl lock when storing xps_cpus Two race conditions can be triggered when storing xps cpus, resulting in various oops and invalid memory accesses: 1. Calling netdev_set_num_tc while netif_set_xps_queue: - netif_set_xps_queue uses dev->tc_num as one of the parameters to compute the size of new_dev_maps when allocating it. dev->tc_num is also used to access the map, and the compiler may generate code to retrieve this field multiple times in the function. - netdev_set_num_tc sets dev->tc_num. If new_dev_maps is allocated using dev->tc_num and then dev->tc_num is set to a higher value through netdev_set_num_tc, later accesses to new_dev_maps in netif_set_xps_queue could lead to accessing memory outside of new_dev_maps; triggering an oops. 2. Calling netif_set_xps_queue while netdev_set_num_tc is running: 2.1. netdev_set_num_tc starts by resetting the xps queues, dev->tc_num isn't updated yet. 2.2. netif_set_xps_queue is called, setting up the map with the *old* dev->num_tc. 2.3. netdev_set_num_tc updates dev->tc_num. 2.4. Later accesses to the map lead to out of bound accesses and oops. A similar issue can be found with netdev_reset_tc. One way of triggering this is to set an iface up (for which the driver uses netdev_set_num_tc in the open path, such as bnx2x) and writing to xps_cpus in a concurrent thread. With the right timing an oops is triggered. Both issues have the same fix: netif_set_xps_queue, netdev_set_num_tc and netdev_reset_tc should be mutually exclusive. We do that by taking the rtnl lock in xps_cpus_store. Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes") Signed-off-by: Antoine Tenart Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 999b70c59761..7cc15dec1717 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1396,7 +1396,13 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } + if (!rtnl_trylock()) { + free_cpumask_var(mask); + return restart_syscall(); + } + err = netif_set_xps_queue(dev, mask, index); + rtnl_unlock(); free_cpumask_var(mask); -- cgit v1.2.3 From fb25038586d0064123e393cadf1fadd70a9df97a Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 23 Dec 2020 22:23:21 +0100 Subject: net-sysfs: take the rtnl lock when accessing xps_cpus_map and num_tc Accesses to dev->xps_cpus_map (when using dev->num_tc) should be protected by the rtnl lock, like we do for netif_set_xps_queue. I didn't see an actual bug being triggered, but let's be safe here and take the rtnl lock while accessing the map in sysfs. Fixes: 184c449f91fe ("net: Add support for XPS with QoS via traffic classes") Signed-off-by: Antoine Tenart Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7cc15dec1717..65886bfbf822 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1317,8 +1317,8 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { + int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; - int cpu, len, num_tc = 1, tc = 0; struct xps_dev_maps *dev_maps; cpumask_var_t mask; unsigned long index; @@ -1328,22 +1328,31 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { /* Do not allow XPS on subordinate device directly */ num_tc = dev->num_tc; - if (num_tc < 0) - return -EINVAL; + if (num_tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); @@ -1366,9 +1375,15 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_unlock(); + rtnl_unlock(); + len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); free_cpumask_var(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_cpus_store(struct netdev_queue *queue, -- cgit v1.2.3 From 2d57b4f142e0b03e854612b8e28978935414bced Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 23 Dec 2020 22:23:22 +0100 Subject: net-sysfs: take the rtnl lock when storing xps_rxqs Two race conditions can be triggered when storing xps rxqs, resulting in various oops and invalid memory accesses: 1. Calling netdev_set_num_tc while netif_set_xps_queue: - netif_set_xps_queue uses dev->tc_num as one of the parameters to compute the size of new_dev_maps when allocating it. dev->tc_num is also used to access the map, and the compiler may generate code to retrieve this field multiple times in the function. - netdev_set_num_tc sets dev->tc_num. If new_dev_maps is allocated using dev->tc_num and then dev->tc_num is set to a higher value through netdev_set_num_tc, later accesses to new_dev_maps in netif_set_xps_queue could lead to accessing memory outside of new_dev_maps; triggering an oops. 2. Calling netif_set_xps_queue while netdev_set_num_tc is running: 2.1. netdev_set_num_tc starts by resetting the xps queues, dev->tc_num isn't updated yet. 2.2. netif_set_xps_queue is called, setting up the map with the *old* dev->num_tc. 2.3. netdev_set_num_tc updates dev->tc_num. 2.4. Later accesses to the map lead to out of bound accesses and oops. A similar issue can be found with netdev_reset_tc. One way of triggering this is to set an iface up (for which the driver uses netdev_set_num_tc in the open path, such as bnx2x) and writing to xps_rxqs in a concurrent thread. With the right timing an oops is triggered. Both issues have the same fix: netif_set_xps_queue, netdev_set_num_tc and netdev_reset_tc should be mutually exclusive. We do that by taking the rtnl lock in xps_rxqs_store. Fixes: 8af2c06ff4b1 ("net-sysfs: Add interface for Rx queue(s) map per Tx queue") Signed-off-by: Antoine Tenart Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 65886bfbf822..62ca2f2c0ee6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1499,10 +1499,17 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } + if (!rtnl_trylock()) { + bitmap_free(mask); + return restart_syscall(); + } + cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, true); cpus_read_unlock(); + rtnl_unlock(); + bitmap_free(mask); return err ? : len; } -- cgit v1.2.3 From 4ae2bb81649dc03dfc95875f02126b14b773f7ab Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 23 Dec 2020 22:23:23 +0100 Subject: net-sysfs: take the rtnl lock when accessing xps_rxqs_map and num_tc Accesses to dev->xps_rxqs_map (when using dev->num_tc) should be protected by the rtnl lock, like we do for netif_set_xps_queue. I didn't see an actual bug being triggered, but let's be safe here and take the rtnl lock while accessing the map in sysfs. Fixes: 8af2c06ff4b1 ("net-sysfs: Add interface for Rx queue(s) map per Tx queue") Signed-off-by: Antoine Tenart Reviewed-by: Alexander Duyck Signed-off-by: Jakub Kicinski --- net/core/net-sysfs.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 62ca2f2c0ee6..daf502c13d6d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1429,22 +1429,29 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { + int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; unsigned long *mask, index; - int j, len, num_tc = 1, tc = 0; index = get_netdev_queue_index(queue); + if (!rtnl_trylock()) + return restart_syscall(); + if (dev->num_tc) { num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); - if (tc < 0) - return -EINVAL; + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; + } } mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); - if (!mask) - return -ENOMEM; + if (!mask) { + ret = -ENOMEM; + goto err_rtnl_unlock; + } rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); @@ -1470,10 +1477,16 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) out_no_maps: rcu_read_unlock(); + rtnl_unlock(); + len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; + +err_rtnl_unlock: + rtnl_unlock(); + return ret; } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, -- cgit v1.2.3 From e7579d5d5b3298f7e888ed07ac16bfb7174c135a Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 21 Dec 2020 22:07:25 +0100 Subject: net: mptcp: cap forward allocation to 1M the following syzkaller reproducer: r0 = socket$inet_mptcp(0x2, 0x1, 0x106) bind$inet(r0, &(0x7f0000000080)={0x2, 0x4e24, @multicast2}, 0x10) connect$inet(r0, &(0x7f0000000480)={0x2, 0x4e24, @local}, 0x10) sendto$inet(r0, &(0x7f0000000100)="f6", 0xffffffe7, 0xc000, 0x0, 0x0) systematically triggers the following warning: WARNING: CPU: 2 PID: 8618 at net/core/stream.c:208 sk_stream_kill_queues+0x3fa/0x580 Modules linked in: CPU: 2 PID: 8618 Comm: syz-executor Not tainted 5.10.0+ #334 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/04 RIP: 0010:sk_stream_kill_queues+0x3fa/0x580 Code: df 48 c1 ea 03 0f b6 04 02 84 c0 74 04 3c 03 7e 40 8b ab 20 02 00 00 e9 64 ff ff ff e8 df f0 81 2 RSP: 0018:ffffc9000290fcb0 EFLAGS: 00010293 RAX: ffff888011cb8000 RBX: 0000000000000000 RCX: ffffffff86eecf0e RDX: 0000000000000000 RSI: ffffffff86eecf6a RDI: 0000000000000005 RBP: 0000000000000e28 R08: ffff888011cb8000 R09: fffffbfff1f48139 R10: ffffffff8fa409c7 R11: fffffbfff1f48138 R12: ffff8880215e6220 R13: ffffffff8fa409c0 R14: ffffc9000290fd30 R15: 1ffff92000521fa2 FS: 00007f41c78f4800(0000) GS:ffff88802d000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f95c803d088 CR3: 0000000025ed2000 CR4: 00000000000006f0 Call Trace: __mptcp_destroy_sock+0x4f5/0x8e0 mptcp_close+0x5e2/0x7f0 inet_release+0x12b/0x270 __sock_release+0xc8/0x270 sock_close+0x18/0x20 __fput+0x272/0x8e0 task_work_run+0xe0/0x1a0 exit_to_user_mode_prepare+0x1df/0x200 syscall_exit_to_user_mode+0x19/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 userspace programs provide arbitrarily high values of 'len' in sendmsg(): this is causing integer overflow of 'amount'. Cap forward allocation to 1 megabyte: higher values are not really useful. Suggested-by: Paolo Abeni Fixes: e93da92896bc ("mptcp: implement wmem reservation") Signed-off-by: Davide Caratti Link: https://lore.kernel.org/r/3334d00d8b2faecafdfab9aa593efcbf61442756.1608584474.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 09b19aa2f205..6628d8d74203 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -877,6 +877,9 @@ static void __mptcp_wmem_reserve(struct sock *sk, int size) struct mptcp_sock *msk = mptcp_sk(sk); WARN_ON_ONCE(msk->wmem_reserved); + if (WARN_ON_ONCE(amount < 0)) + amount = 0; + if (amount <= sk->sk_forward_alloc) goto reserve; @@ -1587,7 +1590,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) return -EOPNOTSUPP; - mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, len)); + mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); -- cgit v1.2.3 From 21fdca22eb7df2a1e194b8adb812ce370748b733 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 24 Dec 2020 20:01:09 +0100 Subject: ipv4: Ignore ECN bits for fib lookups in fib_compute_spec_dst() RT_TOS() only clears one of the ECN bits. Therefore, when fib_compute_spec_dst() resorts to a fib lookup, it can return different results depending on the value of the second ECN bit. For example, ECT(0) and ECT(1) packets could be treated differently. $ ip netns add ns0 $ ip netns add ns1 $ ip link add name veth01 netns ns0 type veth peer name veth10 netns ns1 $ ip -netns ns0 link set dev lo up $ ip -netns ns1 link set dev lo up $ ip -netns ns0 link set dev veth01 up $ ip -netns ns1 link set dev veth10 up $ ip -netns ns0 address add 192.0.2.10/24 dev veth01 $ ip -netns ns1 address add 192.0.2.11/24 dev veth10 $ ip -netns ns1 address add 192.0.2.21/32 dev lo $ ip -netns ns1 route add 192.0.2.10/32 tos 4 dev veth10 src 192.0.2.21 $ ip netns exec ns1 sysctl -wq net.ipv4.icmp_echo_ignore_broadcasts=0 With TOS 4 and ECT(1), ns1 replies using source address 192.0.2.21 (ping uses -Q to set all TOS and ECN bits): $ ip netns exec ns0 ping -c 1 -b -Q 5 192.0.2.255 [...] 64 bytes from 192.0.2.21: icmp_seq=1 ttl=64 time=0.544 ms But with TOS 4 and ECT(0), ns1 replies using source address 192.0.2.11 because the "tos 4" route isn't matched: $ ip netns exec ns0 ping -c 1 -b -Q 6 192.0.2.255 [...] 64 bytes from 192.0.2.11: icmp_seq=1 ttl=64 time=0.597 ms After this patch the ECN bits don't affect the result anymore: $ ip netns exec ns0 ping -c 1 -b -Q 6 192.0.2.255 [...] 64 bytes from 192.0.2.21: icmp_seq=1 ttl=64 time=0.591 ms Fixes: 35ebf65e851c ("ipv4: Create and use fib_compute_spec_dst() helper.") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cdf6ec5aa45d..84bb707bd88d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -292,7 +292,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_oif = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_tos = ip_hdr(skb)->tos & IPTOS_RT_MASK, .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; -- cgit v1.2.3 From a533b70a657c03137dd49cbcfee70aac086ab2b1 Mon Sep 17 00:00:00 2001 From: weichenchen Date: Fri, 25 Dec 2020 13:44:45 +0800 Subject: net: neighbor: fix a crash caused by mod zero pneigh_enqueue() tries to obtain a random delay by mod NEIGH_VAR(p, PROXY_DELAY). However, NEIGH_VAR(p, PROXY_DELAY) migth be zero at that point because someone could write zero to /proc/sys/net/ipv4/neigh/[device]/proxy_delay after the callers check it. This patch uses prandom_u32_max() to get a random delay instead which avoids potential division by zero. Signed-off-by: weichenchen Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 9500d28a43b0..277ed854aef1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1569,10 +1569,8 @@ static void neigh_proxy_process(struct timer_list *t) void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb) { - unsigned long now = jiffies; - - unsigned long sched_next = now + (prandom_u32() % - NEIGH_VAR(p, PROXY_DELAY)); + unsigned long sched_next = jiffies + + prandom_u32_max(NEIGH_VAR(p, PROXY_DELAY)); if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) { kfree_skb(skb); -- cgit v1.2.3 From bd1248f1ddbc48b0c30565fce897a3b6423313b8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 24 Dec 2020 22:23:44 -0800 Subject: net: sched: prevent invalid Scell_log shift count Check Scell_log shift size in red_check_params() and modify all callers of red_check_params() to pass Scell_log. This prevents a shift out-of-bounds as detected by UBSAN: UBSAN: shift-out-of-bounds in ./include/net/red.h:252:22 shift exponent 72 is too large for 32-bit type 'int' Fixes: 8afa10cbe281 ("net_sched: red: Avoid illegal values") Signed-off-by: Randy Dunlap Reported-by: syzbot+97c5bd9cc81eca63d36e@syzkaller.appspotmail.com Cc: Nogah Frankel Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: netdev@vger.kernel.org Cc: "David S. Miller" Cc: Jakub Kicinski Signed-off-by: David S. Miller --- net/sched/sch_choke.c | 2 +- net/sched/sch_gred.c | 2 +- net/sched/sch_red.c | 2 +- net/sched/sch_sfq.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index bd618b00d319..50f680f03a54 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -362,7 +362,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt, ctl = nla_data(tb[TCA_CHOKE_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; if (ctl->limit > CHOKE_MAX_QUEUE) diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 8599c6f31b05..e0bc77533acc 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -480,7 +480,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp, struct gred_sched *table = qdisc_priv(sch); struct gred_sched_data *q = table->tab[dp]; - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) { + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) { NL_SET_ERR_MSG_MOD(extack, "invalid RED parameters"); return -EINVAL; } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index e89fab6ccb34..b4ae34d7aa96 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -250,7 +250,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, max_P = tb[TCA_RED_MAX_P] ? nla_get_u32(tb[TCA_RED_MAX_P]) : 0; ctl = nla_data(tb[TCA_RED_PARMS]); - if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog)) + if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog, ctl->Scell_log)) return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index bca2be57d9fc..b25e51440623 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -647,7 +647,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) } if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, - ctl_v1->Wlog)) + ctl_v1->Wlog, ctl_v1->Scell_log)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); -- cgit v1.2.3 From 085c7c4e1c0e50d90b7d90f61a12e12b317a91e2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 26 Dec 2020 15:44:53 -0800 Subject: erspan: fix version 1 check in gre_parse_header() Both version 0 and version 1 use ETH_P_ERSPAN, but version 0 does not have an erspan header. So the check in gre_parse_header() is wrong, we have to distinguish version 1 from version 0. We can just check the gre header length like is_erspan_type1(). Fixes: cb73ee40b1b3 ("net: ip_gre: use erspan key field for tunnel lookup") Reported-by: syzbot+f583ce3d4ddf9836b27a@syzkaller.appspotmail.com Cc: William Tu Cc: Lorenzo Bianconi Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/ipv4/gre_demux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 66fdbfe5447c..5d1e6fe9d838 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -128,7 +128,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, * to 0 and sets the configured key in the * inner erspan header field */ - if (greh->protocol == htons(ETH_P_ERSPAN) || + if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) || greh->protocol == htons(ETH_P_ERSPAN2)) { struct erspan_base_hdr *ershdr; -- cgit v1.2.3 From b40f97b91a3b167ab22c9e9f1ef00b1615ff01e9 Mon Sep 17 00:00:00 2001 From: Xie He Date: Thu, 31 Dec 2020 09:43:31 -0800 Subject: net: lapb: Decrease the refcount of "struct lapb_cb" in lapb_device_event In lapb_device_event, lapb_devtostruct is called to get a reference to an object of "struct lapb_cb". lapb_devtostruct increases the refcount of the object and returns a pointer to it. However, we didn't decrease the refcount after we finished using the pointer. This patch fixes this problem. Fixes: a4989fa91110 ("net/lapb: support netdev events") Cc: Martin Schiller Signed-off-by: Xie He Link: https://lore.kernel.org/r/20201231174331.64539-1-xie.he.0141@gmail.com Signed-off-by: Jakub Kicinski --- net/lapb/lapb_iface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 213ea7abc9ab..40961889e9c0 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -489,6 +489,7 @@ static int lapb_device_event(struct notifier_block *this, unsigned long event, break; } + lapb_put(lapb); return NOTIFY_DONE; } -- cgit v1.2.3