summaryrefslogtreecommitdiffstats
path: root/net/xdp/xsk.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2019-12-27 14:20:10 -0800
committerDavid S. Miller <davem@davemloft.net>2019-12-27 14:20:10 -0800
commit2bbc078f812d45b8decb55935dab21199bd21489 (patch)
treeb217e030e0f80a26561cef679e8ae2643162b346 /net/xdp/xsk.c
parent9e41fbf3dd38327d440a8f3ba0c234519dbb5280 (diff)
parent7c8dce4b166113743adad131b5a24c4acc12f92c (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says: ==================== pull-request: bpf-next 2019-12-27 The following pull-request contains BPF updates for your *net-next* tree. We've added 127 non-merge commits during the last 17 day(s) which contain a total of 110 files changed, 6901 insertions(+), 2721 deletions(-). There are three merge conflicts. Conflicts and resolution looks as follows: 1) Merge conflict in net/bpf/test_run.c: There was a tree-wide cleanup c593642c8be0 ("treewide: Use sizeof_field() macro") which gets in the way with b590cb5f802d ("bpf: Switch to offsetofend in BPF_PROG_TEST_RUN"): <<<<<<< HEAD if (!range_is_zero(__skb, offsetof(struct __sk_buff, priority) + sizeof_field(struct __sk_buff, priority), ======= if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), >>>>>>> 7c8dce4b166113743adad131b5a24c4acc12f92c There are a few occasions that look similar to this. Always take the chunk with offsetofend(). Note that there is one where the fields differ in here: <<<<<<< HEAD if (!range_is_zero(__skb, offsetof(struct __sk_buff, tstamp) + sizeof_field(struct __sk_buff, tstamp), ======= if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), >>>>>>> 7c8dce4b166113743adad131b5a24c4acc12f92c Just take the one with offsetofend() /and/ gso_segs. Latter is correct due to 850a88cc4096 ("bpf: Expose __sk_buff wire_len/gso_segs to BPF_PROG_TEST_RUN"). 2) Merge conflict in arch/riscv/net/bpf_jit_comp.c: (I'm keeping Bjorn in Cc here for a double-check in case I got it wrong.) <<<<<<< HEAD if (is_13b_check(off, insn)) return -1; emit(rv_blt(tcc, RV_REG_ZERO, off >> 1), ctx); ======= emit_branch(BPF_JSLT, RV_REG_T1, RV_REG_ZERO, off, ctx); >>>>>>> 7c8dce4b166113743adad131b5a24c4acc12f92c Result should look like: emit_branch(BPF_JSLT, tcc, RV_REG_ZERO, off, ctx); 3) Merge conflict in arch/riscv/include/asm/pgtable.h: <<<<<<< HEAD ======= #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE (SZ_128M) #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) #define BPF_JIT_REGION_END (VMALLOC_END) /* * Roughly size the vmemmap space to be large enough to fit enough * struct pages to map half the virtual address space. Then * position vmemmap directly below the VMALLOC region. */ #define VMEMMAP_SHIFT \ (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) #define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT) #define VMEMMAP_END (VMALLOC_START - 1) #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) #define vmemmap ((struct page *)VMEMMAP_START) >>>>>>> 7c8dce4b166113743adad131b5a24c4acc12f92c Only take the BPF_* defines from there and move them higher up in the same file. Remove the rest from the chunk. The VMALLOC_* etc defines got moved via 01f52e16b868 ("riscv: define vmemmap before pfn_to_page calls"). Result: [...] #define __S101 PAGE_READ_EXEC #define __S110 PAGE_SHARED_EXEC #define __S111 PAGE_SHARED_EXEC #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE (SZ_128M) #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) #define BPF_JIT_REGION_END (VMALLOC_END) /* * Roughly size the vmemmap space to be large enough to fit enough * struct pages to map half the virtual address space. Then * position vmemmap directly below the VMALLOC region. */ #define VMEMMAP_SHIFT \ (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) #define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT) #define VMEMMAP_END (VMALLOC_START - 1) #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) [...] Let me know if there are any other issues. Anyway, the main changes are: 1) Extend bpftool to produce a struct (aka "skeleton") tailored and specific to a provided BPF object file. This provides an alternative, simplified API compared to standard libbpf interaction. Also, add libbpf extern variable resolution for .kconfig section to import Kconfig data, from Andrii Nakryiko. 2) Add BPF dispatcher for XDP which is a mechanism to avoid indirect calls by generating a branch funnel as discussed back in bpfconf'19 at LSF/MM. Also, add various BPF riscv JIT improvements, from Björn Töpel. 3) Extend bpftool to allow matching BPF programs and maps by name, from Paul Chaignon. 4) Support for replacing cgroup BPF programs attached with BPF_F_ALLOW_MULTI flag for allowing updates without service interruption, from Andrey Ignatov. 5) Cleanup and simplification of ring access functions for AF_XDP with a bonus of 0-5% performance improvement, from Magnus Karlsson. 6) Enable BPF JITs for x86-64 and arm64 by default. Also, final version of audit support for BPF, from Daniel Borkmann and latter with Jiri Olsa. 7) Move and extend test_select_reuseport into BPF program tests under BPF selftests, from Jakub Sitnicki. 8) Various BPF sample improvements for xdpsock for customizing parameters to set up and benchmark AF_XDP, from Jay Jayatheerthan. 9) Improve libbpf to provide a ulimit hint on permission denied errors. Also change XDP sample programs to attach in driver mode by default, from Toke Høiland-Jørgensen. 10) Extend BPF test infrastructure to allow changing skb mark from tc BPF programs, from Nikita V. Shirokov. 11) Optimize prologue code sequence in BPF arm32 JIT, from Russell King. 12) Fix xdp_redirect_cpu BPF sample to manually attach to tracepoints after libbpf conversion, from Jesper Dangaard Brouer. 13) Minor misc improvements from various others. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/xdp/xsk.c')
-rw-r--r--net/xdp/xsk.c79
1 files changed, 45 insertions, 34 deletions
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 328f661b83b2..02ada7ab8c6e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -31,6 +31,8 @@
#define TX_BATCH_SIZE 16
+static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
+
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
@@ -39,21 +41,21 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
- return xskq_has_addrs(umem->fq, cnt);
+ return xskq_cons_has_entries(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);
-u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
+bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
- return xskq_peek_addr(umem->fq, addr, umem);
+ return xskq_cons_peek_addr(umem->fq, addr, umem);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);
-void xsk_umem_discard_addr(struct xdp_umem *umem)
+void xsk_umem_release_addr(struct xdp_umem *umem)
{
- xskq_discard_addr(umem->fq);
+ xskq_cons_release(umem->fq);
}
-EXPORT_SYMBOL(xsk_umem_discard_addr);
+EXPORT_SYMBOL(xsk_umem_release_addr);
void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
@@ -124,7 +126,7 @@ static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
void *to_buf = xdp_umem_get_data(umem, addr);
addr = xsk_umem_add_offset_to_addr(addr);
- if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
+ if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) {
void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
u64 page_start = addr & ~(PAGE_SIZE - 1);
u64 first_len = PAGE_SIZE - (addr - page_start);
@@ -146,7 +148,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
u32 metalen;
int err;
- if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
+ if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
xs->rx_dropped++;
return -ENOSPC;
@@ -165,9 +167,9 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
offset += metalen;
addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
- err = xskq_produce_batch_desc(xs->rx, addr, len);
+ err = xskq_prod_reserve_desc(xs->rx, addr, len);
if (!err) {
- xskq_discard_addr(xs->umem->fq);
+ xskq_cons_release(xs->umem->fq);
xdp_return_buff(xdp);
return 0;
}
@@ -178,7 +180,7 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
- int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
+ int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len);
if (err)
xs->rx_dropped++;
@@ -214,7 +216,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
static void xsk_flush(struct xdp_sock *xs)
{
- xskq_produce_flush_desc(xs->rx);
+ xskq_prod_submit(xs->rx);
xs->sk.sk_data_ready(&xs->sk);
}
@@ -234,7 +236,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
goto out_unlock;
}
- if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
+ if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
err = -ENOSPC;
goto out_drop;
@@ -245,12 +247,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
memcpy(buffer, xdp->data_meta, len + metalen);
addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
- err = xskq_produce_batch_desc(xs->rx, addr, len);
+ err = xskq_prod_reserve_desc(xs->rx, addr, len);
if (err)
goto out_drop;
- xskq_discard_addr(xs->umem->fq);
- xskq_produce_flush_desc(xs->rx);
+ xskq_cons_release(xs->umem->fq);
+ xskq_prod_submit(xs->rx);
spin_unlock_bh(&xs->rx_lock);
@@ -264,11 +266,9 @@ out_unlock:
return err;
}
-int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
- struct xdp_sock *xs)
+int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
int err;
err = xsk_rcv(xs, xdp);
@@ -281,10 +281,9 @@ int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp,
return 0;
}
-void __xsk_map_flush(struct bpf_map *map)
+void __xsk_map_flush(void)
{
- struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct list_head *flush_list = this_cpu_ptr(m->flush_list);
+ struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
struct xdp_sock *xs, *tmp;
list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
@@ -295,7 +294,7 @@ void __xsk_map_flush(struct bpf_map *map)
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
- xskq_produce_flush_addr_n(umem->cq, nb_entries);
+ xskq_prod_submit_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);
@@ -317,13 +316,18 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
rcu_read_lock();
list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
- if (!xskq_peek_desc(xs->tx, desc, umem))
+ if (!xskq_cons_peek_desc(xs->tx, desc, umem))
continue;
- if (xskq_produce_addr_lazy(umem->cq, desc->addr))
+ /* This is the backpreassure mechanism for the Tx path.
+ * Reserve space in the completion queue and only proceed
+ * if there is space in it. This avoids having to implement
+ * any buffering in the Tx path.
+ */
+ if (xskq_prod_reserve_addr(umem->cq, desc->addr))
goto out;
- xskq_discard_desc(xs->tx);
+ xskq_cons_release(xs->tx);
rcu_read_unlock();
return true;
}
@@ -358,7 +362,7 @@ static void xsk_destruct_skb(struct sk_buff *skb)
unsigned long flags;
spin_lock_irqsave(&xs->tx_completion_lock, flags);
- WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
+ xskq_prod_submit_addr(xs->umem->cq, addr);
spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
sock_wfree(skb);
@@ -378,7 +382,7 @@ static int xsk_generic_xmit(struct sock *sk)
if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
- while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
+ while (xskq_cons_peek_desc(xs->tx, &desc, xs->umem)) {
char *buffer;
u64 addr;
u32 len;
@@ -399,7 +403,12 @@ static int xsk_generic_xmit(struct sock *sk)
addr = desc.addr;
buffer = xdp_umem_get_data(xs->umem, addr);
err = skb_store_bits(skb, 0, buffer, len);
- if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
+ /* This is the backpreassure mechanism for the Tx path.
+ * Reserve space in the completion queue and only proceed
+ * if there is space in it. This avoids having to implement
+ * any buffering in the Tx path.
+ */
+ if (unlikely(err) || xskq_prod_reserve(xs->umem->cq)) {
kfree_skb(skb);
goto out;
}
@@ -411,7 +420,7 @@ static int xsk_generic_xmit(struct sock *sk)
skb->destructor = xsk_destruct_skb;
err = dev_direct_xmit(skb, xs->queue_id);
- xskq_discard_desc(xs->tx);
+ xskq_cons_release(xs->tx);
/* Ignore NET_XMIT_CN as packet might have been sent */
if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
/* SKB completed but not sent */
@@ -477,9 +486,9 @@ static __poll_t xsk_poll(struct file *file, struct socket *sock,
__xsk_sendmsg(sk);
}
- if (xs->rx && !xskq_empty_desc(xs->rx))
+ if (xs->rx && !xskq_prod_is_empty(xs->rx))
mask |= EPOLLIN | EPOLLRDNORM;
- if (xs->tx && !xskq_full_desc(xs->tx))
+ if (xs->tx && !xskq_cons_is_full(xs->tx))
mask |= EPOLLOUT | EPOLLWRNORM;
return mask;
@@ -1183,7 +1192,7 @@ static struct pernet_operations xsk_net_ops = {
static int __init xsk_init(void)
{
- int err;
+ int err, cpu;
err = proto_register(&xsk_proto, 0 /* no slab */);
if (err)
@@ -1201,6 +1210,8 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
return 0;
out_pernet: