From 19d8f1ad12fd746e60707a58d954980013c7a35a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Sep 2018 21:53:52 +0200 Subject: if_link: add IFLA_TARGET_NETNSID alias This adds IFLA_TARGET_NETNSID as an alias for IFLA_IF_NETNSID for RTM_*LINK requests. The new name is clearer and also aligns with the newly introduced IFA_TARGET_NETNSID propert for RTM_*ADDR requests. Signed-off-by: Christian Brauner Suggested-by: Nicolas Dichtel Cc: Jiri Benc Signed-off-by: David S. Miller --- tools/include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cf01b6824244..1c73d63068b1 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -161,6 +161,7 @@ enum { IFLA_EVENT, IFLA_NEW_NETNSID, IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, -- cgit v1.2.3 From 52b7b7843d9523ebc3c60c51c7afc4a45cc10aad Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 5 Sep 2018 16:58:03 -0700 Subject: tools/bpf: sync kernel uapi header if_link.h to tools Among others, this header will be used later for bpftool net support. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/if_link.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cf01b6824244..43391e2d1153 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -164,6 +164,8 @@ enum { IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, __IFLA_MAX }; @@ -334,6 +336,7 @@ enum { IFLA_BRPORT_GROUP_FWD_MASK, IFLA_BRPORT_NEIGH_SUPPRESS, IFLA_BRPORT_ISOLATED, + IFLA_BRPORT_BACKUP_PORT, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) @@ -459,6 +462,16 @@ enum { #define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + enum macsec_validation_type { MACSEC_VALIDATE_DISABLED = 0, MACSEC_VALIDATE_CHECK = 1, @@ -920,6 +933,7 @@ enum { XDP_ATTACHED_DRV, XDP_ATTACHED_SKB, XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, }; enum { @@ -928,6 +942,9 @@ enum { IFLA_XDP_ATTACHED, IFLA_XDP_FLAGS, IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, __IFLA_XDP_MAX, }; -- cgit v1.2.3 From 52d0d404d39dd9eac71a181615d6ca15e23d8e38 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 12 Sep 2018 10:04:21 +0800 Subject: geneve: add ttl inherit support Similar with commit 72f6d71e491e6 ("vxlan: add ttl inherit support"), currently ttl == 0 means "use whatever default value" on geneve instead of inherit inner ttl. To respect compatibility with old behavior, let's add a new IFLA_GENEVE_TTL_INHERIT for geneve ttl inherit support. Reported-by: Jianlin Shi Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Reviewed-by: Jiri Benc Signed-off-by: David S. Miller --- tools/include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 1c73d63068b1..141cbfdc5865 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -542,6 +542,7 @@ enum { IFLA_GENEVE_UDP_ZERO_CSUM6_TX, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From 2f965e3fcd4b4159a5ea832d4b5e9ff2550fa571 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:19 -0700 Subject: bpf: sync bpf.h uapi with tools/ This patch syncs tools/include/uapi/linux/bpf.h with the flow dissector definitions from include/uapi/linux/bpf.h Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 66917a4eba27..aa5ccd2385ed 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -152,6 +152,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, }; enum bpf_attach_type { @@ -172,6 +173,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, __MAX_BPF_ATTACH_TYPE }; @@ -2333,6 +2335,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; + struct bpf_flow_keys *flow_keys; }; struct bpf_tunnel_key { @@ -2778,4 +2781,27 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 25025e0aab2f91a93a1557b299a79814559b4dc2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:48 +0000 Subject: bpf: sync include/uapi/linux/bpf.h to tools/include/uapi/linux/bpf.h The sync is required due to the appearance of a new map type: BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, which implements per-cpu cgroup local storage. Signed-off-by: Roman Gushchin Acked-by: Song Liu Cc: Daniel Borkmann Cc: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index aa5ccd2385ed..e2070d819e04 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -127,6 +127,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, }; enum bpf_prog_type { -- cgit v1.2.3 From 6acc9b432e6714d72d7d77ec7c27f6f8358d0c71 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:36 -0700 Subject: bpf: Add helper to retrieve socket in BPF This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and bpf_sk_lookup_udp() which allows BPF programs to find out if there is a socket listening on this host, and returns a socket pointer which the BPF program can then access to determine, for instance, whether to forward or drop traffic. bpf_sk_lookup_xxx() may take a reference on the socket, so when a BPF program makes use of this function, it must subsequently pass the returned pointer into the newly added sk_release() to return the reference. By way of example, the following pseudocode would filter inbound connections at XDP if there is no corresponding service listening for the traffic: struct bpf_sock_tuple tuple; struct bpf_sock_ops *sk; populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0); if (!sk) { // Couldn't find a socket listening for this traffic. Drop. return TC_ACT_SHOT; } bpf_sk_release(sk, 0); return TC_ACT_OK; Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 93 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e2070d819e04..f9187b41dff6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2144,6 +2144,77 @@ union bpf_attr { * request in the skb. * Return * 0 on success, or a negative error in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * int bpf_sk_release(struct bpf_sock *sk) + * Description + * Release the reference held by *sock*. *sock* must be a non-NULL + * pointer that was returned from bpf_sk_lookup_xxx\ (). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2229,7 +2300,10 @@ union bpf_attr { FN(get_current_cgroup_id), \ FN(get_local_storage), \ FN(sk_select_reuseport), \ - FN(skb_ancestor_cgroup_id), + FN(skb_ancestor_cgroup_id), \ + FN(sk_lookup_tcp), \ + FN(sk_lookup_udp), \ + FN(sk_release), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2399,6 +2473,23 @@ struct bpf_sock { */ }; +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. -- cgit v1.2.3 From 421f4292f46e871cdcf8bdc3e6fdfcefe2e81a9d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 16 Oct 2018 15:59:36 +0200 Subject: bpf, tls: add tls header to tools infrastructure Andrey reported a build error for the BPF kselftest suite when compiled on a machine which does not have tls related header bits installed natively: test_sockmap.c:120:23: fatal error: linux/tls.h: No such file or directory #include ^ compilation terminated. Fix it by adding the header to the tools include infrastructure and add definitions such as SOL_TLS that could potentially be missing. Fixes: e9dd904708c4 ("bpf: add tls support for testing in test_sockmap") Reported-by: Andrey Ignatov Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/tls.h | 78 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 tools/include/uapi/linux/tls.h (limited to 'tools/include') diff --git a/tools/include/uapi/linux/tls.h b/tools/include/uapi/linux/tls.h new file mode 100644 index 000000000000..ff02287495ac --- /dev/null +++ b/tools/include/uapi/linux/tls.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */ +/* + * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UAPI_LINUX_TLS_H +#define _UAPI_LINUX_TLS_H + +#include + +/* TLS socket options */ +#define TLS_TX 1 /* Set transmit parameters */ +#define TLS_RX 2 /* Set receive parameters */ + +/* Supported versions */ +#define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) +#define TLS_VERSION_MAJOR(ver) (((ver) >> 8) & 0xFF) + +#define TLS_VERSION_NUMBER(id) ((((id##_VERSION_MAJOR) & 0xFF) << 8) | \ + ((id##_VERSION_MINOR) & 0xFF)) + +#define TLS_1_2_VERSION_MAJOR 0x3 +#define TLS_1_2_VERSION_MINOR 0x3 +#define TLS_1_2_VERSION TLS_VERSION_NUMBER(TLS_1_2) + +/* Supported ciphers */ +#define TLS_CIPHER_AES_GCM_128 51 +#define TLS_CIPHER_AES_GCM_128_IV_SIZE 8 +#define TLS_CIPHER_AES_GCM_128_KEY_SIZE 16 +#define TLS_CIPHER_AES_GCM_128_SALT_SIZE 4 +#define TLS_CIPHER_AES_GCM_128_TAG_SIZE 16 +#define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE 8 + +#define TLS_SET_RECORD_TYPE 1 +#define TLS_GET_RECORD_TYPE 2 + +struct tls_crypto_info { + __u16 version; + __u16 cipher_type; +}; + +struct tls12_crypto_info_aes_gcm_128 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE]; + unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE]; +}; + +#endif /* _UAPI_LINUX_TLS_H */ -- cgit v1.2.3 From b55cbc8d9b44aaee94f19e995a5f241d453763ee Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 17 Oct 2018 16:24:48 +0200 Subject: bpf: fix doc of bpf_skb_adjust_room() in uapi len_diff is signed. Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)") CC: Quentin Monnet Signed-off-by: Nicolas Dichtel Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f9187b41dff6..5e46f6732781 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1433,7 +1433,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. -- cgit v1.2.3 From da4e1b15f67613eca7ae998abd996cfee7cff1ba Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:36 +0200 Subject: Sync uapi/bpf.h to tools/include Sync both files. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 5e46f6732781..a2fb333290dc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -103,6 +103,7 @@ enum bpf_cmd { BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, }; enum bpf_map_type { @@ -128,6 +129,8 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, }; enum bpf_prog_type { @@ -462,6 +465,28 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is removed to + * make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -2303,7 +2328,10 @@ union bpf_attr { FN(skb_ancestor_cgroup_id), \ FN(sk_lookup_tcp), \ FN(sk_lookup_udp), \ - FN(sk_release), + FN(sk_release), \ + FN(map_push_elem), \ + FN(map_pop_elem), \ + FN(map_peek_elem), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 09d62154f61316f7e97eae3f31ef8770c7e4b386 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 19 Oct 2018 15:51:02 +0200 Subject: tools, perf: add and use optimized ring_buffer_{read_head, write_tail} helpers Currently, on x86-64, perf uses LFENCE and MFENCE (rmb() and mb(), respectively) when processing events from the perf ring buffer which is unnecessarily expensive as we can do more lightweight in particular given this is critical fast-path in perf. According to Peter rmb()/mb() were added back then via a94d342b9cb0 ("tools/perf: Add required memory barriers") at a time where kernel still supported chips that needed it, but nowadays support for these has been ditched completely, therefore we can fix them up as well. While for x86-64, replacing rmb() and mb() with smp_*() variants would result in just a compiler barrier for the former and LOCK + ADD for the latter (__sync_synchronize() uses slower MFENCE by the way), Peter suggested we can use smp_{load_acquire,store_release}() instead for architectures where its implementation doesn't resolve in slower smp_mb(). Thus, e.g. in x86-64 we would be able to avoid CPU barrier entirely due to TSO. For architectures where the latter needs to use smp_mb() e.g. on arm, we stick to cheaper smp_rmb() variant for fetching the head. This work adds helpers ring_buffer_read_head() and ring_buffer_write_tail() for tools infrastructure that either switches to smp_load_acquire() for architectures where it is cheaper or uses READ_ONCE() + smp_rmb() barrier for those where it's not in order to fetch the data_head from the perf control page, and it uses smp_store_release() to write the data_tail. Latter is smp_mb() + WRITE_ONCE() combination or a cheaper variant if architecture allows for it. Those that rely on smp_rmb() and smp_mb() can further improve performance in a follow up step by implementing the two under tools/arch/*/include/asm/barrier.h such that they don't have to fallback to rmb() and mb() in tools/include/asm/barrier.h. Switch perf to use ring_buffer_read_head() and ring_buffer_write_tail() so it can make use of the optimizations. Later, we convert libbpf as well to use the same helpers. Side note [0]: the topic has been raised of whether one could simply use the C11 gcc builtins [1] for the smp_load_acquire() and smp_store_release() instead: __atomic_load_n(ptr, __ATOMIC_ACQUIRE); __atomic_store_n(ptr, val, __ATOMIC_RELEASE); Kernel and (presumably) tooling shipped along with the kernel has a minimum requirement of being able to build with gcc-4.6 and the latter does not have C11 builtins. While generally the C11 memory models don't align with the kernel's, the C11 load-acquire and store-release alone /could/ suffice, however. Issue is that this is implementation dependent on how the load-acquire and store-release is done by the compiler and the mapping of supported compilers must align to be compatible with the kernel's implementation, and thus needs to be verified/tracked on a case by case basis whether they match (unless an architecture uses them also from kernel side). The implementations for smp_load_acquire() and smp_store_release() in this patch have been adapted from the kernel side ones to have a concrete and compatible mapping in place. [0] http://patchwork.ozlabs.org/patch/985422/ [1] https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Cc: "Paul E. McKenney" Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexei Starovoitov --- tools/include/asm/barrier.h | 35 +++++++++++++++++++ tools/include/linux/ring_buffer.h | 73 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 tools/include/linux/ring_buffer.h (limited to 'tools/include') diff --git a/tools/include/asm/barrier.h b/tools/include/asm/barrier.h index 391d942536e5..8d378c57cb01 100644 --- a/tools/include/asm/barrier.h +++ b/tools/include/asm/barrier.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include #if defined(__i386__) || defined(__x86_64__) #include "../../arch/x86/include/asm/barrier.h" #elif defined(__arm__) @@ -26,3 +27,37 @@ #else #include #endif + +/* + * Generic fallback smp_*() definitions for archs that haven't + * been updated yet. + */ + +#ifndef smp_rmb +# define smp_rmb() rmb() +#endif + +#ifndef smp_wmb +# define smp_wmb() wmb() +#endif + +#ifndef smp_mb +# define smp_mb() mb() +#endif + +#ifndef smp_store_release +# define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) +#endif + +#ifndef smp_load_acquire +# define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p1 = READ_ONCE(*p); \ + smp_mb(); \ + ___p1; \ +}) +#endif diff --git a/tools/include/linux/ring_buffer.h b/tools/include/linux/ring_buffer.h new file mode 100644 index 000000000000..9a083ae60473 --- /dev/null +++ b/tools/include/linux/ring_buffer.h @@ -0,0 +1,73 @@ +#ifndef _TOOLS_LINUX_RING_BUFFER_H_ +#define _TOOLS_LINUX_RING_BUFFER_H_ + +#include + +/* + * Contract with kernel for walking the perf ring buffer from + * user space requires the following barrier pairing (quote + * from kernel/events/ring_buffer.c): + * + * Since the mmap() consumer (userspace) can run on a + * different CPU: + * + * kernel user + * + * if (LOAD ->data_tail) { LOAD ->data_head + * (A) smp_rmb() (C) + * STORE $data LOAD $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head STORE ->data_tail + * } + * + * Where A pairs with D, and B pairs with C. + * + * In our case A is a control dependency that separates the + * load of the ->data_tail and the stores of $data. In case + * ->data_tail indicates there is no room in the buffer to + * store $data we do not. + * + * D needs to be a full barrier since it separates the data + * READ from the tail WRITE. + * + * For B a WMB is sufficient since it separates two WRITEs, + * and for C an RMB is sufficient since it separates two READs. + * + * Note, instead of B, C, D we could also use smp_store_release() + * in B and D as well as smp_load_acquire() in C. + * + * However, this optimization does not make sense for all kernel + * supported architectures since for a fair number it would + * resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(), + * and smp_mb() + WRITE_ONCE() pair for smp_store_release(). + * + * Thus for those smp_wmb() in B and smp_rmb() in C would still + * be less expensive. For the case of D this has either the same + * cost or is less expensive, for example, due to TSO x86 can + * avoid the CPU barrier entirely. + */ + +static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base) +{ +/* + * Architectures where smp_load_acquire() does not fallback to + * READ_ONCE() + smp_mb() pair. + */ +#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \ + defined(__ia64__) || defined(__sparc__) && defined(__arch64__) + return smp_load_acquire(&base->data_head); +#else + u64 head = READ_ONCE(base->data_head); + + smp_rmb(); + return head; +#endif +} + +static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base, + u64 tail) +{ + smp_store_release(&base->data_tail, tail); +} + +#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */ -- cgit v1.2.3 From f908d26b2c41d9a924371099c4979e4b5d385165 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Oct 2018 19:56:50 -0700 Subject: bpf: libbpf support for msg_push_data Add support for new bpf_msg_push_data in libbpf. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- tools/include/uapi/linux/bpf.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a2fb333290dc..852dc17ab47a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2240,6 +2240,23 @@ union bpf_attr { * pointer that was returned from bpf_sk_lookup_xxx\ (). * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into msg at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the msg. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2331,7 +2348,8 @@ union bpf_attr { FN(sk_release), \ FN(map_push_elem), \ FN(map_pop_elem), \ - FN(map_peek_elem), + FN(map_peek_elem), \ + FN(msg_push_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3