summaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-05-02 16:40:27 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-05-02 16:40:27 -0700
commit8d65b08debc7e62b2c6032d7fe7389d895b92cbc (patch)
tree0c3141b60c3a03cc32742b5750c5e763b9dae489 /tools
parent5a0387a8a8efb90ae7fea1e2e5c62de3efa74691 (diff)
parent5d15af6778b8e4ed1fd41b040283af278e7a9a72 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Millar: "Here are some highlights from the 2065 networking commits that happened this development cycle: 1) XDP support for IXGBE (John Fastabend) and thunderx (Sunil Kowuri) 2) Add a generic XDP driver, so that anyone can test XDP even if they lack a networking device whose driver has explicit XDP support (me). 3) Sparc64 now has an eBPF JIT too (me) 4) Add a BPF program testing framework via BPF_PROG_TEST_RUN (Alexei Starovoitov) 5) Make netfitler network namespace teardown less expensive (Florian Westphal) 6) Add symmetric hashing support to nft_hash (Laura Garcia Liebana) 7) Implement NAPI and GRO in netvsc driver (Stephen Hemminger) 8) Support TC flower offload statistics in mlxsw (Arkadi Sharshevsky) 9) Multiqueue support in stmmac driver (Joao Pinto) 10) Remove TCP timewait recycling, it never really could possibly work well in the real world and timestamp randomization really zaps any hint of usability this feature had (Soheil Hassas Yeganeh) 11) Support level3 vs level4 ECMP route hashing in ipv4 (Nikolay Aleksandrov) 12) Add socket busy poll support to epoll (Sridhar Samudrala) 13) Netlink extended ACK support (Johannes Berg, Pablo Neira Ayuso, and several others) 14) IPSEC hw offload infrastructure (Steffen Klassert)" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2065 commits) tipc: refactor function tipc_sk_recv_stream() tipc: refactor function tipc_sk_recvmsg() net: thunderx: Optimize page recycling for XDP net: thunderx: Support for XDP header adjustment net: thunderx: Add support for XDP_TX net: thunderx: Add support for XDP_DROP net: thunderx: Add basic XDP support net: thunderx: Cleanup receive buffer allocation net: thunderx: Optimize CQE_TX handling net: thunderx: Optimize RBDR descriptor handling net: thunderx: Support for page recycling ipx: call ipxitf_put() in ioctl error path net: sched: add helpers to handle extended actions qed*: Fix issues in the ptp filter config implementation. qede: Fix concurrency issue in PTP Tx path processing. stmmac: Add support for SIMATIC IOT2000 platform net: hns: fix ethtool_get_strings overflow in hns driver tcp: fix wraparound issue in tcp_lp bpf, arm64: fix jit branch offset related to ldimm64 bpf, arm64: implement jiting of BPF_XADD ...
Diffstat (limited to 'tools')
-rw-r--r--tools/build/feature/test-bpf.c3
-rwxr-xr-xtools/hv/bondvf.sh18
-rw-r--r--tools/include/uapi/linux/bpf.h32
-rw-r--r--tools/lib/bpf/bpf.c43
-rw-r--r--tools/lib/bpf/bpf.h6
-rw-r--r--tools/lib/bpf/libbpf.c3
-rw-r--r--tools/lib/bpf/libbpf.h2
-rw-r--r--tools/net/bpf_jit_disasm.c40
-rw-r--r--tools/testing/selftests/bpf/Makefile17
-rw-r--r--tools/testing/selftests/bpf/bpf_endian.h23
-rw-r--r--tools/testing/selftests/bpf/bpf_util.h7
-rw-r--r--tools/testing/selftests/bpf/test_iptunnel_common.h37
-rw-r--r--tools/testing/selftests/bpf/test_l4lb.c473
-rw-r--r--tools/testing/selftests/bpf/test_lru_map.c104
-rw-r--r--tools/testing/selftests/bpf/test_maps.c66
-rw-r--r--tools/testing/selftests/bpf/test_pkt_access.c64
-rw-r--r--tools/testing/selftests/bpf/test_progs.c283
-rw-r--r--tools/testing/selftests/bpf/test_verifier.c287
-rw-r--r--tools/testing/selftests/bpf/test_xdp.c235
-rw-r--r--tools/testing/selftests/net/Makefile2
-rwxr-xr-xtools/testing/selftests/net/netdevice.sh200
-rw-r--r--tools/testing/selftests/net/psock_fanout.c93
22 files changed, 1947 insertions, 91 deletions
diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c
index e04ab89a1013..ebc6dceddb58 100644
--- a/tools/build/feature/test-bpf.c
+++ b/tools/build/feature/test-bpf.c
@@ -9,6 +9,9 @@
# define __NR_bpf 321
# elif defined(__aarch64__)
# define __NR_bpf 280
+# elif defined(__sparc__)
+# define __NR_bpf 349
+# else
# error __NR_bpf not defined. libbpf does not support your arch.
# endif
#endif
diff --git a/tools/hv/bondvf.sh b/tools/hv/bondvf.sh
index 4aa5369ffa4e..d85968cb1bf2 100755
--- a/tools/hv/bondvf.sh
+++ b/tools/hv/bondvf.sh
@@ -101,9 +101,25 @@ function create_bond_cfg_redhat {
echo BONDING_OPTS=\"mode=active-backup miimon=100 primary=$2\" >>$fn
}
+function del_eth_cfg_ubuntu {
+ local fn=$cfgdir/interfaces
+ local tmpfl=$(mktemp)
+
+ local nic_start='^[ \t]*(auto|iface|mapping|allow-.*)[ \t]+'$1
+ local nic_end='^[ \t]*(auto|iface|mapping|allow-.*|source)'
+
+ awk "/$nic_end/{x=0} x{next} /$nic_start/{x=1;next} 1" $fn >$tmpfl
+
+ cp $tmpfl $fn
+
+ rm $tmpfl
+}
+
function create_eth_cfg_ubuntu {
local fn=$cfgdir/interfaces
+ del_eth_cfg_ubuntu $1
+
echo $'\n'auto $1 >>$fn
echo iface $1 inet manual >>$fn
echo bond-master $2 >>$fn
@@ -119,6 +135,8 @@ function create_eth_cfg_pri_ubuntu {
function create_bond_cfg_ubuntu {
local fn=$cfgdir/interfaces
+ del_eth_cfg_ubuntu $1
+
echo $'\n'auto $1 >>$fn
echo iface $1 inet dhcp >>$fn
echo bond-mode active-backup >>$fn
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0539a0ceef38..e553529929f6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_cmd {
BPF_OBJ_GET,
BPF_PROG_ATTACH,
BPF_PROG_DETACH,
+ BPF_PROG_TEST_RUN,
};
enum bpf_map_type {
@@ -96,6 +97,8 @@ enum bpf_map_type {
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
BPF_MAP_TYPE_LPM_TRIE,
+ BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ BPF_MAP_TYPE_HASH_OF_MAPS,
};
enum bpf_prog_type {
@@ -152,6 +155,7 @@ union bpf_attr {
__u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */
__u32 map_flags; /* prealloc or not */
+ __u32 inner_map_fd; /* fd pointing to the inner map */
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -186,6 +190,17 @@ union bpf_attr {
__u32 attach_type;
__u32 attach_flags;
};
+
+ struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
+ __u32 prog_fd;
+ __u32 retval;
+ __u32 data_size_in;
+ __u32 data_size_out;
+ __aligned_u64 data_in;
+ __aligned_u64 data_out;
+ __u32 repeat;
+ __u32 duration;
+ } test;
} __attribute__((aligned(8)));
/* BPF helper function descriptions:
@@ -456,6 +471,18 @@ union bpf_attr {
* Return:
* > 0 length of the string including the trailing NUL on success
* < 0 error
+ *
+ * u64 bpf_get_socket_cookie(skb)
+ * Get the cookie for the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: 8 Bytes non-decreasing number on success or 0 if the socket
+ * field is missing inside sk_buff
+ *
+ * u32 bpf_get_socket_uid(skb)
+ * Get the owner uid of the socket stored inside sk_buff.
+ * @skb: pointer to skb
+ * Return: uid of the socket owner on success or 0 if the socket pointer
+ * inside sk_buff is NULL
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -503,7 +530,9 @@ union bpf_attr {
FN(get_numa_node_id), \
FN(skb_change_head), \
FN(xdp_adjust_head), \
- FN(probe_read_str),
+ FN(probe_read_str), \
+ FN(get_socket_cookie), \
+ FN(get_socket_uid),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -574,6 +603,7 @@ struct __sk_buff {
__u32 tc_classid;
__u32 data;
__u32 data_end;
+ __u32 napi_id;
};
struct bpf_tunnel_key {
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 207c2eeddab0..4fe444b8092e 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -37,6 +37,8 @@
# define __NR_bpf 321
# elif defined(__aarch64__)
# define __NR_bpf 280
+# elif defined(__sparc__)
+# define __NR_bpf 349
# else
# error __NR_bpf not defined. libbpf does not support your arch.
# endif
@@ -69,6 +71,23 @@ int bpf_create_map(enum bpf_map_type map_type, int key_size,
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
+int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size,
+ int inner_map_fd, int max_entries, __u32 map_flags)
+{
+ union bpf_attr attr;
+
+ memset(&attr, '\0', sizeof(attr));
+
+ attr.map_type = map_type;
+ attr.key_size = key_size;
+ attr.value_size = 4;
+ attr.inner_map_fd = inner_map_fd;
+ attr.max_entries = max_entries;
+ attr.map_flags = map_flags;
+
+ return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+
int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
size_t insns_cnt, const char *license,
__u32 kern_version, char *log_buf, size_t log_buf_sz)
@@ -192,3 +211,27 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
return sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr));
}
+
+int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
+ void *data_out, __u32 *size_out, __u32 *retval,
+ __u32 *duration)
+{
+ union bpf_attr attr;
+ int ret;
+
+ bzero(&attr, sizeof(attr));
+ attr.test.prog_fd = prog_fd;
+ attr.test.data_in = ptr_to_u64(data);
+ attr.test.data_out = ptr_to_u64(data_out);
+ attr.test.data_size_in = size;
+ attr.test.repeat = repeat;
+
+ ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr));
+ if (size_out)
+ *size_out = attr.test.data_size_out;
+ if (retval)
+ *retval = attr.test.retval;
+ if (duration)
+ *duration = attr.test.duration;
+ return ret;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 09c3dcac0496..edb4daeff7a5 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -26,6 +26,8 @@
int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
int max_entries, __u32 map_flags);
+int bpf_create_map_in_map(enum bpf_map_type map_type, int key_size,
+ int inner_map_fd, int max_entries, __u32 map_flags);
/* Recommend log buffer size */
#define BPF_LOG_BUF_SIZE 65536
@@ -45,6 +47,8 @@ int bpf_obj_get(const char *pathname);
int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type,
unsigned int flags);
int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type);
-
+int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
+ void *data_out, __u32 *size_out, __u32 *retval,
+ __u32 *duration);
#endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index ac6eb863b2a4..1a2c07eb7795 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1618,8 +1618,7 @@ int bpf_program__nth_fd(struct bpf_program *prog, int n)
return fd;
}
-static void bpf_program__set_type(struct bpf_program *prog,
- enum bpf_prog_type type)
+void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type)
{
prog->type = type;
}
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index b30394f9947a..32c7252f734e 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -25,6 +25,7 @@
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h> // for size_t
+#include <linux/bpf.h>
enum libbpf_errno {
__LIBBPF_ERRNO__START = 4000,
@@ -185,6 +186,7 @@ int bpf_program__set_sched_cls(struct bpf_program *prog);
int bpf_program__set_sched_act(struct bpf_program *prog);
int bpf_program__set_xdp(struct bpf_program *prog);
int bpf_program__set_perf_event(struct bpf_program *prog);
+void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
bool bpf_program__is_socket_filter(struct bpf_program *prog);
bool bpf_program__is_tracepoint(struct bpf_program *prog);
diff --git a/tools/net/bpf_jit_disasm.c b/tools/net/bpf_jit_disasm.c
index 544b05a53b70..ad572e6cdbd0 100644
--- a/tools/net/bpf_jit_disasm.c
+++ b/tools/net/bpf_jit_disasm.c
@@ -229,6 +229,7 @@ static void usage(void)
{
printf("Usage: bpf_jit_disasm [...]\n");
printf(" -o Also display related opcodes (default: off).\n");
+ printf(" -O <file> Write binary image of code to file, don't disassemble to stdout.\n");
printf(" -f <file> Read last image dump from file or stdin (default: klog).\n");
printf(" -h Display this help.\n");
}
@@ -238,12 +239,19 @@ int main(int argc, char **argv)
unsigned int len, klen, opt, opcodes = 0;
static uint8_t image[32768];
char *kbuff, *file = NULL;
+ char *ofile = NULL;
+ int ofd;
+ ssize_t nr;
+ uint8_t *pos;
- while ((opt = getopt(argc, argv, "of:")) != -1) {
+ while ((opt = getopt(argc, argv, "of:O:")) != -1) {
switch (opt) {
case 'o':
opcodes = 1;
break;
+ case 'O':
+ ofile = optarg;
+ break;
case 'f':
file = optarg;
break;
@@ -263,11 +271,35 @@ int main(int argc, char **argv)
}
len = get_last_jit_image(kbuff, klen, image, sizeof(image));
- if (len > 0)
- get_asm_insns(image, len, opcodes);
- else
+ if (len <= 0) {
fprintf(stderr, "No JIT image found!\n");
+ goto done;
+ }
+ if (!ofile) {
+ get_asm_insns(image, len, opcodes);
+ goto done;
+ }
+
+ ofd = open(ofile, O_WRONLY | O_CREAT | O_TRUNC, DEFFILEMODE);
+ if (ofd < 0) {
+ fprintf(stderr, "Could not open file %s for writing: ", ofile);
+ perror(NULL);
+ goto done;
+ }
+ pos = image;
+ do {
+ nr = write(ofd, pos, len);
+ if (nr < 0) {
+ fprintf(stderr, "Could not write data to %s: ", ofile);
+ perror(NULL);
+ goto done;
+ }
+ len -= nr;
+ pos += nr;
+ } while (len);
+ close(ofd);
+done:
put_log_buff(kbuff);
return 0;
}
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 9af09e8099c0..d8d94b9bd76c 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -8,16 +8,18 @@ ifneq ($(wildcard $(GENHDR)),)
GENFLAGS := -DHAVE_GENHDR
endif
-CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS)
-LDLIBS += -lcap
+CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../include
+LDLIBS += -lcap -lelf
-TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map
+TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs
+
+TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o
TEST_PROGS := test_kmod.sh
include ../lib.mk
-BPFOBJ := $(OUTPUT)/bpf.o
+BPFOBJ := $(OUTPUT)/libbpf.a
$(TEST_GEN_PROGS): $(BPFOBJ)
@@ -28,3 +30,10 @@ force:
$(BPFOBJ): force
$(MAKE) -C $(BPFDIR) OUTPUT=$(OUTPUT)/
+
+CLANG ?= clang
+
+%.o: %.c
+ $(CLANG) -I../../../include/uapi -I../../../../samples/bpf/ \
+ -D__x86_64__ -Wno-compare-distinct-pointer-types \
+ -O2 -target bpf -c $< -o $@
diff --git a/tools/testing/selftests/bpf/bpf_endian.h b/tools/testing/selftests/bpf/bpf_endian.h
new file mode 100644
index 000000000000..19d0604f8694
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_endian.h
@@ -0,0 +1,23 @@
+#ifndef __BPF_ENDIAN__
+#define __BPF_ENDIAN__
+
+#include <asm/byteorder.h>
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+# define __bpf_ntohs(x) __builtin_bswap16(x)
+# define __bpf_htons(x) __builtin_bswap16(x)
+#elif __BYTE_ORDER == __BIG_ENDIAN
+# define __bpf_ntohs(x) (x)
+# define __bpf_htons(x) (x)
+#else
+# error "Fix your __BYTE_ORDER?!"
+#endif
+
+#define bpf_htons(x) \
+ (__builtin_constant_p(x) ? \
+ __constant_htons(x) : __bpf_htons(x))
+#define bpf_ntohs(x) \
+ (__builtin_constant_p(x) ? \
+ __constant_ntohs(x) : __bpf_ntohs(x))
+
+#endif
diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h
index 84a5d1823f02..20ecbaa0d85d 100644
--- a/tools/testing/selftests/bpf/bpf_util.h
+++ b/tools/testing/selftests/bpf/bpf_util.h
@@ -35,4 +35,11 @@ static inline unsigned int bpf_num_possible_cpus(void)
return possible_cpus;
}
+#define __bpf_percpu_val_align __attribute__((__aligned__(8)))
+
+#define BPF_DECLARE_PERCPU(type, name) \
+ struct { type v; /* padding */ } __bpf_percpu_val_align \
+ name[bpf_num_possible_cpus()]
+#define bpf_percpu(name, cpu) name[(cpu)].v
+
#endif /* __BPF_UTIL__ */
diff --git a/tools/testing/selftests/bpf/test_iptunnel_common.h b/tools/testing/selftests/bpf/test_iptunnel_common.h
new file mode 100644
index 000000000000..e4cd252a1b20
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_iptunnel_common.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _TEST_IPTNL_COMMON_H
+#define _TEST_IPTNL_COMMON_H
+
+#include <linux/types.h>
+
+#define MAX_IPTNL_ENTRIES 256U
+
+struct vip {
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } daddr;
+ __u16 dport;
+ __u16 family;
+ __u8 protocol;
+};
+
+struct iptnl_info {
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } saddr;
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } daddr;
+ __u16 family;
+ __u8 dmac[6];
+};
+
+#endif
diff --git a/tools/testing/selftests/bpf/test_l4lb.c b/tools/testing/selftests/bpf/test_l4lb.c
new file mode 100644
index 000000000000..1e10c9590991
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_l4lb.c
@@ -0,0 +1,473 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include "bpf_helpers.h"
+#include "test_iptunnel_common.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+
+static inline __u32 rol32(__u32 word, unsigned int shift)
+{
+ return (word << shift) | (word >> ((-shift) & 31));
+}
+
+/* copy paste of jhash from kernel sources to make sure llvm
+ * can compile it into valid sequence of bpf instructions
+ */
+#define __jhash_mix(a, b, c) \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+
+#define __jhash_final(a, b, c) \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+
+#define JHASH_INITVAL 0xdeadbeef
+
+typedef unsigned int u32;
+
+static inline u32 jhash(const void *key, u32 length, u32 initval)
+{
+ u32 a, b, c;
+ const unsigned char *k = key;
+
+ a = b = c = JHASH_INITVAL + length + initval;
+
+ while (length > 12) {
+ a += *(u32 *)(k);
+ b += *(u32 *)(k + 4);
+ c += *(u32 *)(k + 8);
+ __jhash_mix(a, b, c);
+ length -= 12;
+ k += 12;
+ }
+ switch (length) {
+ case 12: c += (u32)k[11]<<24;
+ case 11: c += (u32)k[10]<<16;
+ case 10: c += (u32)k[9]<<8;
+ case 9: c += k[8];
+ case 8: b += (u32)k[7]<<24;
+ case 7: b += (u32)k[6]<<16;
+ case 6: b += (u32)k[5]<<8;
+ case 5: b += k[4];
+ case 4: a += (u32)k[3]<<24;
+ case 3: a += (u32)k[2]<<16;
+ case 2: a += (u32)k[1]<<8;
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break;
+ }
+
+ return c;
+}
+
+static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{
+ a += initval;
+ b += initval;
+ c += initval;
+ __jhash_final(a, b, c);
+ return c;
+}
+
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+ return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
+}
+
+#define PCKT_FRAGMENTED 65343
+#define IPV4_HDR_LEN_NO_OPT 20
+#define IPV4_PLUS_ICMP_HDR 28
+#define IPV6_PLUS_ICMP_HDR 48
+#define RING_SIZE 2
+#define MAX_VIPS 12
+#define MAX_REALS 5
+#define CTL_MAP_SIZE 16
+#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
+#define F_IPV6 (1 << 0)
+#define F_HASH_NO_SRC_PORT (1 << 0)
+#define F_ICMP (1 << 0)
+#define F_SYN_SET (1 << 1)
+
+struct packet_description {
+ union {
+ __be32 src;
+ __be32 srcv6[4];
+ };
+ union {
+ __be32 dst;
+ __be32 dstv6[4];
+ };
+ union {
+ __u32 ports;
+ __u16 port16[2];
+ };
+ __u8 proto;
+ __u8 flags;
+};
+
+struct ctl_value {
+ union {
+ __u64 value;
+ __u32 ifindex;
+ __u8 mac[6];
+ };
+};
+
+struct vip_meta {
+ __u32 flags;
+ __u32 vip_num;
+};
+
+struct real_definition {
+ union {
+ __be32 dst;
+ __be32 dstv6[4];
+ };
+ __u8 flags;
+};
+
+struct vip_stats {
+ __u64 bytes;
+ __u64 pkts;
+};
+
+struct eth_hdr {
+ unsigned char eth_dest[ETH_ALEN];
+ unsigned char eth_source[ETH_ALEN];
+ unsigned short eth_proto;
+};
+
+struct bpf_map_def SEC("maps") vip_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(struct vip),
+ .value_size = sizeof(struct vip_meta),
+ .max_entries = MAX_VIPS,
+};
+
+struct bpf_map_def SEC("maps") ch_rings = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u32),
+ .max_entries = CH_RINGS_SIZE,
+};
+
+struct bpf_map_def SEC("maps") reals = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct real_definition),
+ .max_entries = MAX_REALS,
+};
+
+struct bpf_map_def SEC("maps") stats = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct vip_stats),
+ .max_entries = MAX_VIPS,
+};
+
+struct bpf_map_def SEC("maps") ctl_array = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct ctl_value),
+ .max_entries = CTL_MAP_SIZE,
+};
+
+static __always_inline __u32 get_packet_hash(struct packet_description *pckt,
+ bool ipv6)
+{
+ if (ipv6)
+ return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS),
+ pckt->ports, CH_RINGS_SIZE);
+ else
+ return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE);
+}
+
+static __always_inline bool get_packet_dst(struct real_definition **real,
+ struct packet_description *pckt,
+ struct vip_meta *vip_info,
+ bool is_ipv6)
+{
+ __u32 hash = get_packet_hash(pckt, is_ipv6) % RING_SIZE;
+ __u32 key = RING_SIZE * vip_info->vip_num + hash;
+ __u32 *real_pos;
+
+ real_pos = bpf_map_lookup_elem(&ch_rings, &key);
+ if (!real_pos)
+ return false;
+ key = *real_pos;
+ *real = bpf_map_lookup_elem(&reals, &key);
+ if (!(*real))
+ return false;
+ return true;
+}
+
+static __always_inline int parse_icmpv6(void *data, void *data_end, __u64 off,
+ struct packet_description *pckt)
+{
+ struct icmp6hdr *icmp_hdr;
+ struct ipv6hdr *ip6h;
+
+ icmp_hdr = data + off;
+ if (icmp_hdr + 1 > data_end)
+ return TC_ACT_SHOT;
+ if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
+ return TC_ACT_OK;
+ off += sizeof(struct icmp6hdr);
+ ip6h = data + off;
+ if (ip6h + 1 > data_end)
+ return TC_ACT_SHOT;
+ pckt->proto = ip6h->nexthdr;
+ pckt->flags |= F_ICMP;
+ memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
+ memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16);
+ return TC_ACT_UNSPEC;
+}
+
+static __always_inline int parse_icmp(void *data, void *data_end, __u64 off,
+ struct packet_description *pckt)
+{
+ struct icmphdr *icmp_hdr;
+ struct iphdr *iph;
+
+ icmp_hdr = data + off;
+ if (icmp_hdr + 1 > data_end)
+ return TC_ACT_SHOT;
+ if (icmp_hdr->type != ICMP_DEST_UNREACH ||
+ icmp_hdr->code != ICMP_FRAG_NEEDED)
+ return TC_ACT_OK;
+ off += sizeof(struct icmphdr);
+ iph = data + off;
+ if (iph + 1 > data_end)
+ return TC_ACT_SHOT;
+ if (iph->ihl != 5)
+ return TC_ACT_SHOT;
+ pckt->proto = iph->protocol;
+ pckt->flags |= F_ICMP;
+ pckt->src = iph->daddr;
+ pckt->dst = iph->saddr;
+ return TC_ACT_UNSPEC;
+}
+
+static __always_inline bool parse_udp(void *data, __u64 off, void *data_end,
+ struct packet_description *pckt)
+{
+ struct udphdr *udp;
+ udp = data + off;
+
+ if (udp + 1 > data_end)
+ return false;
+
+ if (!(pckt->flags & F_ICMP)) {
+ pckt->port16[0] = udp->source;
+ pckt->port16[1] = udp->dest;
+ } else {
+ pckt->port16[0] = udp->dest;
+ pckt->port16[1] = udp->source;
+ }
+ return true;
+}
+
+static __always_inline bool parse_tcp(void *data, __u64 off, void *data_end,
+ struct packet_description *pckt)
+{
+ struct tcphdr *tcp;
+
+ tcp = data + off;
+ if (tcp + 1 > data_end)
+ return false;
+
+ if (tcp->syn)
+ pckt->flags |= F_SYN_SET;
+
+ if (!(pckt->flags & F_ICMP)) {
+ pckt->port16[0] = tcp->source;
+ pckt->port16[1] = tcp->dest;
+ } else {
+ pckt->port16[0] = tcp->dest;
+ pckt->port16[1] = tcp->source;
+ }
+ return true;
+}
+
+static __always_inline int process_packet(void *data, __u64 off, void *data_end,
+ bool is_ipv6, struct __sk_buff *skb)
+{
+ void *pkt_start = (void *)(long)skb->data;
+ struct packet_description pckt = {};
+ struct eth_hdr *eth = pkt_start;
+ struct bpf_tunnel_key tkey = {};
+ struct vip_stats *data_stats;
+ struct real_definition *dst;
+ struct vip_meta *vip_info;
+ struct ctl_value *cval;
+ __u32 v4_intf_pos = 1;
+ __u32 v6_intf_pos = 2;
+ struct ipv6hdr *ip6h;
+ struct vip vip = {};
+ struct iphdr *iph;
+ int tun_flag = 0;
+ __u16 pkt_bytes;
+ __u64 iph_len;
+ __u32 ifindex;
+ __u8 protocol;
+ __u32 vip_num;
+ int action;
+
+ tkey.tunnel_ttl = 64;
+ if (is_ipv6) {
+ ip6h = data + off;
+ if (ip6h + 1 > data_end)
+ return TC_ACT_SHOT;
+
+ iph_len = sizeof(struct ipv6hdr);
+ protocol = ip6h->nexthdr;
+ pckt.proto = protocol;
+ pkt_bytes = bpf_ntohs(ip6h->payload_len);
+ off += iph_len;
+ if (protocol == IPPROTO_FRAGMENT) {
+ return TC_ACT_SHOT;
+ } else if (protocol == IPPROTO_ICMPV6) {
+ action = parse_icmpv6(data, data_end, off, &pckt);
+ if (action >= 0)
+ return action;
+ off += IPV6_PLUS_ICMP_HDR;
+ } else {
+ memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16);
+ memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
+ }
+ } else {
+ iph = data + off;
+ if (iph + 1 > data_end)
+ return TC_ACT_SHOT;
+ if (iph->ihl != 5)
+ return TC_ACT_SHOT;
+
+ protocol = iph->protocol;
+ pckt.proto = protocol;
+ pkt_bytes = bpf_ntohs(iph->tot_len);
+ off += IPV4_HDR_LEN_NO_OPT;
+
+ if (iph->frag_off & PCKT_FRAGMENTED)
+ return TC_ACT_SHOT;
+ if (protocol == IPPROTO_ICMP) {
+ action = parse_icmp(data, data_end, off, &pckt);
+ if (action >= 0)
+ return action;
+ off += IPV4_PLUS_ICMP_HDR;
+ } else {
+ pckt.src = iph->saddr;
+ pckt.dst = iph->daddr;
+ }
+ }
+ protocol = pckt.proto;
+
+ if (protocol == IPPROTO_TCP) {
+ if (!parse_tcp(data, off, data_end, &pckt))
+ return TC_ACT_SHOT;
+ } else if (protocol == IPPROTO_UDP) {
+ if (!parse_udp(data, off, data_end, &pckt))
+ return TC_ACT_SHOT;
+ } else {
+ return TC_ACT_SHOT;
+ }
+
+ if (is_ipv6)
+ memcpy(vip.daddr.v6, pckt.dstv6, 16);
+ else
+ vip.daddr.v4 = pckt.dst;
+
+ vip.dport = pckt.port16[1];
+ vip.protocol = pckt.proto;
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info) {
+ vip.dport = 0;
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info)
+ return TC_ACT_SHOT;
+ pckt.port16[1] = 0;
+ }
+
+ if (vip_info->flags & F_HASH_NO_SRC_PORT)
+ pckt.port16[0] = 0;
+
+ if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6))
+ return TC_ACT_SHOT;
+
+ if (dst->flags & F_IPV6) {
+ cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos);
+ if (!cval)
+ return TC_ACT_SHOT;
+ ifindex = cval->ifindex;
+ memcpy(tkey.remote_ipv6, dst->dstv6, 16);
+ tun_flag = BPF_F_TUNINFO_IPV6;
+ } else {
+ cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos);
+ if (!cval)
+ return TC_ACT_SHOT;
+ ifindex = cval->ifindex;
+ tkey.remote_ipv4 = dst->dst;
+ }
+ vip_num = vip_info->vip_num;
+ data_stats = bpf_map_lookup_elem(&stats, &vip_num);
+ if (!data_stats)
+ return TC_ACT_SHOT;
+ data_stats->pkts++;
+ data_stats->bytes += pkt_bytes;
+ bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag);
+ *(u32 *)eth->eth_dest = tkey.remote_ipv4;
+ return bpf_redirect(ifindex, 0);
+}
+
+SEC("l4lb-demo")
+int balancer_ingress(struct __sk_buff *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct eth_hdr *eth = data;
+ __u32 eth_proto;
+ __u32 nh_off;
+
+ nh_off = sizeof(struct eth_hdr);
+ if (data + nh_off > data_end)
+ return TC_ACT_SHOT;
+ eth_proto = eth->eth_proto;
+ if (eth_proto == bpf_htons(ETH_P_IP))
+ return process_packet(data, nh_off, data_end, false, ctx);
+ else if (eth_proto == bpf_htons(ETH_P_IPV6))
+ return process_packet(data, nh_off, data_end, true, ctx);
+ else
+ return TC_