summaryrefslogtreecommitdiffstats
path: root/samples
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-07-11 10:55:49 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-11 10:55:49 -0700
commit237f83dfbe668443b5e31c3c7576125871cca674 (patch)
tree11848a8d0aa414a1d3ce2024e181071b1d9dea08 /samples
parent8f6ccf6159aed1f04c6d179f61f6fb2691261e84 (diff)
parent1ff2f0fa450ea4e4f87793d9ed513098ec6e12be (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: "Some highlights from this development cycle: 1) Big refactoring of ipv6 route and neigh handling to support nexthop objects configurable as units from userspace. From David Ahern. 2) Convert explored_states in BPF verifier into a hash table, significantly decreased state held for programs with bpf2bpf calls, from Alexei Starovoitov. 3) Implement bpf_send_signal() helper, from Yonghong Song. 4) Various classifier enhancements to mvpp2 driver, from Maxime Chevallier. 5) Add aRFS support to hns3 driver, from Jian Shen. 6) Fix use after free in inet frags by allocating fqdirs dynamically and reworking how rhashtable dismantle occurs, from Eric Dumazet. 7) Add act_ctinfo packet classifier action, from Kevin Darbyshire-Bryant. 8) Add TFO key backup infrastructure, from Jason Baron. 9) Remove several old and unused ISDN drivers, from Arnd Bergmann. 10) Add devlink notifications for flash update status to mlxsw driver, from Jiri Pirko. 11) Lots of kTLS offload infrastructure fixes, from Jakub Kicinski. 12) Add support for mv88e6250 DSA chips, from Rasmus Villemoes. 13) Various enhancements to ipv6 flow label handling, from Eric Dumazet and Willem de Bruijn. 14) Support TLS offload in nfp driver, from Jakub Kicinski, Dirk van der Merwe, and others. 15) Various improvements to axienet driver including converting it to phylink, from Robert Hancock. 16) Add PTP support to sja1105 DSA driver, from Vladimir Oltean. 17) Add mqprio qdisc offload support to dpaa2-eth, from Ioana Radulescu. 18) Add devlink health reporting to mlx5, from Moshe Shemesh. 19) Convert stmmac over to phylink, from Jose Abreu. 20) Add PTP PHC (Physical Hardware Clock) support to mlxsw, from Shalom Toledo. 21) Add nftables SYNPROXY support, from Fernando Fernandez Mancera. 22) Convert tcp_fastopen over to use SipHash, from Ard Biesheuvel. 23) Track spill/fill of constants in BPF verifier, from Alexei Starovoitov. 24) Support bounded loops in BPF, from Alexei Starovoitov. 25) Various page_pool API fixes and improvements, from Jesper Dangaard Brouer. 26) Just like ipv4, support ref-countless ipv6 route handling. From Wei Wang. 27) Support VLAN offloading in aquantia driver, from Igor Russkikh. 28) Add AF_XDP zero-copy support to mlx5, from Maxim Mikityanskiy. 29) Add flower GRE encap/decap support to nfp driver, from Pieter Jansen van Vuuren. 30) Protect against stack overflow when using act_mirred, from John Hurley. 31) Allow devmap map lookups from eBPF, from Toke Høiland-Jørgensen. 32) Use page_pool API in netsec driver, Ilias Apalodimas. 33) Add Google gve network driver, from Catherine Sullivan. 34) More indirect call avoidance, from Paolo Abeni. 35) Add kTLS TX HW offload support to mlx5, from Tariq Toukan. 36) Add XDP_REDIRECT support to bnxt_en, from Andy Gospodarek. 37) Add MPLS manipulation actions to TC, from John Hurley. 38) Add sending a packet to connection tracking from TC actions, and then allow flower classifier matching on conntrack state. From Paul Blakey. 39) Netfilter hw offload support, from Pablo Neira Ayuso" * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (2080 commits) net/mlx5e: Return in default case statement in tx_post_resync_params mlx5: Return -EINVAL when WARN_ON_ONCE triggers in mlx5e_tls_resync(). net: dsa: add support for BRIDGE_MROUTER attribute pkt_sched: Include const.h net: netsec: remove static declaration for netsec_set_tx_de() net: netsec: remove superfluous if statement netfilter: nf_tables: add hardware offload support net: flow_offload: rename tc_cls_flower_offload to flow_cls_offload net: flow_offload: add flow_block_cb_is_busy() and use it net: sched: remove tcf block API drivers: net: use flow block API net: sched: use flow block API net: flow_offload: add flow_block_cb_{priv, incref, decref}() net: flow_offload: add list handling functions net: flow_offload: add flow_block_cb_alloc() and flow_block_cb_free() net: flow_offload: rename TCF_BLOCK_BINDER_TYPE_* to FLOW_BLOCK_BINDER_TYPE_* net: flow_offload: rename TC_BLOCK_{UN}BIND to FLOW_BLOCK_{UN}BIND net: flow_offload: add flow_block_cb_setup_simple() net: hisilicon: Add an tx_desc to adapt HI13X1_GMAC net: hisilicon: Add an rx_desc to adapt HI13X1_GMAC ...
Diffstat (limited to 'samples')
-rw-r--r--samples/bpf/.gitignore1
-rw-r--r--samples/bpf/Makefile28
-rw-r--r--samples/bpf/bpf_load.c8
-rwxr-xr-xsamples/bpf/do_hbm_test.sh30
-rw-r--r--samples/bpf/fds_example.c2
-rw-r--r--samples/bpf/hbm.c67
-rw-r--r--samples/bpf/hbm.h9
-rw-r--r--samples/bpf/hbm_edt_kern.c168
-rw-r--r--samples/bpf/hbm_kern.h117
-rw-r--r--samples/bpf/hbm_out_kern.c48
-rw-r--r--samples/bpf/ibumad_kern.c18
-rw-r--r--samples/bpf/ibumad_user.c2
-rw-r--r--samples/bpf/sockex1_user.c2
-rw-r--r--samples/bpf/sockex2_user.c2
-rw-r--r--samples/bpf/tcp_basertt_kern.c7
-rw-r--r--samples/bpf/tcp_bpf.readme2
-rw-r--r--samples/bpf/tcp_bufs_kern.c7
-rw-r--r--samples/bpf/tcp_clamp_kern.c7
-rw-r--r--samples/bpf/tcp_cong_kern.c7
-rw-r--r--samples/bpf/tcp_dumpstats_kern.c68
-rw-r--r--samples/bpf/tcp_iw_kern.c7
-rw-r--r--samples/bpf/tcp_rwnd_kern.c7
-rw-r--r--samples/bpf/tcp_synrto_kern.c7
-rw-r--r--samples/bpf/tcp_tos_reflect_kern.c7
-rw-r--r--samples/bpf/test_cgrp2_attach2.c459
-rw-r--r--samples/bpf/xdp1_user.c4
-rw-r--r--samples/bpf/xdp_adjust_tail_user.c16
-rw-r--r--samples/bpf/xdp_fwd_user.c2
-rw-r--r--samples/bpf/xdp_redirect_cpu_user.c2
-rw-r--r--samples/bpf/xdp_redirect_map_user.c17
-rw-r--r--samples/bpf/xdp_redirect_user.c19
-rw-r--r--samples/bpf/xdp_router_ipv4_user.c2
-rw-r--r--samples/bpf/xdp_rxq_info_user.c4
-rw-r--r--samples/bpf/xdp_sample_pkts_kern.c7
-rw-r--r--samples/bpf/xdp_tx_iptunnel_user.c14
-rw-r--r--samples/bpf/xdpsock_user.c48
-rw-r--r--samples/pktgen/README.rst1
-rw-r--r--samples/pktgen/functions.sh34
-rw-r--r--samples/pktgen/parameters.sh7
-rwxr-xr-xsamples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh11
-rwxr-xr-xsamples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh11
-rwxr-xr-xsamples/pktgen/pktgen_sample01_simple.sh11
-rwxr-xr-xsamples/pktgen/pktgen_sample02_multiqueue.sh11
-rwxr-xr-xsamples/pktgen/pktgen_sample03_burst_single_flow.sh11
-rwxr-xr-xsamples/pktgen/pktgen_sample04_many_flows.sh11
-rwxr-xr-xsamples/pktgen/pktgen_sample05_flow_per_thread.sh12
-rwxr-xr-xsamples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh11
47 files changed, 705 insertions, 648 deletions
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore
index c7498457595a..74d31fd3c99c 100644
--- a/samples/bpf/.gitignore
+++ b/samples/bpf/.gitignore
@@ -1,6 +1,7 @@
cpustat
fds_example
hbm
+ibumad
lathist
lwt_len_hist
map_perf_test
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4f0a1cdbfe7c..f90daadfbc89 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -26,7 +26,6 @@ hostprogs-y += map_perf_test
hostprogs-y += test_overhead
hostprogs-y += test_cgrp2_array_pin
hostprogs-y += test_cgrp2_attach
-hostprogs-y += test_cgrp2_attach2
hostprogs-y += test_cgrp2_sock
hostprogs-y += test_cgrp2_sock2
hostprogs-y += xdp1
@@ -81,7 +80,6 @@ map_perf_test-objs := bpf_load.o map_perf_test_user.o
test_overhead-objs := bpf_load.o test_overhead_user.o
test_cgrp2_array_pin-objs := test_cgrp2_array_pin.o
test_cgrp2_attach-objs := test_cgrp2_attach.o
-test_cgrp2_attach2-objs := test_cgrp2_attach2.o $(CGROUP_HELPERS)
test_cgrp2_sock-objs := test_cgrp2_sock.o
test_cgrp2_sock2-objs := bpf_load.o test_cgrp2_sock2.o
xdp1-objs := xdp1_user.o
@@ -156,6 +154,7 @@ always += tcp_iw_kern.o
always += tcp_clamp_kern.o
always += tcp_basertt_kern.o
always += tcp_tos_reflect_kern.o
+always += tcp_dumpstats_kern.o
always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
always += xdp_redirect_cpu_kern.o
@@ -170,23 +169,15 @@ always += task_fd_query_kern.o
always += xdp_sample_pkts_kern.o
always += ibumad_kern.o
always += hbm_out_kern.o
+always += hbm_edt_kern.o
KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
-KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/
+KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ -I$(srctree)/tools/include
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/perf
HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
-HOSTCFLAGS_trace_helpers.o += -I$(srctree)/tools/lib/bpf/
-
-HOSTCFLAGS_trace_output_user.o += -I$(srctree)/tools/lib/bpf/
-HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
-HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
-HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
-HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
-HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
-HOSTCFLAGS_xdp_sample_pkts_user.o += -I$(srctree)/tools/lib/bpf/
KBUILD_HOSTLDLIBS += $(LIBBPF) -lelf
HOSTLDLIBS_tracex4 += -lrt
@@ -208,6 +199,17 @@ HOSTCC = $(CROSS_COMPILE)gcc
CLANG_ARCH_ARGS = -target $(ARCH)
endif
+# Don't evaluate probes and warnings if we need to run make recursively
+ifneq ($(src),)
+HDR_PROBE := $(shell echo "\#include <linux/types.h>\n struct list_head { int a; }; int main() { return 0; }" | \
+ $(HOSTCC) $(KBUILD_HOSTCFLAGS) -x c - -o /dev/null 2>/dev/null && \
+ echo okay)
+
+ifeq ($(HDR_PROBE),)
+$(warning WARNING: Detected possible issues with include path.)
+$(warning WARNING: Please install kernel headers locally (make headers_install).)
+endif
+
BTF_LLC_PROBE := $(shell $(LLC) -march=bpf -mattr=help 2>&1 | grep dwarfris)
BTF_PAHOLE_PROBE := $(shell $(BTF_PAHOLE) --help 2>&1 | grep BTF)
BTF_OBJCOPY_PROBE := $(shell $(LLVM_OBJCOPY) --help 2>&1 | grep -i 'usage.*llvm')
@@ -225,6 +227,7 @@ ifneq ($(and $(BTF_LLC_PROBE),$(BTF_PAHOLE_PROBE),$(BTF_OBJCOPY_PROBE)),)
DWARF2BTF = y
endif
endif
+endif
# Trick to allow make to be run from this directory
all:
@@ -271,6 +274,7 @@ $(src)/*.c: verify_target_bpf $(LIBBPF)
$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
$(obj)/hbm.o: $(src)/hbm.h
+$(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
# But, there is no easy way to fix it, so just exclude it since it is
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 6e87cc831e84..4574b1939e49 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -40,7 +40,7 @@ int prog_cnt;
int prog_array_fd = -1;
struct bpf_map_data map_data[MAX_MAPS];
-int map_data_count = 0;
+int map_data_count;
static int populate_prog_array(const char *event, int prog_fd)
{
@@ -65,7 +65,7 @@ static int write_kprobe_events(const char *val)
else
flags = O_WRONLY | O_APPEND;
- fd = open("/sys/kernel/debug/tracing/kprobe_events", flags);
+ fd = open(DEBUGFS "kprobe_events", flags);
ret = write(fd, val, strlen(val));
close(fd);
@@ -490,8 +490,8 @@ static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
/* Verify no newer features were requested */
if (validate_zero) {
- addr = (unsigned char*) def + map_sz_copy;
- end = (unsigned char*) def + map_sz_elf;
+ addr = (unsigned char *) def + map_sz_copy;
+ end = (unsigned char *) def + map_sz_elf;
for (; addr < end; addr++) {
if (*addr != 0) {
free(sym);
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
index 56c8b4115c95..ffe4c0607341 100755
--- a/samples/bpf/do_hbm_test.sh
+++ b/samples/bpf/do_hbm_test.sh
@@ -13,10 +13,10 @@ Usage() {
echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create"
echo "loads. The output is the goodput in Mbps (unless -D was used)."
echo ""
- echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]"
- echo " [-d=<delay>|--delay=<delay>] [--debug] [-E]"
+ echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]"
+ echo " [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]"
echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]"
- echo " [-l] [-N] [-p=<port>|--port=<port>] [-P]"
+ echo " [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]"
echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]"
echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]"
echo " Where:"
@@ -30,9 +30,11 @@ Usage() {
echo " other detailed information. This information is"
echo " test dependent (i.e. iperf3 or netperf)."
echo " -E enable ECN (not required for dctcp)"
+ echo " --edt use fq's Earliest Departure Time (requires fq)"
echo " -f or --flows number of concurrent flows (default=1)"
echo " -i or --id cgroup id (an integer, default is 1)"
echo " -N use netperf instead of iperf3"
+ echo " --no_cn Do not return CN notifications"
echo " -l do not limit flows using loopback"
echo " -h Help"
echo " -p or --port iperf3 port (default is 5201)"
@@ -115,6 +117,9 @@ processArgs () {
-c=*|--cc=*)
cc="${i#*=}"
;;
+ --no_cn)
+ flags="$flags --no_cn"
+ ;;
--debug)
flags="$flags -d"
debug_flag=1
@@ -126,13 +131,12 @@ processArgs () {
details=1
;;
-E)
- ecn=1
+ ecn=1
+ ;;
+ --edt)
+ flags="$flags --edt"
+ qdisc="fq"
;;
- # Support for upcomming fq Early Departure Time egress rate limiting
- #--edt)
- # prog="hbm_out_edt_kern.o"
- # qdisc="fq"
- # ;;
-f=*|--flows=*)
flows="${i#*=}"
;;
@@ -224,8 +228,8 @@ if [ "$netem" -ne "0" ] ; then
tc qdisc del dev lo root > /dev/null 2>&1
tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1
elif [ "$qdisc" != "" ] ; then
- tc qdisc del dev lo root > /dev/null 2>&1
- tc qdisc add dev lo root $qdisc > /dev/null 2>&1
+ tc qdisc del dev eth0 root > /dev/null 2>&1
+ tc qdisc add dev eth0 root $qdisc > /dev/null 2>&1
fi
n=0
@@ -395,7 +399,9 @@ fi
if [ "$netem" -ne "0" ] ; then
tc qdisc del dev lo root > /dev/null 2>&1
fi
-
+if [ "$qdisc" != "" ] ; then
+ tc qdisc del dev eth0 root > /dev/null 2>&1
+fi
sleep 2
hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'`
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c
index e51eb060244e..2d4b717726b6 100644
--- a/samples/bpf/fds_example.c
+++ b/samples/bpf/fds_example.c
@@ -14,7 +14,7 @@
#include <bpf/bpf.h>
-#include "bpf/libbpf.h"
+#include "libbpf.h"
#include "bpf_insn.h"
#include "sock_example.h"
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
index a79828ab273f..e0fbab9bec83 100644
--- a/samples/bpf/hbm.c
+++ b/samples/bpf/hbm.c
@@ -16,6 +16,7 @@
* -l Also limit flows doing loopback
* -n <#> To create cgroup \"/hbm#\" and attach prog
* Default is /hbm1
+ * --no_cn Do not return cn notifications
* -r <rate> Rate limit in Mbps
* -s Get HBM stats (marked, dropped, etc.)
* -t <time> Exit after specified seconds (default is 0)
@@ -42,14 +43,15 @@
#include <linux/bpf.h>
#include <bpf/bpf.h>
+#include <getopt.h>
#include "bpf_load.h"
#include "bpf_rlimit.h"
#include "cgroup_helpers.h"
#include "hbm.h"
#include "bpf_util.h"
-#include "bpf/bpf.h"
-#include "bpf/libbpf.h"
+#include "bpf.h"
+#include "libbpf.h"
bool outFlag = true;
int minRate = 1000; /* cgroup rate limit in Mbps */
@@ -59,6 +61,8 @@ bool stats_flag;
bool loopback_flag;
bool debugFlag;
bool work_conserving_flag;
+bool no_cn_flag;
+bool edt_flag;
static void Usage(void);
static void read_trace_pipe2(void);
@@ -185,6 +189,7 @@ static int run_bpf_prog(char *prog, int cg_id)
qstats.rate = rate;
qstats.stats = stats_flag ? 1 : 0;
qstats.loopback = loopback_flag ? 1 : 0;
+ qstats.no_cn = no_cn_flag ? 1 : 0;
if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
printf("ERROR: Could not update map element\n");
goto err;
@@ -312,6 +317,14 @@ static int run_bpf_prog(char *prog, int cg_id)
double percent_pkts, percent_bytes;
char fname[100];
FILE *fout;
+ int k;
+ static const char *returnValNames[] = {
+ "DROP_PKT",
+ "ALLOW_PKT",
+ "DROP_PKT_CWR",
+ "ALLOW_PKT_CWR"
+ };
+#define RET_VAL_COUNT 4
// Future support of ingress
// if (!outFlag)
@@ -346,6 +359,36 @@ static int run_bpf_prog(char *prog, int cg_id)
(qstats.bytes_total + 1);
fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
+
+ // ECN CE markings
+ percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
+ (qstats.pkts_total + 1);
+ fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
+ (int)qstats.pkts_ecn_ce);
+
+ // Average cwnd
+ fprintf(fout, "avg cwnd:%d\n",
+ (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
+ // Average rtt
+ fprintf(fout, "avg rtt:%d\n",
+ (int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
+ // Average credit
+ if (edt_flag)
+ fprintf(fout, "avg credit_ms:%.03f\n",
+ (qstats.sum_credit /
+ (qstats.pkts_total + 1.0)) / 1000000.0);
+ else
+ fprintf(fout, "avg credit:%d\n",
+ (int)(qstats.sum_credit /
+ (1500 * ((int)qstats.pkts_total ) + 1)));
+
+ // Return values stats
+ for (k = 0; k < RET_VAL_COUNT; k++) {
+ percent_pkts = (qstats.returnValCount[k] * 100.0) /
+ (qstats.pkts_total + 1);
+ fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
+ percent_pkts, (int)qstats.returnValCount[k]);
+ }
fclose(fout);
}
@@ -366,14 +409,16 @@ static void Usage(void)
{
printf("This program loads a cgroup skb BPF program to enforce\n"
"cgroup output (egress) bandwidth limits.\n\n"
- "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n"
- " [-t <secs>] [-w] [-h] [prog]\n"
+ "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
+ " [-s] [-t <secs>] [-w] [-h] [prog]\n"
" Where:\n"
" -o indicates egress direction (default)\n"
" -d print BPF trace debug buffer\n"
+ " --edt use fq's Earliest Departure Time\n"
" -l also limit flows using loopback\n"
" -n <#> to create cgroup \"/hbm#\" and attach prog\n"
" Default is /hbm1\n"
+ " --no_cn disable CN notifications\n"
" -r <rate> Rate in Mbps\n"
" -s Update HBM stats\n"
" -t <time> Exit after specified seconds (default is 0)\n"
@@ -393,9 +438,21 @@ int main(int argc, char **argv)
int k;
int cg_id = 1;
char *optstring = "iodln:r:st:wh";
+ struct option loptions[] = {
+ {"no_cn", 0, NULL, 1},
+ {"edt", 0, NULL, 2},
+ {NULL, 0, NULL, 0}
+ };
- while ((k = getopt(argc, argv, optstring)) != -1) {
+ while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
switch (k) {
+ case 1:
+ no_cn_flag = true;
+ break;
+ case 2:
+ prog = "hbm_edt_kern.o";
+ edt_flag = true;
+ break;
case'o':
break;
case 'd':
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
index 518e8147d084..f0963ed6a562 100644
--- a/samples/bpf/hbm.h
+++ b/samples/bpf/hbm.h
@@ -19,7 +19,8 @@ struct hbm_vqueue {
struct hbm_queue_stats {
unsigned long rate; /* in Mbps*/
unsigned long stats:1, /* get HBM stats (marked, dropped,..) */
- loopback:1; /* also limit flows using loopback */
+ loopback:1, /* also limit flows using loopback */
+ no_cn:1; /* do not use cn flags */
unsigned long long pkts_marked;
unsigned long long bytes_marked;
unsigned long long pkts_dropped;
@@ -28,4 +29,10 @@ struct hbm_queue_stats {
unsigned long long bytes_total;
unsigned long long firstPacketTime;
unsigned long long lastPacketTime;
+ unsigned long long pkts_ecn_ce;
+ unsigned long long returnValCount[4];
+ unsigned long long sum_cwnd;
+ unsigned long long sum_rtt;
+ unsigned long long sum_cwnd_cnt;
+ long long sum_credit;
};
diff --git a/samples/bpf/hbm_edt_kern.c b/samples/bpf/hbm_edt_kern.c
new file mode 100644
index 000000000000..a65b677acdb0
--- /dev/null
+++ b/samples/bpf/hbm_edt_kern.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample Host Bandwidth Manager (HBM) BPF program.
+ *
+ * A cgroup skb BPF egress program to limit cgroup output bandwidth.
+ * It uses a modified virtual token bucket queue to limit average
+ * egress bandwidth. The implementation uses credits instead of tokens.
+ * Negative credits imply that queueing would have happened (this is
+ * a virtual queue, so no queueing is done by it. However, queueing may
+ * occur at the actual qdisc (which is not used for rate limiting).
+ *
+ * This implementation uses 3 thresholds, one to start marking packets and
+ * the other two to drop packets:
+ * CREDIT
+ * - <--------------------------|------------------------> +
+ * | | | 0
+ * | Large pkt |
+ * | drop thresh |
+ * Small pkt drop Mark threshold
+ * thresh
+ *
+ * The effect of marking depends on the type of packet:
+ * a) If the packet is ECN enabled and it is a TCP packet, then the packet
+ * is ECN marked.
+ * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
+ * to reduce the congestion window. The current implementation uses a linear
+ * distribution (0% probability at marking threshold, 100% probability
+ * at drop threshold).
+ * c) If the packet is not a TCP packet, then it is dropped.
+ *
+ * If the credit is below the drop threshold, the packet is dropped. If it
+ * is a TCP packet, then it also calls tcp_cwr since packets dropped by
+ * by a cgroup skb BPF program do not automatically trigger a call to
+ * tcp_cwr in the current kernel code.
+ *
+ * This BPF program actually uses 2 drop thresholds, one threshold
+ * for larger packets (>= 120 bytes) and another for smaller packets. This
+ * protects smaller packets such as SYNs, ACKs, etc.
+ *
+ * The default bandwidth limit is set at 1Gbps but this can be changed by
+ * a user program through a shared BPF map. In addition, by default this BPF
+ * program does not limit connections using loopback. This behavior can be
+ * overwritten by the user program. There is also an option to calculate
+ * some statistics, such as percent of packets marked or dropped, which
+ * a user program, such as hbm, can access.
+ */
+
+#include "hbm_kern.h"
+
+SEC("cgroup_skb/egress")
+int _hbm_out_cg(struct __sk_buff *skb)
+{
+ long long delta = 0, delta_send;
+ unsigned long long curtime, sendtime;
+ struct hbm_queue_stats *qsp = NULL;
+ unsigned int queue_index = 0;
+ bool congestion_flag = false;
+ bool ecn_ce_flag = false;
+ struct hbm_pkt_info pkti = {};
+ struct hbm_vqueue *qdp;
+ bool drop_flag = false;
+ bool cwr_flag = false;
+ int len = skb->len;
+ int rv = ALLOW_PKT;
+
+ qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
+
+ // Check if we should ignore loopback traffic
+ if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
+ return ALLOW_PKT;
+
+ hbm_get_pkt_info(skb, &pkti);
+
+ // We may want to account for the length of headers in len
+ // calculation, like ETH header + overhead, specially if it
+ // is a gso packet. But I am not doing it right now.
+
+ qdp = bpf_get_local_storage(&queue_state, 0);
+ if (!qdp)
+ return ALLOW_PKT;
+ if (qdp->lasttime == 0)
+ hbm_init_edt_vqueue(qdp, 1024);
+
+ curtime = bpf_ktime_get_ns();
+
+ // Begin critical section
+ bpf_spin_lock(&qdp->lock);
+ delta = qdp->lasttime - curtime;
+ // bound bursts to 100us
+ if (delta < -BURST_SIZE_NS) {
+ // negative delta is a credit that allows bursts
+ qdp->lasttime = curtime - BURST_SIZE_NS;
+ delta = -BURST_SIZE_NS;
+ }
+ sendtime = qdp->lasttime;
+ delta_send = BYTES_TO_NS(len, qdp->rate);
+ __sync_add_and_fetch(&(qdp->lasttime), delta_send);
+ bpf_spin_unlock(&qdp->lock);
+ // End critical section
+
+ // Set EDT of packet
+ skb->tstamp = sendtime;
+
+ // Check if we should update rate
+ if (qsp != NULL && (qsp->rate * 128) != qdp->rate)
+ qdp->rate = qsp->rate * 128;
+
+ // Set flags (drop, congestion, cwr)
+ // last packet will be sent in the future, bound latency
+ if (delta > DROP_THRESH_NS || (delta > LARGE_PKT_DROP_THRESH_NS &&
+ len > LARGE_PKT_THRESH)) {
+ drop_flag = true;
+ if (pkti.is_tcp && pkti.ecn == 0)
+ cwr_flag = true;
+ } else if (delta > MARK_THRESH_NS) {
+ if (pkti.is_tcp)
+ congestion_flag = true;
+ else
+ drop_flag = true;
+ }
+
+ if (congestion_flag) {
+ if (bpf_skb_ecn_set_ce(skb)) {
+ ecn_ce_flag = true;
+ } else {
+ if (pkti.is_tcp) {
+ unsigned int rand = bpf_get_prandom_u32();
+
+ if (delta >= MARK_THRESH_NS +
+ (rand % MARK_REGION_SIZE_NS)) {
+ // Do congestion control
+ cwr_flag = true;
+ }
+ } else if (len > LARGE_PKT_THRESH) {
+ // Problem if too many small packets?
+ drop_flag = true;
+ congestion_flag = false;
+ }
+ }
+ }
+
+ if (pkti.is_tcp && drop_flag && pkti.packets_out <= 1) {
+ drop_flag = false;
+ cwr_flag = true;
+ congestion_flag = false;
+ }
+
+ if (qsp != NULL && qsp->no_cn)
+ cwr_flag = false;
+
+ hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
+ cwr_flag, ecn_ce_flag, &pkti, (int) delta);
+
+ if (drop_flag) {
+ __sync_add_and_fetch(&(qdp->lasttime), -delta_send);
+ rv = DROP_PKT;
+ }
+
+ if (cwr_flag)
+ rv |= CWR;