diff options
Diffstat (limited to 'net/core/filter.c')
-rw-r--r-- | net/core/filter.c | 1423 |
1 files changed, 1301 insertions, 122 deletions
diff --git a/net/core/filter.c b/net/core/filter.c index 201ff36b17a8..3d9ba7e5965a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -57,7 +57,17 @@ #include <net/sock_reuseport.h> #include <net/busy_poll.h> #include <net/tcp.h> +#include <net/xfrm.h> #include <linux/bpf_trace.h> +#include <net/xdp_sock.h> +#include <linux/inetdevice.h> +#include <net/ip_fib.h> +#include <net/flow.h> +#include <net/arp.h> +#include <net/ipv6.h> +#include <linux/seg6_local.h> +#include <net/seg6.h> +#include <net/seg6_local.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -111,12 +121,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) } EXPORT_SYMBOL(sk_filter_trim_cap); -BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) +BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) { return skb_get_poff(skb); } -BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -136,7 +146,7 @@ BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) +BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) { struct nlattr *nla; @@ -160,13 +170,94 @@ BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } -BPF_CALL_0(__get_raw_cpu_id) +BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u8 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return *(u8 *)ptr; + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u16 tmp, *ptr; + const int len = sizeof(tmp); + + if (offset >= 0) { + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be16(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, + data, int, headlen, int, offset) +{ + u32 tmp, *ptr; + const int len = sizeof(tmp); + + if (likely(offset >= 0)) { + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + } else { + ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); + if (likely(ptr)) + return get_unaligned_be32(ptr); + } + + return -EFAULT; +} + +BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, + int, offset) +{ + return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, + offset); +} + +BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = __get_raw_cpu_id, + .func = bpf_get_raw_cpu_id, .gpl_only = false, .ret_type = RET_INTEGER, }; @@ -316,16 +407,16 @@ static bool convert_bpf_extensions(struct sock_filter *fp, /* Emit call(arg1=CTX, arg2=A, arg3=X) */ switch (fp->k) { case SKF_AD_OFF + SKF_AD_PAY_OFFSET: - *insn = BPF_EMIT_CALL(__skb_get_pay_offset); + *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); break; case SKF_AD_OFF + SKF_AD_NLATTR: - *insn = BPF_EMIT_CALL(__skb_get_nlattr); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); break; case SKF_AD_OFF + SKF_AD_NLATTR_NEST: - *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); + *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); break; case SKF_AD_OFF + SKF_AD_CPU: - *insn = BPF_EMIT_CALL(__get_raw_cpu_id); + *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); break; case SKF_AD_OFF + SKF_AD_RANDOM: *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); @@ -352,26 +443,87 @@ static bool convert_bpf_extensions(struct sock_filter *fp, return true; } +static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +{ + const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); + int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); + bool endian = BPF_SIZE(fp->code) == BPF_H || + BPF_SIZE(fp->code) == BPF_W; + bool indirect = BPF_MODE(fp->code) == BPF_IND; + const int ip_align = NET_IP_ALIGN; + struct bpf_insn *insn = *insnp; + int offset = fp->k; + + if (!indirect && + ((unaligned_ok && offset >= 0) || + (!unaligned_ok && offset >= 0 && + offset + ip_align >= 0 && + offset + ip_align % size == 0))) { + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); + *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); + *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian); + *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D, + offset); + if (endian) + *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); + *insn++ = BPF_JMP_A(8); + } + + *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); + *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); + if (!indirect) { + *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); + } else { + *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); + if (fp->k) + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); + } + + switch (BPF_SIZE(fp->code)) { + case BPF_B: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); + break; + case BPF_H: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); + break; + case BPF_W: + *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); + break; + default: + return false; + } + + *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn = BPF_EXIT_INSN(); + + *insnp = insn; + return true; +} + /** * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program + * @seen_ld_abs: bool whether we've seen ld_abs/ind * * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' * style extended BPF (eBPF). * Conversion workflow: * * 1) First pass for calculating the new program length: - * bpf_convert_filter(old_prog, old_len, NULL, &new_len) + * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); + * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_prog *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len, + bool *seen_ld_abs) { int new_flen = 0, pass = 0, target, i, stack_off; struct bpf_insn *new_insn, *first_insn = NULL; @@ -410,12 +562,27 @@ do_pass: * do this ourself. Initial CTX is present in BPF_REG_ARG1. */ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); + if (*seen_ld_abs) { + /* For packet access in classic BPF, cache skb->data + * in callee-saved BPF R8 and skb->len - skb->data_len + * (headlen) in BPF R9. Since classic BPF is read-only + * on CTX, we only need to cache it once. + */ + *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + BPF_REG_D, BPF_REG_CTX, + offsetof(struct sk_buff, data)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, + offsetof(struct sk_buff, len)); + *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, + offsetof(struct sk_buff, data_len)); + *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); + } } else { new_insn += 3; } for (i = 0; i < len; fp++, i++) { - struct bpf_insn tmp_insns[6] = { }; + struct bpf_insn tmp_insns[32] = { }; struct bpf_insn *insn = tmp_insns; if (addrs) @@ -458,6 +625,11 @@ do_pass: BPF_MODE(fp->code) == BPF_ABS && convert_bpf_extensions(fp, &insn)) break; + if (BPF_CLASS(fp->code) == BPF_LD && + convert_bpf_ld_abs(fp, &insn)) { + *seen_ld_abs = true; + break; + } if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { @@ -567,21 +739,31 @@ jmp_rest: break; /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ - case BPF_LDX | BPF_MSH | BPF_B: - /* tmp = A */ - *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); + case BPF_LDX | BPF_MSH | BPF_B: { + struct sock_filter tmp = { + .code = BPF_LD | BPF_ABS | BPF_B, + .k = fp->k, + }; + + *seen_ld_abs = true; + + /* X = A */ + *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = BPF_R0 = *(u8 *) (skb->data + K) */ - *insn++ = BPF_LD_ABS(BPF_B, fp->k); + convert_bpf_ld_abs(&tmp, &insn); + insn++; /* A &= 0xf */ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); /* A <<= 2 */ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); + /* tmp = X */ + *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); /* X = A */ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); /* A = tmp */ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; - + } /* RET_K is remaped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ @@ -663,6 +845,8 @@ jmp_rest: if (!new_prog) { /* Only calculating new length. */ *new_len = new_insn - first_insn; + if (*seen_ld_abs) + *new_len += 4; /* Prologue bits. */ return 0; } @@ -1024,6 +1208,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) struct sock_filter *old_prog; struct bpf_prog *old_fp; int err, new_len, old_len = fp->len; + bool seen_ld_abs = false; /* We are free to overwrite insns et al right here as it * won't be used at this point in time anymore internally @@ -1045,7 +1230,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) } /* 1st pass: calculate the new program length. */ - err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); + err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, + &seen_ld_abs); if (err) goto out_err_free; @@ -1064,7 +1250,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = bpf_convert_filter(old_prog, old_len, fp, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp, &new_len, + &seen_ld_abs); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, @@ -1512,6 +1699,47 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .arg4_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, + u32, offset, void *, to, u32, len, u32, start_header) +{ + u8 *ptr; + + if (unlikely(offset > 0xffff || len > skb_headlen(skb))) + goto err_clear; + + switch (start_header) { + case BPF_HDR_START_MAC: + ptr = skb_mac_header(skb) + offset; + break; + case BPF_HDR_START_NET: + ptr = skb_network_header(skb) + offset; + break; + default: + goto err_clear; + } + + if (likely(ptr >= skb_mac_header(skb) && + ptr + len <= skb_tail_pointer(skb))) { + memcpy(to, ptr, len); + return 0; + } + +err_clear: + memset(to, 0, len); + return -EFAULT; +} + +static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { + .func = bpf_skb_load_bytes_relative, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) { /* Idea is the following: should the needed direct read/write @@ -1857,6 +2085,33 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, + struct bpf_map *, map, void *, key, u64, flags) +{ + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; + + tcb->bpf.flags = flags; + tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; + + return SK_PASS; +} + +static const struct bpf_func_proto bpf_sk_redirect_hash_proto = { + .func = bpf_sk_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { @@ -1866,9 +2121,10 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - tcb->bpf.key = key; tcb->bpf.flags = flags; - tcb->bpf.map = map; + tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key); + if (!tcb->bpf.sk_redir) + return SK_DROP; return SK_PASS; } @@ -1876,16 +2132,8 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct sock *do_sk_redirect_map(struct sk_buff *skb) { struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - struct sock *sk = NULL; - - if (tcb->bpf.map) { - sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key); - tcb->bpf.key = 0; - tcb->bpf.map = NULL; - } - - return sk; + return tcb->bpf.sk_redir; } static const struct bpf_func_proto bpf_sk_redirect_map_proto = { @@ -1898,32 +2146,49 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = { .arg4_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, - struct bpf_map *, map, u32, key, u64, flags) +BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg, + struct bpf_map *, map, void *, key, u64, flags) { /* If user passes invalid input drop the packet. */ if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; - msg->key = key; msg->flags = flags; - msg->map = map; + msg->sk_redir = __sock_hash_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; return SK_PASS; } -struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +static const struct bpf_func_proto bpf_msg_redirect_hash_proto = { + .func = bpf_msg_redirect_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, + struct bpf_map *, map, u32, key, u64, flags) { - struct sock *sk = NULL; + /* If user passes invalid input drop the packet. */ + if (unlikely(flags & ~(BPF_F_INGRESS))) + return SK_DROP; - if (msg->map) { - sk = __sock_map_lookup_elem(msg->map, msg->key); + msg->flags = flags; + msg->sk_redir = __sock_map_lookup_elem(map, key); + if (!msg->sk_redir) + return SK_DROP; - msg->key = 0; - msg->map = NULL; - } + return SK_PASS; +} - return sk; +struct sock *do_msg_redirect_map(struct sk_msg_buff *msg) +{ + return msg->sk_redir; } static const struct bpf_func_proto bpf_msg_redirect_map_proto = { @@ -2186,7 +2451,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, return ret; } -const struct bpf_func_proto bpf_skb_vlan_push_proto = { +static const struct bpf_func_proto bpf_skb_vlan_push_proto = { .func = bpf_skb_vlan_push, .gpl_only = false, .ret_type = RET_INTEGER, @@ -2194,7 +2459,6 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) { @@ -2208,13 +2472,12 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) return ret; } -const struct bpf_func_proto bpf_skb_vlan_pop_proto = { +static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .func = bpf_skb_vlan_pop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; -EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { @@ -2699,8 +2962,9 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp) BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) { + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); unsigned long metalen = xdp_get_metalen(xdp); - void *data_start = xdp->data_hard_start + metalen; + void *data_start = xdp_frame_end + metalen; void *data = xdp->data + offset; if (unlikely(data < data_start || @@ -2724,14 +2988,39 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) +{ + void *data_end = xdp->data_end + offset; + + /* only shrinking is allowed for now. */ + if (unlikely(offset >= 0)) + return -EINVAL; + + if (unlikely(data_end < xdp->data + ETH_HLEN)) + return -EINVAL; + + xdp->data_end = data_end; + + return 0; +} + +static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = { + .func = bpf_xdp_adjust_tail, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) { + void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame); void *meta = xdp->data_meta + offset; unsigned long metalen = xdp->data - meta; if (xdp_data_meta_unsupported(xdp)) return -ENOTSUPP; - if (unlikely(meta < xdp->data_hard_start || + if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; if (unlikely((metalen & (sizeof(__u32) - 1)) || @@ -2756,16 +3045,20 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp, u32 index) { - int err; + struct xdp_frame *xdpf; + int sent; if (!dev->netdev_ops->ndo_xdp_xmit) { return -EOPNOTSUPP; } - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); - if (err) - return err; - dev->netdev_ops->ndo_xdp_flush(dev); + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) + return -EOVERFLOW; + + sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH); + if (sent <= 0) + return sent; return 0; } @@ -2776,24 +3069,33 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, { int err; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { - struct net_device *dev = fwd; - - if (!dev->netdev_ops->ndo_xdp_xmit) - return -EOPNOTSUPP; + switch (map->map_type) { + case BPF_MAP_TYPE_DEVMAP: { + struct bpf_dtab_netdev *dst = fwd; - err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); + err = dev_map_enqueue(dst, xdp, dev_rx); if (err) return err; __dev_map_insert_ctx(map, index); - - } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + break; + } + case BPF_MAP_TYPE_CPUMAP: { struct bpf_cpu_map_entry *rcpu = fwd; err = cpu_map_enqueue(rcpu, xdp, dev_rx); if (err) return err; __cpu_map_insert_ctx(map, index); + break; + } + case BPF_MAP_TYPE_XSKMAP: { + struct xdp_sock *xs = fwd; + + err = __xsk_map_redirect(map, xdp, xs); + return err; + } + default: + break; } return 0; } @@ -2812,6 +3114,9 @@ void xdp_do_flush_map(void) case BPF_MAP_TYPE_CPUMAP: __cpu_map_flush(map); break; + case BPF_MAP_TYPE_XSKMAP: + __xsk_map_flush(map); + break; default: break; } @@ -2826,6 +3131,8 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) return __dev_map_lookup_elem(map, index); case BPF_MAP_TYPE_CPUMAP: return __cpu_map_lookup_elem(map, index); + case BPF_MAP_TYPE_XSKMAP: + return __xsk_map_lookup_elem(map, index); default: return NULL; } @@ -2923,13 +3230,14 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; struct bpf_map *map = ri->map; - struct net_device *fwd = NULL; u32 index = ri->ifindex; + void *fwd = NULL; int err = 0; ri->ifindex = 0; @@ -2951,6 +3259,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) goto err; skb->dev = fwd; + generic_xdp_tx(skb, xdp_prog); + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + struct xdp_sock *xs = fwd; + + err = xsk_generic_rcv(xs, xdp); + if (err) + goto err; + consume_skb(skb); } else { /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; @@ -2965,7 +3281,7 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); u32 index = ri->ifindex; @@ -2973,7 +3289,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, int err = 0; if (ri->map) - return xdp_do_generic_redirect_map(dev, skb, xdp_prog); + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog); ri->ifindex = 0; fwd = dev_get_by_index_rcu(dev_net(dev), index); @@ -2987,6 +3303,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, skb->dev = fwd; _trace_xdp_redirect(dev, xdp_prog, index); + generic_xdp_tx(skb, xdp_prog); return 0; err: _trace_xdp_redirect_err(dev, xdp_prog, index, err); @@ -3045,27 +3362,6 @@ static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; -bool bpf_helper_changes_pkt_data(void *func) -{ - if (func == bpf_skb_vlan_push || - func == bpf_skb_vlan_pop || - func == bpf_skb_store_bytes || - func == bpf_skb_change_proto || - func == bpf_skb_change_head || - func == bpf_skb_change_tail || - func == bpf_skb_adjust_room || - func == bpf_skb_pull_data || - func == bpf_clone_redirect || - func == bpf_l3_csum_replace || - func == bpf_l4_csum_replace || - func == bpf_xdp_adjust_head || - func == bpf_xdp_adjust_meta || - func == bpf_msg_pull_data) - return true; - - return false; -} - static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, unsigned long off, unsigned long len) { @@ -3148,6 +3444,7 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, @@ -3155,6 +3452,8 @@ set_compat: to->tunnel_label = be32_to_cpu(info->key.label); } else { to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + to->tunnel_label = 0; } if (unlikely(size != sizeof(struct bpf_tunnel_key))) @@ -3364,6 +3663,27 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { .arg3_type = ARG_ANYTHING, }; +#ifdef CONFIG_SOCK_CGROUP_DATA +BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) +{ + struct sock *sk = skb_to_full_sk(skb); + struct cgroup *cgrp; + + if (!sk || !sk_fullsock(sk)) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgrp->kn->id.id; +} + +static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { + .func = bpf_skb_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; +#endif + static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, unsigned long off, unsigned long len) { @@ -3711,6 +4031,603 @@ static const struct bpf_func_proto bpf_bind_proto = { .arg3_type = ARG_CONST_SIZE, }; +#ifdef CONFIG_XFRM +BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, + struct bpf_xfrm_state *, to, u32, size, u64, flags) +{ + const struct sec_path *sp = skb_sec_path(skb); + const struct xfrm_state *x; + + if (!sp || unlikely(index >= sp->len || flags)) + goto err_clear; + + x = sp->xvec[index]; + + if (unlikely(size != sizeof(struct bpf_xfrm_state))) + goto err_clear; + + to->reqid = x->props.reqid; + to->spi = x->id.spi; + to->family = x->props.family; + to->ext = 0; + + if (to->family == AF_INET6) { + memcpy(to->remote_ipv6, x->props.saddr.a6, + sizeof(to->remote_ipv6)); + } else { + to->remote_ipv4 = x->props.saddr.a4; + memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); + } + + return 0; +err_clear: + memset(to, 0, size); + return -EINVAL; +} + +static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { + .func = bpf_skb_get_xfrm_state, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, + .arg5_type = ARG_ANYTHING, +}; +#endif + +#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6) +static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, + const struct neighbour *neigh, + const struct net_device *dev) +{ + memcpy(params->dmac, neigh->ha, ETH_ALEN); + memcpy(params->smac, dev->dev_addr, ETH_ALEN); + params->h_vlan_TCI = 0; + params->h_vlan_proto = 0; + + return dev->ifindex; +} +#endif + +#if IS_ENABLED(CONFIG_INET) +static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags, bool check_mtu) +{ + struct in_device *in_dev; + struct neighbour *neigh; + struct net_device *dev; + struct fib_result res; + struct fib_nh *nh; + struct flowi4 fl4; + int err; + u32 mtu; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + /* verify forwarding is enabled on this interface */ + in_dev = __in_dev_get_rcu(dev); + if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl4.flowi4_iif = 1; + fl4.flowi4_oif = params->ifindex; + } else { + fl4.flowi4_iif = params->ifindex; + fl4.flowi4_oif = 0; + } + fl4.flowi4_tos = params->tos & IPTOS_RT_MASK; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; + + fl4.flowi4_proto = params->l4_protocol; + fl4.daddr = params->ipv4_dst; + fl4.saddr = params->ipv4_src; + fl4.fl4_sport = params->sport; + fl4.fl4_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib_table *tb; + + tb = fib_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); + } else { + fl4.flowi4_mark = 0; + fl4.flowi4_secid = 0; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_uid = sock_net_uid(net, NULL); + + err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); + } + + if (err || res.type != RTN_UNICAST) + return 0; + + if (res.fi->fib_nhs > 1) + fib_select_path(net, &res, &fl4, NULL); + + if (check_mtu) { + mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); + if (params->tot_len > mtu) + return 0; + } + + nh = &res.fi->fib_nh[res.nh_sel]; + + /* do not handle lwt encaps right now */ + if (nh->nh_lwtstate) + return 0; + + dev = nh->nh_dev; + if (unlikely(!dev)) + return 0; + + if (nh->nh_gw) + params->ipv4_dst = nh->nh_gw; + + params->rt_metric = res.fi->fib_priority; + + /* xdp and cls_bpf programs are run in RCU-bh so + * rcu_read_lock_bh is not needed here + */ + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, + u32 flags, bool check_mtu) +{ + struct in6_addr *src = (struct in6_addr *) params->ipv6_src; + struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst; + struct neighbour *neigh; + struct net_device *dev; + struct inet6_dev *idev; + struct fib6_info *f6i; + struct flowi6 fl6; + int strict = 0; + int oif; + u32 mtu; + + /* link local addresses are never forwarded */ + if (rt6_need_strict(dst) || rt6_need_strict(src)) + return 0; + + dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; + + idev = __in6_dev_get_safely(dev); + if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) + return 0; + + if (flags & BPF_FIB_LOOKUP_OUTPUT) { + fl6.flowi6_iif = 1; + oif = fl6.flowi6_oif = params->ifindex; + } else { + oif = fl6.flowi6_iif = params->ifindex; + fl6.flowi6_oif = 0; + strict = RT6_LOOKUP_F_HAS_SADDR; + } + fl6.flowlabel = params->flowinfo; + fl6.flowi6_scope = 0; + fl6.flowi6_flags = 0; + fl6.mp_hash = 0; + + fl6.flowi6_proto = params->l4_protocol; + fl6.daddr = *dst; + fl6.saddr = *src; + fl6.fl6_sport = params->sport; + fl6.fl6_dport = params->dport; + + if (flags & BPF_FIB_LOOKUP_DIRECT) { + u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; + struct fib6_table *tb; + + tb = ipv6_stub->fib6_get_table(net, tbid); + if (unlikely(!tb)) + return 0; + + f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); + } else { + fl6.flowi6_mark = 0; + fl6.flowi6_secid = 0; + fl6.flowi6_tun_key.tun_id = 0; + fl6.flowi6_uid = sock_net_uid(net, NULL); + + f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict); + } + + if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) + return 0; + + if (unlikely(f6i->fib6_flags & RTF_REJECT || + f6i->fib6_type != RTN_UNICAST)) + return 0; + + if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) + f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, + fl6.flowi6_oif, NULL, + strict); + + if (check_mtu) { + mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); + if (params->tot_len > mtu) + return 0; + } + + if (f6i->fib6_nh.nh_lwtstate) + return 0; + + if (f6i->fib6_flags & RTF_GATEWAY) + *dst = f6i->fib6_nh.nh_gw; + + dev = f6i->fib6_nh.nh_dev; + params->rt_metric = f6i->fib6_metric; + + /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is + * not needed here. Can not use __ipv6_neigh_lookup_noref here + * because we need to get nd_tbl via the stub + */ + neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, + ndisc_hashfn, dst, dev); + if (neigh) + return bpf_fib_set_fwd_params(params, neigh, dev); + + return 0; +} +#endif + +BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + if (plen < sizeof(*params)) + return -EINVAL; + + if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params, + flags, true); +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params, + flags, true); +#endif + } + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = { + .func = bpf_xdp_fib_lookup, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, + struct bpf_fib_lookup *, params, int, plen, u32, flags) +{ + struct net *net = dev_net(skb->dev); + int index = -EAFNOSUPPORT; + + if (plen < sizeof(*params)) + return -EINVAL; + + if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT)) + return -EINVAL; + + switch (params->family) { +#if IS_ENABLED(CONFIG_INET) + case AF_INET: + index = bpf_ipv4_fib_lookup(net, params, flags, false); + break; +#endif +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + index = bpf_ipv6_fib_lookup(net, params, flags, false); + break; +#endif + } + + if (index > 0) { + struct net_device *dev; + + dev = dev_get_by_index_rcu(net, index); + if (!is_skb_forwardable(dev, skb)) + index = 0; + } + + return index; +} + +static const struct bp |