path: root/kernel/bpf
Diffstat (limited to 'kernel/bpf')
-rw-r--r--  kernel/bpf/Makefile              1
-rw-r--r--  kernel/bpf/arraymap.c           18
-rw-r--r--  kernel/bpf/btf.c                12
-rw-r--r--  kernel/bpf/cgroup.c            448
-rw-r--r--  kernel/bpf/core.c               60
-rw-r--r--  kernel/bpf/cpumap.c            117
-rw-r--r--  kernel/bpf/devmap.c            124
-rw-r--r--  kernel/bpf/hashtab.c            14
-rw-r--r--  kernel/bpf/local_storage.c      13
-rw-r--r--  kernel/bpf/lpm_trie.c            8
-rw-r--r--  kernel/bpf/queue_stack_maps.c   13
-rw-r--r--  kernel/bpf/reuseport_array.c    17
-rw-r--r--  kernel/bpf/stackmap.c           28
-rw-r--r--  kernel/bpf/syscall.c           122
-rw-r--r--  kernel/bpf/verifier.c         1283
-rw-r--r--  kernel/bpf/xskmap.c             22
16 files changed, 1887 insertions, 413 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4c2fa3ac56f6..29d781061cd5 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-y := core.o
+CFLAGS_core.o += $(call cc-disable-warning, override-init)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 262a321f58a6..1c65ce0098a9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -75,6 +75,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
u32 elem_size, index_mask, max_entries;
bool unpriv = !capable(CAP_SYS_ADMIN);
u64 cost, array_size, mask64;
+ struct bpf_map_memory mem;
struct bpf_array *array;
elem_size = round_up(attr->value_size, 8);
@@ -108,32 +109,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
/* make sure there is no u32 overflow later in round_up() */
cost = array_size;
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-ENOMEM);
- if (percpu) {
+ if (percpu)
cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-ENOMEM);
- }
- cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- ret = bpf_map_precharge_memlock(cost);
+ ret = bpf_map_charge_init(&mem, cost);
if (ret < 0)
return ERR_PTR(ret);
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
- if (!array)
+ if (!array) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
+ }
array->index_mask = index_mask;
array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
- array->map.pages = cost;
+ bpf_map_charge_move(&array->map.memory, &mem);
array->elem_size = elem_size;
if (percpu && bpf_array_alloc_percpu(array)) {
+ bpf_map_charge_finish(&array->map.memory);
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
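
The arraymap hunks above replace the open-coded page accounting (map.pages plus bpf_map_precharge_memlock()) with the bpf_map_memory charging helpers: charge first, allocate, then move the charge into the map, undoing it on any failure. A minimal sketch of that lifecycle for a hypothetical map type (struct demo_map and the surrounding error handling are illustrative; only the three charge helpers are taken from the hunk):

struct demo_map {
	struct bpf_map map;
	void *data;
};

static struct bpf_map *demo_map_alloc(union bpf_attr *attr)
{
	u64 cost = (u64)attr->max_entries * sizeof(void *);
	struct bpf_map_memory mem;
	struct demo_map *dmap;
	int ret;

	ret = bpf_map_charge_init(&mem, cost);	/* charge memlock up front */
	if (ret < 0)
		return ERR_PTR(ret);

	dmap = bpf_map_area_alloc(sizeof(*dmap), NUMA_NO_NODE);
	if (!dmap) {
		bpf_map_charge_finish(&mem);	/* undo the charge on failure */
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&dmap->map, attr);
	bpf_map_charge_move(&dmap->map.memory, &mem);	/* map now owns the charge */
	return &dmap->map;
}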
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cad09858a5f2..546ebee39e2a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1928,8 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->index_type */
index_type_id = array->index_type;
index_type = btf_type_by_id(btf, index_type_id);
- if (btf_type_is_resolve_source_only(index_type) ||
- btf_type_nosize_or_null(index_type)) {
+ if (btf_type_nosize_or_null(index_type) ||
+ btf_type_is_resolve_source_only(index_type)) {
btf_verifier_log_type(env, v->t, "Invalid index");
return -EINVAL;
}
@@ -1948,8 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->type */
elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
- if (btf_type_is_resolve_source_only(elem_type) ||
- btf_type_nosize_or_null(elem_type)) {
+ if (btf_type_nosize_or_null(elem_type) ||
+ btf_type_is_resolve_source_only(elem_type)) {
btf_verifier_log_type(env, v->t,
"Invalid elem");
return -EINVAL;
@@ -2170,8 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
const struct btf_type *member_type = btf_type_by_id(env->btf,
member_type_id);
- if (btf_type_is_resolve_source_only(member_type) ||
- btf_type_nosize_or_null(member_type)) {
+ if (btf_type_nosize_or_null(member_type) ||
+ btf_type_is_resolve_source_only(member_type)) {
btf_verifier_log_member(env, v->t, member,
"Invalid member");
return -EINVAL;
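
All three btf.c hunks make the same change: btf_type_nosize_or_null() is now evaluated before btf_type_is_resolve_source_only(). Presumably the latter inspects the type's info field and so cannot tolerate a NULL type, while the former handles NULL explicitly; C's short-circuit || then guarantees the dereferencing helper never sees NULL. A minimal sketch of the hazard (the helper bodies here are assumptions for illustration, not copied from btf.c):

static bool nosize_or_null(const struct btf_type *t)
{
	return !t || t->size == 0;			/* safe to call with t == NULL */
}

static bool is_resolve_source_only(const struct btf_type *t)
{
	return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;	/* reads t->info: must not see NULL */
}

/* Old order: is_resolve_source_only(NULL) faults before the NULL check runs.
 * New order (as in the hunks): nosize_or_null(t) || is_resolve_source_only(t)
 * short-circuits on NULL and never reaches the dereference.
 */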
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 92a7d0cf8d13..0a00eaca6fae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -15,19 +15,34 @@
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
+#include <net/bpf_sk_storage.h>
+
+#include "../cgroup/cgroup-internal.h"
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+void cgroup_bpf_offline(struct cgroup *cgrp)
+{
+ cgroup_get(cgrp);
+ percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
/**
- * cgroup_bpf_put() - put references of all bpf programs
- * @cgrp: the cgroup to modify
+ * cgroup_bpf_release() - put references of all bpf programs and
+ * release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
*/
-void cgroup_bpf_put(struct cgroup *cgrp)
+static void cgroup_bpf_release(struct work_struct *work)
{
+ struct cgroup *cgrp = container_of(work, struct cgroup,
+ bpf.release_work);
enum bpf_cgroup_storage_type stype;
+ struct bpf_prog_array *old_array;
unsigned int type;
+ mutex_lock(&cgroup_mutex);
+
for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog_list *pl, *tmp;
@@ -42,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp)
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
- bpf_prog_array_free(cgrp->bpf.effective[type]);
+ old_array = rcu_dereference_protected(
+ cgrp->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ bpf_prog_array_free(old_array);
}
+
+ mutex_unlock(&cgroup_mutex);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+ cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ * of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+ struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+ INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+ queue_work(system_wq, &cgrp->bpf.release_work);
}
/* count number of elements in the list.
@@ -98,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
*/
static int compute_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
- struct bpf_prog_array __rcu **array)
+ struct bpf_prog_array **array)
{
enum bpf_cgroup_storage_type stype;
struct bpf_prog_array *progs;
@@ -136,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
}
} while ((p = cgroup_parent(p)));
- rcu_assign_pointer(*array, progs);
+ *array = progs;
return 0;
}
static void activate_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
- struct bpf_prog_array __rcu *array)
+ struct bpf_prog_array *old_array)
{
- struct bpf_prog_array __rcu *old_array;
-
- old_array = xchg(&cgrp->bpf.effective[type], array);
+ rcu_swap_protected(cgrp->bpf.effective[type], old_array,
+ lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/
@@ -163,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
* that array below is variable length
*/
#define NR ARRAY_SIZE(cgrp->bpf.effective)
- struct bpf_prog_array __rcu *arrays[NR] = {};
- int i;
+ struct bpf_prog_array *arrays[NR] = {};
+ int ret, i;
+
+ ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+ GFP_KERNEL);
+ if (ret)
+ return ret;
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -180,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
cleanup:
for (i = 0; i < NR; i++)
bpf_prog_array_free(arrays[i]);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+
return -ENOMEM;
}
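
cgroup_bpf_inherit() now pins the cgroup's bpf state with a percpu_ref that cgroup_bpf_offline() kills and cgroup_bpf_release_fn() releases via a workqueue, so the final teardown (which takes cgroup_mutex) runs in process context instead of the ref's release callback context. A generic sketch of that percpu_ref pattern, with demo_* names standing in for the cgroup-specific pieces:

struct demo_obj {
	struct percpu_ref refcnt;
	struct work_struct release_work;
};

static void demo_release_workfn(struct work_struct *work)
{
	struct demo_obj *obj = container_of(work, struct demo_obj, release_work);

	/* free resources that may sleep or take mutexes here */
	percpu_ref_exit(&obj->refcnt);
	kfree(obj);
}

static void demo_release(struct percpu_ref *ref)
{
	/* runs once the count drops to zero; defer the heavy work */
	struct demo_obj *obj = container_of(ref, struct demo_obj, refcnt);

	INIT_WORK(&obj->release_work, demo_release_workfn);
	queue_work(system_wq, &obj->release_work);
}

static int demo_init(struct demo_obj *obj)
{
	return percpu_ref_init(&obj->refcnt, demo_release, 0, GFP_KERNEL);
}

static void demo_offline(struct demo_obj *obj)
{
	percpu_ref_kill(&obj->refcnt);	/* demo_release() fires when the last ref drops */
}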
@@ -193,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
err = compute_effective_progs(desc, type, &desc->bpf.inactive);
if (err)
goto cleanup;
@@ -202,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
+ if (unlikely(desc->bpf.inactive)) {
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+ continue;
+ }
+
activate_effective_progs(desc, type, desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -441,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
enum bpf_attach_type type = attr->query.attach_type;
struct list_head *progs = &cgrp->bpf.progs[type];
u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog_array *effective;
int cnt, ret = 0, i;
+ effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
- cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+ cnt = bpf_prog_array_length(effective);
else
cnt = prog_list_length(progs);
@@ -461,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
- return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
- prog_ids, cnt);
+ return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
} else {
struct bpf_prog_list *pl;
u32 id;
@@ -545,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* The program type passed in via @type must be suitable for network
* filtering. No further check is performed to assert that.
*
- * This function will return %-EPERM if any if an attached program was found
- * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ * For egress packets, this function can return:
+ * NET_XMIT_SUCCESS (0) - continue with packet output
+ * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
+ * NET_XMIT_CN (2) - continue with packet output and notify TCP
+ * to call cwr
+ * -EPERM - drop packet
+ *
+ * For ingress packets, this function will return -EPERM if any
+ * attached program was found and if it returned != 1 during execution.
+ * Otherwise 0 is returned.
*/
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
@@ -572,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
/* compute pointers for the bpf prog */
bpf_compute_and_save_data_end(skb, &saved_data_end);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- __bpf_prog_run_save_cb);
+ if (type == BPF_CGROUP_INET_EGRESS) {
+ ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
+ cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+ } else {
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ __bpf_prog_run_save_cb);
+ ret = (ret == 1 ? 0 : -EPERM);
+ }
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
skb->sk = save_sk;
- return ret == 1 ? 0 : -EPERM;
+
+ return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
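
With the egress branch above, __cgroup_bpf_run_filter_skb() no longer collapses everything to 0 or -EPERM but forwards the NET_XMIT_* codes documented in the new comment. A sketch of how an egress caller could consume that return value (demo_egress_hook() is an invented name; the real call sites live in the IP output path):

static int demo_egress_hook(struct sock *sk, struct sk_buff *skb)
{
	int ret = __cgroup_bpf_run_filter_skb(sk, skb, BPF_CGROUP_INET_EGRESS);

	if (ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)
		return ret;	/* caller keeps transmitting; CN additionally tells TCP to back off */

	kfree_skb(skb);		/* NET_XMIT_DROP or -EPERM: the packet does not go out */
	return ret;
}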
@@ -867,6 +939,190 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
+#ifdef CONFIG_NET
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_prog_array *prog_array;
+ bool empty;
+
+ rcu_read_lock();
+ prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+ empty = bpf_prog_array_is_empty(prog_array);
+ rcu_read_unlock();
+
+ return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+ if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+ return -EINVAL;
+
+ ctx->optval = kzalloc(max_optlen, GFP_USER);
+ if (!ctx->optval)
+ return -ENOMEM;
+
+ ctx->optval_end = ctx->optval + max_optlen;
+ ctx->optlen = max_optlen;
+
+ return 0;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+ kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+ int *optname, char __user *optval,
+ int *optlen, char **kernel_optval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = *level,
+ .optname = *optname,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+ return 0;
+
+ ret = sockopt_alloc_buf(&ctx, *optlen);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen == -1) {
+ /* optlen set to -1, bypass kernel */
+ ret = 1;
+ } else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+ /* optlen is out of bounds */
+ ret = -EFAULT;
+ } else {
+ /* optlen within bounds, run kernel handler */
+ ret = 0;
+
+ /* export any potential modifications */
+ *level = ctx.level;
+ *optname = ctx.optname;
+ *optlen = ctx.optlen;
+ *kernel_optval = ctx.optval;
+ }
+
+out:
+ if (ret)
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval,
+ int __user *optlen, int max_optlen,
+ int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .retval = retval,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+ return retval;
+
+ ret = sockopt_alloc_buf(&ctx, max_optlen);
+ if (ret)
+ return ret;
+
+ if (!retval) {
+ /* If kernel getsockopt finished successfully,
+ * copy whatever was returned to the user back
+ * into our temporary buffer. Set optlen to the
+ * one that kernel returned as well to let
+ * BPF programs inspect the value.
+ */
+
+ if (get_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen)
+ ctx.optlen = max_optlen;
+
+ if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* BPF programs only allowed to set retval to 0, not some
+ * arbitrary value.
+ */
+ if (ctx.retval != 0 && ctx.retval != retval) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+ put_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = ctx.retval;
+
+out:
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
+#endif
+
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
size_t *lenp)
{
@@ -1127,3 +1383,155 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+#ifdef CONFIG_NET
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif
+ default:
+ return cgroup_base_func_proto(func_id, prog);
+ }
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sockopt))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_GETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optname):
+ /* fallthrough */
+ case offsetof(struct bpf_sockopt, level):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_SETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optlen):
+ return size == size_default;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct bpf_sockopt, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+ default:
+ if (size != size_default)
+ return false;
+ break;
+ }
+ return true;
+}
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
+ T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sockopt, sk):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ break;
+ case offsetof(struct bpf_sockopt, level):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ break;
+ case offsetof(struct bpf_sockopt, optname):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ break;
+ case offsetof(struct bpf_sockopt, optlen):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+ bool direct_write,
+ const struct bpf_prog *prog)
+{
+ /* Nothing to do for sockopt argument. The data is kzalloc'ated.
+ */
+ return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+ .get_func_proto = cg_sockopt_func_proto,
+ .is_valid_access = cg_sockopt_is_valid_access,
+ .convert_ctx_access = cg_sockopt_convert_ctx_access,
+ .gen_prologue = cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
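
Taken together, the additions above wire up two new attach points, BPF_CGROUP_SETSOCKOPT and BPF_CGROUP_GETSOCKOPT, whose programs see a struct bpf_sockopt context in which optval/optval_end behave like packet pointers and retval may only be forced to 0. A sketch of a program for the getsockopt hook (the SEC() name follows the usual libbpf convention and is an assumption, as is the specific byte manipulation):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/getsockopt")
int demo_getsockopt(struct bpf_sockopt *ctx)
{
	__u8 *optval = ctx->optval;
	__u8 *optval_end = ctx->optval_end;

	/* optval behaves like a packet pointer: bounds-check before dereferencing */
	if (optval + 1 > optval_end)
		return 1;		/* pass the kernel's result through untouched */

	optval[0] ^= 1;			/* example tweak of the value the kernel returned */
	ctx->retval = 0;		/* programs may only force retval to 0 */
	return 1;			/* non-zero allows; returning 0 yields -EPERM */
}

char _license[] SEC("license") = "GPL";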
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 080e2bb644cc..16079550db6d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1364,10 +1364,10 @@ select_insn:
insn++;
CONT;
ALU_ARSH_X:
- DST = (u64) (u32) ((*(s32 *) &DST) >> SRC);
+ DST = (u64) (u32) (((s32) DST) >> SRC);
CONT;
ALU_ARSH_K:
- DST = (u64) (u32) ((*(s32 *) &DST) >> IMM);
+ DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
@@ -1791,38 +1791,42 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
return &empty_prog_array.hdr;
}
-void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs ||
- progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ if (!progs || progs == &empty_prog_array.hdr)
return;
kfree_rcu(progs, rcu);
}
-int bpf_prog_array_length(struct bpf_prog_array __rcu *array)
+int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
u32 cnt = 0;
- rcu_read_lock();
- item = rcu_dereference(array)->items;
- for (; item->prog; item++)
+ for (item = array->items; item->prog; item++)
if (item->prog != &dummy_bpf_prog.prog)
cnt++;
- rcu_read_unlock();
return cnt;
}
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
+{
+ struct bpf_prog_array_item *item;
+
+ for (item = array->items; item->prog; item++)
+ if (item->prog != &dummy_bpf_prog.prog)
+ return false;
+ return true;
+}
-static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
+static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
u32 *prog_ids,
u32 request_cnt)
{
struct bpf_prog_array_item *item;
int i = 0;
- item = rcu_dereference_check(array, 1)->items;
- for (; item->prog; item++) {
+ for (item = array->items; item->prog; item++) {
if (item->prog == &dummy_bpf_prog.prog)
continue;
prog_ids[i] = item->prog->aux->id;
@@ -1835,7 +1839,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
return !!(item->prog);
}
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
__u32 __user *prog_ids, u32 cnt)
{
unsigned long err = 0;
@@ -1846,18 +1850,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
* cnt = bpf_prog_array_length();
* if (cnt > 0)
* bpf_prog_array_copy_to_user(..., cnt);
- * so below kcalloc doesn't need extra cnt > 0 check, but
- * bpf_prog_array_length() releases rcu lock and
- * prog array could have been swapped with empty or larger array,
- * so always copy 'cnt' prog_ids to the user.
- * In a rare race the user will see zero prog_ids
+ * so below kcalloc doesn't need extra cnt > 0 check.
*/
ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
if (!ids)
return -ENOMEM;
- rcu_read_lock();
nospc = bpf_prog_array_copy_core(array, ids, cnt);
- rcu_read_unlock();
err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
kfree(ids);
if (err)
@@ -1867,19 +1865,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
return 0;
}
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array,
+void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
struct bpf_prog *old_prog)
{
- struct bpf_prog_array_item *item = array->items;
+ struct bpf_prog_array_item *item;
- for (; item->prog; item++)
+ for (item = array->items; item->prog; item++)
if (item->prog == old_prog) {
WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
break;
}
}
-int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
struct bpf_prog_array **new_array)
@@ -1943,7 +1941,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
return 0;
}
-int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_info(struct bpf_prog_array *array,
u32 *prog_ids, u32 request_cnt,
u32 *prog_cnt)
{
@@ -2086,6 +2084,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
return false;
}
+/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
+ * analysis code and wants explicit zero extension inserted by verifier.
+ * Otherwise, return FALSE.
+ */
+bool __weak bpf_jit_needs_zext(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
@@ -2103,3 +2110,4 @@ EXPORT_SYMBOL(bpf_stats_enabled_key);
#include <linux/bpf_trace.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
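
The ALU_ARSH change in core.c swaps the pointer cast *(s32 *)&DST for a plain value cast (s32)DST. The pointer form reads whichever 32-bit half of the 64-bit register sits first in memory, which is the upper half on big-endian hosts, while the value cast always takes the low 32 bits regardless of endianness (the kernel relies, as elsewhere in the interpreter, on >> of a negative signed value being an arithmetic shift). The intended semantics, worked through in ordinary C outside the interpreter:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dst = 0xfffffffff0000000ULL;	/* low 32 bits: 0xf0000000, negative as s32 */
	uint32_t src = 4;

	/* BPF_ALU | BPF_ARSH: arithmetic shift of the low 32 bits, result zero-extended */
	dst = (uint64_t)(uint32_t)(((int32_t)dst) >> src);

	printf("%016llx\n", (unsigned long long)dst);	/* prints 00000000ff000000 */
	return 0;
}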
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8ebd0fa826f8..ef49e17ae47c 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -32,14 +32,19 @@
/* General idea: XDP packets getting XDP redirected to another CPU,
* will maximum be stored/queued for one driver ->poll() call. It is
- * guaranteed that setting flush bit and flush operation happen on
+ * guaranteed that queueing the frame and the flush operation happen on
* same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
* which queue in bpf_cpu_map_entry contains packets.
*/
#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct bpf_cpu_map_entry;
+struct bpf_cpu_map;
+
struct xdp_bulk_queue {
void *q[CPU_MAP_BULK_SIZE];
+ struct list_head flush_node;
+ struct bpf_cpu_map_entry *obj;
unsigned int count;
};
@@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
+ struct bpf_cpu_map *cmap;
+
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
@@ -65,23 +72,17 @@ struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
struct bpf_cpu_map_entry **cpu_map;
- unsigned long __percpu *flush_needed;
+ struct list_head __percpu *flush_list;
};
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
- struct xdp_bulk_queue *bq, bool in_napi_ctx);
-
-static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
-{
- return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
-}
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
+ int ret, cpu;
u64 cost;
- int ret;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -105,23 +106,21 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
- cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_cmap;
- cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ cost += sizeof(struct list_head) * num_possible_cpus();
/* Notice returns -EPERM on if map size is larger than memlock limit */
- ret = bpf_map_precharge_memlock(cmap->map.pages);
+ ret = bpf_map_charge_init(&cmap->map.memory, cost);
if (ret) {
err = ret;
goto free_cmap;
}
- /* A per cpu bitfield with a bit per possible CPU in map */
- cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
- __alignof__(unsigned long));
- if (!cmap->flush_needed)
- goto free_cmap;
+ cmap->flush_list = alloc_percpu(struct list_head);
+ if (!cmap->flush_list)
+ goto free_charge;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
@@ -132,7 +131,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return &cmap->map;
free_percpu:
- free_percpu(cmap->flush_needed);
+ free_percpu(cmap->flush_list);
+free_charge:
+ bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
kfree(cmap);
return ERR_PTR(err);
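
cpu_map_alloc() now sizes the charge for a per-cpu list_head instead of a per-cpu bitmap and initializes one flush list per possible CPU; entries queue themselves on the local list when they have pending frames, and the flush at the end of the NAPI poll walks that same list. A trimmed sketch of that per-cpu flush-list setup (demo_* names are illustrative):

struct demo_map {
	struct list_head __percpu *flush_list;
};

static int demo_map_init(struct demo_map *m)
{
	int cpu;

	m->flush_list = alloc_percpu(struct list_head);
	if (!m->flush_list)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
	return 0;
}

/* Enqueue runs in driver/NAPI context with preemption disabled, so
 * this_cpu_ptr() picks the list that the later flush will walk.
 * flush_node must have been initialized with INIT_LIST_HEAD beforehand.
 */
static void demo_enqueue(struct demo_map *m, struct list_head *flush_node)
{
	struct list_head *flush_list = this_cpu_ptr(m->flush_list);

	if (list_empty(flush_node))	/* not yet queued for flushing */
		list_add(flush_node, flush_list);
}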
@@ -209,6 +210,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* - RX ring dev queue index (skb_record_rx_queue)
*/
+ /* Until page_pool get SKB return path, release DMA here */
+ xdp_release_frame(xdpf);
+
/* Allow SKB to reuse area used by xdp_frame */
xdp_scrub_frame(xdpf);
@@ -332,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
- int numa, err;
+ struct xdp_bulk_queue *bq;
+ int numa, err, i;
/* Have map->numa_node, but choose node of redirect target CPU */
numa = cpu_to_node(cpu);
@@ -347,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
if (!rcpu->bulkq)
goto free_rcu;
+ for_each_possible_cpu(i) {
+ bq = per_cpu_ptr(rcpu->bulkq, i);
+ bq->obj = rcpu;
+ }
+
/* Alloc queue */
rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
if (!rcpu->queue)
@@ -403,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
/* No concurrent bq_enqueue can run at this point */
- bq_flush_to_queue(rcpu, bq, false);
+ bq_flush_to_queue(bq, false);
}
free_percpu(rcpu->bulkq);
/* Cannot kthread_stop() here, last put free rcpu resources */
@@ -486,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
if (!rcpu)
return -ENOMEM;
+ rcpu->cmap = cmap;
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -512,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map)
synchronize_rcu();
/* To ensure all pending flush operations have completed wait for flush
- * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
- * Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new bits will be set.
+ * list be empty on _all_ cpus. Because the above synchronize_rcu()
+ * ensures the map is disconnected from the program we can assume no new
+ * items will be added to the list.
*/
for_each_online_cpu(cpu) {
- unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+ struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
- while (!bitmap_empty(bitmap, cmap->map.max_entries))
+ while (!list_empty(flush_list))
cond_resched();
}
@@ -536,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map)
/* bq flush and cleanup happens after RCU graze-period */
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
- free_percpu(cmap->flush_needed);
+ free_percpu(cmap->flush_list);
bpf_map_area_free(cmap->cpu_map);
kfr