summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2019-11-24 22:05:47 -0800
committerDavid S. Miller <davem@davemloft.net>2019-11-24 22:05:47 -0800
commit4eb47198e955fc01d14113f525b62a4bc95554c7 (patch)
tree1a3ed7a7ad916ca878d130491241123f76e50ac9
parent5f04ed74a8a3bcb2122738b7eda857187e85aa34 (diff)
parentb553a6ec570044fc1ae300c6fb24f9ce204c5894 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says: ==================== pull-request: bpf-next 2019-11-24 The following pull-request contains BPF updates for your *net-next* tree. We've added 27 non-merge commits during the last 4 day(s) which contain a total of 50 files changed, 2031 insertions(+), 548 deletions(-). The main changes are: 1) Optimize bpf_tail_call() from retpoline-ed indirect jump to direct jump, from Daniel. 2) Support global variables in libbpf, from Andrii. 3) Cleanup selftests with BPF_TRACE_x() macro, from Martin. 4) Fix devmap hash, from Toke. 5) Fix register bounds after 32-bit conditional jumps, from Yonghong. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--arch/x86/net/bpf_jit_comp.c229
-rw-r--r--include/linux/bpf.h60
-rw-r--r--include/linux/bpf_verifier.h3
-rw-r--r--include/linux/filter.h15
-rw-r--r--kernel/bpf/arraymap.c205
-rw-r--r--kernel/bpf/core.c73
-rw-r--r--kernel/bpf/devmap.c74
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/syscall.c56
-rw-r--r--kernel/bpf/trampoline.c8
-rw-r--r--kernel/bpf/verifier.c139
-rw-r--r--net/xdp/xsk.c8
-rw-r--r--tools/bpf/Makefile6
-rw-r--r--tools/bpf/bpftool/btf.c6
-rw-r--r--tools/lib/bpf/bpf_helpers.h13
-rw-r--r--tools/lib/bpf/libbpf.c294
-rw-r--r--tools/testing/selftests/bpf/Makefile2
-rw-r--r--tools/testing/selftests/bpf/bpf_trace_helpers.h58
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c4
-rw-r--r--tools/testing/selftests/bpf/prog_tests/core_reloc.c16
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tailcalls.c487
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_overhead.c142
-rw-r--r--tools/testing/selftests/bpf/progs/fentry_test.c72
-rw-r--r--tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c27
-rw-r--r--tools/testing/selftests/bpf/progs/fexit_test.c83
-rw-r--r--tools/testing/selftests/bpf/progs/kfree_skb.c43
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall1.c48
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall2.c59
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall3.c31
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall4.c33
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall5.c40
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_existence.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_ints.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c8
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_misc.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_mods.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_size.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_overhead.c39
-rwxr-xr-xtools/testing/selftests/bpf/test_bpftool_build.sh30
-rw-r--r--tools/testing/selftests/bpf/test_progs.c18
-rw-r--r--tools/testing/selftests/bpf/test_progs.h10
-rw-r--r--tools/testing/selftests/bpf/test_stub.c4
-rw-r--r--tools/testing/selftests/bpf/verifier/jmp32.c83
50 files changed, 2031 insertions, 548 deletions
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2e586f579945..b8be18427277 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -203,8 +203,9 @@ struct jit_context {
/* Maximum number of bytes emitted while JITing one eBPF insn */
#define BPF_MAX_INSN_SIZE 128
#define BPF_INSN_SAFETY 64
-/* number of bytes emit_call() needs to generate call instruction */
-#define X86_CALL_SIZE 5
+
+/* Number of bytes emit_patch() needs to generate instructions */
+#define X86_PATCH_SIZE 5
#define PROLOGUE_SIZE 25
@@ -215,7 +216,7 @@ struct jit_context {
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
{
u8 *prog = *pprog;
- int cnt = X86_CALL_SIZE;
+ int cnt = X86_PATCH_SIZE;
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
@@ -238,6 +239,89 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
*pprog = prog;
}
+static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ s64 offset;
+
+ offset = func - (ip + X86_PATCH_SIZE);
+ if (!is_simm32(offset)) {
+ pr_err("Target call %p is out of range\n", func);
+ return -ERANGE;
+ }
+ EMIT1_off32(opcode, offset);
+ *pprog = prog;
+ return 0;
+}
+
+static int emit_call(u8 **pprog, void *func, void *ip)
+{
+ return emit_patch(pprog, func, ip, 0xE8);
+}
+
+static int emit_jump(u8 **pprog, void *func, void *ip)
+{
+ return emit_patch(pprog, func, ip, 0xE9);
+}
+
+static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+ void *old_addr, void *new_addr,
+ const bool text_live)
+{
+ const u8 *nop_insn = ideal_nops[NOP_ATOMIC5];
+ u8 old_insn[X86_PATCH_SIZE];
+ u8 new_insn[X86_PATCH_SIZE];
+ u8 *prog;
+ int ret;
+
+ memcpy(old_insn, nop_insn, X86_PATCH_SIZE);
+ if (old_addr) {
+ prog = old_insn;
+ ret = t == BPF_MOD_CALL ?
+ emit_call(&prog, old_addr, ip) :
+ emit_jump(&prog, old_addr, ip);
+ if (ret)
+ return ret;
+ }
+
+ memcpy(new_insn, nop_insn, X86_PATCH_SIZE);
+ if (new_addr) {
+ prog = new_insn;
+ ret = t == BPF_MOD_CALL ?
+ emit_call(&prog, new_addr, ip) :
+ emit_jump(&prog, new_addr, ip);
+ if (ret)
+ return ret;
+ }
+
+ ret = -EBUSY;
+ mutex_lock(&text_mutex);
+ if (memcmp(ip, old_insn, X86_PATCH_SIZE))
+ goto out;
+ if (memcmp(ip, new_insn, X86_PATCH_SIZE)) {
+ if (text_live)
+ text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL);
+ else
+ memcpy(ip, new_insn, X86_PATCH_SIZE);
+ }
+ ret = 0;
+out:
+ mutex_unlock(&text_mutex);
+ return ret;
+}
+
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
+ void *old_addr, void *new_addr)
+{
+ if (!is_kernel_text((long)ip) &&
+ !is_bpf_text_address((long)ip))
+ /* BPF poking in modules is not supported */
+ return -EINVAL;
+
+ return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true);
+}
+
/*
* Generate the following code:
*
@@ -252,7 +336,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
* goto *(prog->bpf_func + prologue_size);
* out:
*/
-static void emit_bpf_tail_call(u8 **pprog)
+static void emit_bpf_tail_call_indirect(u8 **pprog)
{
u8 *prog = *pprog;
int label1, label2, label3;
@@ -319,6 +403,68 @@ static void emit_bpf_tail_call(u8 **pprog)
*pprog = prog;
}
+static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
+ u8 **pprog, int addr, u8 *image)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ /*
+ * if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
+ */
+ EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */
+ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
+ EMIT2(X86_JA, 14); /* ja out */
+ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
+ EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */
+
+ poke->ip = image + (addr - X86_PATCH_SIZE);
+ poke->adj_off = PROLOGUE_SIZE;
+
+ memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
+ prog += X86_PATCH_SIZE;
+ /* out: */
+
+ *pprog = prog;
+}
+
+static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
+{
+ struct bpf_jit_poke_descriptor *poke;
+ struct bpf_array *array;
+ struct bpf_prog *target;
+ int i, ret;
+
+ for (i = 0; i < prog->aux->size_poke_tab; i++) {
+ poke = &prog->aux->poke_tab[i];
+ WARN_ON_ONCE(READ_ONCE(poke->ip_stable));
+
+ if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+ continue;
+
+ array = container_of(poke->tail_call.map, struct bpf_array, map);
+ mutex_lock(&array->aux->poke_mutex);
+ target = array->ptrs[poke->tail_call.key];
+ if (target) {
+ /* Plain memcpy is used when image is not live yet
+ * and still not locked as read-only. Once poke
+ * location is active (poke->ip_stable), any parallel
+ * bpf_arch_text_poke() might occur still on the
+ * read-write image until we finally locked it as
+ * read-only. Both modifications on the given image
+ * are under text_mutex to avoid interference.
+ */
+ ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL,
+ (u8 *)target->bpf_func +
+ poke->adj_off, false);
+ BUG_ON(ret < 0);
+ }
+ WRITE_ONCE(poke->ip_stable, true);
+ mutex_unlock(&array->aux->poke_mutex);
+ }
+}
+
static void emit_mov_imm32(u8 **pprog, bool sign_propagate,
u32 dst_reg, const u32 imm32)
{
@@ -480,72 +626,6 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
*pprog = prog;
}
-static int emit_call(u8 **pprog, void *func, void *ip)
-{
- u8 *prog = *pprog;
- int cnt = 0;
- s64 offset;
-
- offset = func - (ip + X86_CALL_SIZE);
- if (!is_simm32(offset)) {
- pr_err("Target call %p is out of range\n", func);
- return -EINVAL;
- }
- EMIT1_off32(0xE8, offset);
- *pprog = prog;
- return 0;
-}
-
-int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
- void *old_addr, void *new_addr)
-{
- u8 old_insn[X86_CALL_SIZE] = {};
- u8 new_insn[X86_CALL_SIZE] = {};
- u8 *prog;
- int ret;
-
- if (!is_kernel_text((long)ip) &&
- !is_bpf_text_address((long)ip))
- /* BPF trampoline in modules is not supported */
- return -EINVAL;
-
- if (old_addr) {
- prog = old_insn;
- ret = emit_call(&prog, old_addr, (void *)ip);
- if (ret)
- return ret;
- }
- if (new_addr) {
- prog = new_insn;
- ret = emit_call(&prog, new_addr, (void *)ip);
- if (ret)
- return ret;
- }
- ret = -EBUSY;
- mutex_lock(&text_mutex);
- switch (t) {
- case BPF_MOD_NOP_TO_CALL:
- if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
- goto out;
- text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
- break;
- case BPF_MOD_CALL_TO_CALL:
- if (memcmp(ip, old_insn, X86_CALL_SIZE))
- goto out;
- text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
- break;
- case BPF_MOD_CALL_TO_NOP:
- if (memcmp(ip, old_insn, X86_CALL_SIZE))
- goto out;
- text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
- break;
- }
- ret = 0;
-out:
- mutex_unlock(&text_mutex);
- return ret;
-}
-
static bool ex_handler_bpf(const struct exception_table_entry *x,
struct pt_regs *regs, int trapnr,
unsigned long error_code, unsigned long fault_addr)
@@ -1013,7 +1093,11 @@ xadd: if (is_imm8(insn->off))
break;
case BPF_JMP | BPF_TAIL_CALL:
- emit_bpf_tail_call(&prog);
+ if (imm32)
+ emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
+ &prog, addrs[i], image);
+ else
+ emit_bpf_tail_call_indirect(&prog);
break;
/* cond jump */
@@ -1394,7 +1478,7 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
*/
- orig_call += X86_CALL_SIZE;
+ orig_call += X86_PATCH_SIZE;
prog = image;
@@ -1571,6 +1655,7 @@ out_image:
if (image) {
if (!prog->is_func || extra_pass) {
+ bpf_tail_call_direct_fixup(prog);
bpf_jit_binary_lock_ro(header);
} else {
jit_data->addrs = addrs;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e89e86122233..35903f148be5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -22,6 +22,7 @@ struct bpf_verifier_env;
struct bpf_verifier_log;
struct perf_event;
struct bpf_prog;
+struct bpf_prog_aux;
struct bpf_map;
struct sock;
struct seq_file;
@@ -64,6 +65,12 @@ struct bpf_map_ops {
const struct btf_type *key_type,
const struct btf_type *value_type);
+ /* Prog poke tracking helpers. */
+ int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
+ void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
+ void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
+ struct bpf_prog *new);
+
/* Direct value access helpers. */
int (*map_direct_value_addr)(const struct bpf_map *map,
u64 *imm, u32 off);
@@ -488,6 +495,24 @@ struct bpf_func_info_aux {
bool unreliable;
};
+enum bpf_jit_poke_reason {
+ BPF_POKE_REASON_TAIL_CALL,
+};
+
+/* Descriptor of pokes pointing /into/ the JITed image. */
+struct bpf_jit_poke_descriptor {
+ void *ip;
+ union {
+ struct {
+ struct bpf_map *map;
+ u32 key;
+ } tail_call;
+ };
+ bool ip_stable;
+ u8 adj_off;
+ u16 reason;
+};
+
struct bpf_prog_aux {
atomic64_t refcnt;
u32 used_map_cnt;
@@ -513,6 +538,8 @@ struct bpf_prog_aux {
const char *attach_func_name;
struct bpf_prog **func;
void *jit_data; /* JIT specific data. arch dependent */
+ struct bpf_jit_poke_descriptor *poke_tab;
+ u32 size_poke_tab;
struct latch_tree_node ksym_tnode;
struct list_head ksym_lnode;
const struct bpf_prog_ops *ops;
@@ -560,17 +587,26 @@ struct bpf_prog_aux {
};
};
+struct bpf_array_aux {
+ /* 'Ownership' of prog array is claimed by the first program that
+ * is going to use this map or by the first program which FD is
+ * stored in the map to make sure that all callers and callees have
+ * the same prog type and JITed flag.
+ */
+ enum bpf_prog_type type;
+ bool jited;
+ /* Programs with direct jumps into programs part of this array. */
+ struct list_head poke_progs;
+ struct bpf_map *map;
+ struct mutex poke_mutex;
+ struct work_struct work;
+};
+
struct bpf_array {
struct bpf_map map;
u32 elem_size;
u32 index_mask;
- /* 'ownership' of prog_array is claimed by the first program that
- * is going to use this map or by the first program which FD is stored
- * in the map to make sure that all callers and callees have the same
- * prog_type and JITed flag
- */
- enum bpf_prog_type owner_prog_type;
- bool owner_jited;
+ struct bpf_array_aux *aux;
union {
char value[0] __aligned(8);
void *ptrs[0] __aligned(8);
@@ -1031,6 +1067,10 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
{
return -ENOTSUPP;
}
+
+static inline void bpf_map_put(struct bpf_map *map)
+{
+}
#endif /* CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
@@ -1284,10 +1324,10 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
#endif /* CONFIG_INET */
enum bpf_text_poke_type {
- BPF_MOD_NOP_TO_CALL,
- BPF_MOD_CALL_TO_CALL,
- BPF_MOD_CALL_TO_NOP,
+ BPF_MOD_CALL,
+ BPF_MOD_JUMP,
};
+
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *addr1, void *addr2);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index cdd08bf0ec06..26e40de9ef55 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -293,7 +293,7 @@ struct bpf_verifier_state_list {
struct bpf_insn_aux_data {
union {
enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
- unsigned long map_state; /* pointer/poison value for maps */
+ unsigned long map_ptr_state; /* pointer/poison value for maps */
s32 call_imm; /* saved imm field of call insn */
u32 alu_limit; /* limit for add/sub register with pointer */
struct {
@@ -301,6 +301,7 @@ struct bpf_insn_aux_data {
u32 map_off; /* offset from value base address */
};
};
+ u64 map_key_state; /* constant (32 bit) key tracking for maps */
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
int sanitize_stack_off; /* stack slot to be cleared */
bool seen; /* this insn was processed by the verifier */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index ad80e9c6111c..1b1e8b8f88da 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -952,6 +952,9 @@ void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);
+int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+ struct bpf_jit_poke_descriptor *poke);
+
int bpf_jit_get_func_addr(const struct bpf_prog *prog,
const struct bpf_insn *insn, bool extra_pass,
u64 *func_addr, bool *func_addr_fixed);
@@ -1050,11 +1053,23 @@ static inline bool ebpf_jit_enabled(void)
return false;
}
+static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
+{
+ return false;
+}
+
static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
return false;
}
+static inline int
+bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
+ struct bpf_jit_poke_descriptor *poke)
+{
+ return -ENOTSUPP;
+}
+
static inline void bpf_jit_free(struct bpf_prog *fp)
{
bpf_prog_unlock_free(fp);
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633c8c701ff6..f0d19bbb9211 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);
- old_ptr = xchg(array->ptrs + index, new_ptr);
+ if (map->ops->map_poke_run) {
+ mutex_lock(&array->aux->poke_mutex);
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ map->ops->map_poke_run(map, index, old_ptr, new_ptr);
+ mutex_unlock(&array->aux->poke_mutex);
+ } else {
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ }
+
if (old_ptr)
map->ops->map_fd_put_ptr(old_ptr);
-
return 0;
}
@@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
if (index >= array->map.max_entries)
return -E2BIG;
- old_ptr = xchg(array->ptrs + index, NULL);
+ if (map->ops->map_poke_run) {
+ mutex_lock(&array->aux->poke_mutex);
+ old_ptr = xchg(array->ptrs + index, NULL);
+ map->ops->map_poke_run(map, index, old_ptr, NULL);
+ mutex_unlock(&array->aux->poke_mutex);
+ } else {
+ old_ptr = xchg(array->ptrs + index, NULL);
+ }
+
if (old_ptr) {
map->ops->map_fd_put_ptr(old_ptr);
return 0;
@@ -671,17 +686,195 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+struct prog_poke_elem {
+ struct list_head list;
+ struct bpf_prog_aux *aux;
+};
+
+static int prog_array_map_poke_track(struct bpf_map *map,
+ struct bpf_prog_aux *prog_aux)
+{
+ struct prog_poke_elem *elem;
+ struct bpf_array_aux *aux;
+ int ret = 0;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ mutex_lock(&aux->poke_mutex);
+ list_for_each_entry(elem, &aux->poke_progs, list) {
+ if (elem->aux == prog_aux)
+ goto out;
+ }
+
+ elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+ if (!elem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&elem->list);
+ /* We must track the program's aux info at this point in time
+ * since the program pointer itself may not be stable yet, see
+ * also comment in prog_array_map_poke_run().
+ */
+ elem->aux = prog_aux;
+
+ list_add_tail(&elem->list, &aux->poke_progs);
+out:
+ mutex_unlock(&aux->poke_mutex);
+ return ret;
+}
+
+static void prog_array_map_poke_untrack(struct bpf_map *map,
+ struct bpf_prog_aux *prog_aux)
+{
+ struct prog_poke_elem *elem, *tmp;
+ struct bpf_array_aux *aux;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ mutex_lock(&aux->poke_mutex);
+ list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+ if (elem->aux == prog_aux) {
+ list_del_init(&elem->list);
+ kfree(elem);
+ break;
+ }
+ }
+ mutex_unlock(&aux->poke_mutex);
+}
+
+static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
+ struct bpf_prog *old,
+ struct bpf_prog *new)
+{
+ struct prog_poke_elem *elem;
+ struct bpf_array_aux *aux;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));
+
+ list_for_each_entry(elem, &aux->poke_progs, list) {
+ struct bpf_jit_poke_descriptor *poke;
+ int i, ret;
+
+ for (i = 0; i < elem->aux->size_poke_tab; i++) {
+ poke = &elem->aux->poke_tab[i];
+
+ /* Few things to be aware of:
+ *
+ * 1) We can only ever access aux in this context, but
+ * not aux->prog since it might not be stable yet and
+ * there could be danger of use after free otherwise.
+ * 2) Initially when we start tracking aux, the program
+ * is not JITed yet and also does not have a kallsyms
+ * entry. We skip these as poke->ip_stable is not
+ * active yet. The JIT will do the final fixup before
+ * setting it stable. The various poke->ip_stable are
+ * successively activated, so tail call updates can
+ * arrive from here while JIT is still finishing its
+ * final fixup for non-activated poke entries.
+ * 3) On program teardown, the program's kallsym entry gets
+ * removed out of RCU callback, but we can only untrack
+ * from sleepable context, therefore bpf_arch_text_poke()
+ * might not see that this is in BPF text section and
+ * bails out with -EINVAL. As these are unreachable since
+ * RCU grace period already passed, we simply skip them.
+ * 4) Also programs reaching refcount of zero while patching
+ * is in progress is okay since we're protected under
+ * poke_mutex and untrack the programs before the JIT
+ * buffer is freed. When we're still in the middle of
+ * patching and suddenly kallsyms entry of the program
+ * gets evicted, we just skip the rest which is fine due
+ * to point 3).
+ * 5) Any other error happening below from bpf_arch_text_poke()
+ * is a unexpected bug.
+ */
+ if (!READ_ONCE(poke->ip_stable))
+ continue;
+ if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
+ continue;
+ if (poke->tail_call.map != map ||
+ poke->tail_call.key != key)
+ continue;
+
+ ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP,
+ old ? (u8 *)old->bpf_func +
+ poke->adj_off : NULL,
+ new ? (u8 *)new->bpf_func +
+ poke->adj_off : NULL);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+ }
+ }
+}
+
+static void prog_array_map_clear_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_array_aux,
+ work)->map;
+ bpf_fd_array_map_clear(map);
+ bpf_map_put(map);
+}
+
+static void prog_array_map_clear(struct bpf_map *map)
+{
+ struct bpf_array_aux *aux = container_of(map, struct bpf_array,
+ map)->aux;
+ bpf_map_inc(map);
+ schedule_work(&aux->work);
+}
+
+static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_array_aux *aux;
+ struct bpf_map *map;
+
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL);
+ if (!aux)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_WORK(&aux->work, prog_array_map_clear_deferred);
+ INIT_LIST_HEAD(&aux->poke_progs);
+ mutex_init(&aux->poke_mutex);
+
+ map = array_map_alloc(attr);
+ if (IS_ERR(map)) {
+ kfree(aux);
+ return map;
+ }
+
+ container_of(map, struct bpf_array, map)->aux = aux;
+ aux->map = map;
+
+ return map;
+}
+
+static void prog_array_map_free(struct bpf_map *map)
+{
+ struct prog_poke_elem *elem, *tmp;
+ struct bpf_array_aux *aux;
+
+ aux = container_of(map, struct bpf_array, map)->aux;
+ list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
+ list_del_init(&elem->list);
+ kfree(elem);
+ }
+ kfree(aux);
+ fd_array_map_free(map);
+}
+
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
- .map_alloc = array_map_alloc,
- .map_free = fd_array_map_free,
+ .map_alloc = prog_array_map_alloc,
+ .map_free = prog_array_map_free,
+ .map_poke_track = prog_array_map_poke_track,
+ .map_poke_untrack = prog_array_map_poke_untrack,
+ .map_poke_run = prog_array_map_poke_run,
.map_get_next_key = array_map_get_next