From d8ef4b38cb69d907f9b0e889c44d05fc0f890977 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Apr 2020 14:55:35 -0400 Subject: Revert "cgroup: Add memory barriers to plug cgroup_rstat_updated() race window" This reverts commit 9a9e97b2f1f2 ("cgroup: Add memory barriers to plug cgroup_rstat_updated() race window"). The commit was added in anticipation of memcg rstat conversion which needed synchronous accounting for the event counters (e.g. oom kill count). However, the conversion didn't get merged due to percpu memory overhead concern which couldn't be addressed at the time. Unfortunately, the patch's addition of smp_mb() to cgroup_rstat_updated() meant that every scheduling event now had to go through an additional full barrier and Mel Gorman noticed it as 1% regression in netperf UDP_STREAM test. There's no need to have this barrier in tree now and even if we need synchronous accounting in the future, the right thing to do is separating that out to a separate function so that hot paths which don't care about synchronous behavior don't have to pay the overhead of the full barrier. Let's revert. Signed-off-by: Tejun Heo Reported-by: Mel Gorman Link: http://lkml.kernel.org/r/20200409154413.GK3818@techsingularity.net Cc: v4.18+ --- kernel/cgroup/rstat.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 6f87352f8219..41ca996568df 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -33,12 +33,9 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) return; /* - * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we - * see NULL updated_next or they see our updated stat. - */ - smp_mb(); - - /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * * Because @parent's updated_children is terminated with @parent * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. @@ -134,13 +131,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, *nextp = rstatc->updated_next; rstatc->updated_next = NULL; - /* - * Paired with the one in cgroup_rstat_cpu_updated(). - * Either they see NULL updated_next or we see their - * updated stat. - */ - smp_mb(); - return pos; } -- cgit v1.2.3 From 18f855e574d9799a0e7489f8ae6fd8447d0dd74a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 May 2020 09:38:31 -0600 Subject: sched/fair: Don't NUMA balance for kthreads Stefano reported a crash with using SQPOLL with io_uring: BUG: kernel NULL pointer dereference, address: 00000000000003b0 CPU: 2 PID: 1307 Comm: io_uring-sq Not tainted 5.7.0-rc7 #11 RIP: 0010:task_numa_work+0x4f/0x2c0 Call Trace: task_work_run+0x68/0xa0 io_sq_thread+0x252/0x3d0 kthread+0xf9/0x130 ret_from_fork+0x35/0x40 which is task_numa_work() oopsing on current->mm being NULL. The task work is queued by task_tick_numa(), which checks if current->mm is NULL at the time of the call. But this state isn't necessarily persistent, if the kthread is using use_mm() to temporarily adopt the mm of a task. Change the task_tick_numa() check to exclude kernel threads in general, as it doesn't make sense to attempt ot balance for kthreads anyway. Reported-by: Stefano Garzarella Signed-off-by: Jens Axboe Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/865de121-8190-5d30-ece5-3b097dc74431@kernel.dk --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 538ba5d94e99..da3e5b54715b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2908,7 +2908,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* -- cgit v1.2.3 From 18644cec714aabbab173729192c7594ee57a406b Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 28 May 2020 21:38:36 -0700 Subject: bpf: Fix use-after-free in fmod_ret check Fix the following issue: [ 436.749342] BUG: KASAN: use-after-free in bpf_trampoline_put+0x39/0x2a0 [ 436.749995] Write of size 4 at addr ffff8881ef38b8a0 by task kworker/3:5/2243 [ 436.750712] [ 436.752677] Workqueue: events bpf_prog_free_deferred [ 436.753183] Call Trace: [ 436.756483] bpf_trampoline_put+0x39/0x2a0 [ 436.756904] bpf_prog_free_deferred+0x16d/0x3d0 [ 436.757377] process_one_work+0x94a/0x15b0 [ 436.761969] [ 436.762130] Allocated by task 2529: [ 436.763323] bpf_trampoline_lookup+0x136/0x540 [ 436.763776] bpf_check+0x2872/0xa0a8 [ 436.764144] bpf_prog_load+0xb6f/0x1350 [ 436.764539] __do_sys_bpf+0x16d7/0x3720 [ 436.765825] [ 436.765988] Freed by task 2529: [ 436.767084] kfree+0xc6/0x280 [ 436.767397] bpf_trampoline_put+0x1fd/0x2a0 [ 436.767826] bpf_check+0x6832/0xa0a8 [ 436.768197] bpf_prog_load+0xb6f/0x1350 [ 436.768594] __do_sys_bpf+0x16d7/0x3720 prog->aux->trampoline = tr should be set only when prog is valid. Otherwise prog freeing will try to put trampoline via prog->aux->trampoline, but it may not point to a valid trampoline. Fixes: 6ba43b761c41 ("bpf: Attachment verification for BPF_MODIFY_RETURN") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200529043839.15824-2-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8d7ee40e2748..bfea3f9a972f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10428,22 +10428,13 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env) } #define SECURITY_PREFIX "security_" -static int check_attach_modify_return(struct bpf_verifier_env *env) +static int check_attach_modify_return(struct bpf_prog *prog, unsigned long addr) { - struct bpf_prog *prog = env->prog; - unsigned long addr = (unsigned long) prog->aux->trampoline->func.addr; - - /* This is expected to be cleaned up in the future with the KRSI effort - * introducing the LSM_HOOK macro for cleaning up lsm_hooks.h. - */ if (within_error_injection_list(addr) || !strncmp(SECURITY_PREFIX, prog->aux->attach_func_name, sizeof(SECURITY_PREFIX) - 1)) return 0; - verbose(env, "fmod_ret attach_btf_id %u (%s) is not modifiable\n", - prog->aux->attach_btf_id, prog->aux->attach_func_name); - return -EINVAL; } @@ -10654,11 +10645,18 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) goto out; } } + + if (prog->expected_attach_type == BPF_MODIFY_RETURN) { + ret = check_attach_modify_return(prog, addr); + if (ret) + verbose(env, "%s() is not modifiable\n", + prog->aux->attach_func_name); + } + + if (ret) + goto out; tr->func.addr = (void *)addr; prog->aux->trampoline = tr; - - if (prog->expected_attach_type == BPF_MODIFY_RETURN) - ret = check_attach_modify_return(env); out: mutex_unlock(&tr->mutex); if (ret) -- cgit v1.2.3 From 3a71dc366d4aa51a22f385445f2862231d3fda3b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 10:28:40 -0700 Subject: bpf: Fix a verifier issue when assigning 32bit reg states to 64bit ones With the latest trunk llvm (llvm 11), I hit a verifier issue for test_prog subtest test_verif_scale1. The following simplified example illustrate the issue: w9 = 0 /* R9_w=inv0 */ r8 = *(u32 *)(r1 + 80) /* __sk_buff->data_end */ r7 = *(u32 *)(r1 + 76) /* __sk_buff->data */ ...... w2 = w9 /* R2_w=inv0 */ r6 = r7 /* R6_w=pkt(id=0,off=0,r=0,imm=0) */ r6 += r2 /* R6_w=inv(id=0) */ r3 = r6 /* R3_w=inv(id=0) */ r3 += 14 /* R3_w=inv(id=0) */ if r3 > r8 goto end r5 = *(u32 *)(r6 + 0) /* R6_w=inv(id=0) */ <== error here: R6 invalid mem access 'inv' ... end: In real test_verif_scale1 code, "w9 = 0" and "w2 = w9" are in different basic blocks. In the above, after "r6 += r2", r6 becomes a scalar, which eventually caused the memory access error. The correct register state should be a pkt pointer. The inprecise register state starts at "w2 = w9". The 32bit register w9 is 0, in __reg_assign_32_into_64(), the 64bit reg->smax_value is assigned to be U32_MAX. The 64bit reg->smin_value is 0 and the 64bit register itself remains constant based on reg->var_off. In adjust_ptr_min_max_vals(), the verifier checks for a known constant, smin_val must be equal to smax_val. Since they are not equal, the verifier decides r6 is a unknown scalar, which caused later failure. The llvm10 does not have this issue as it generates different code: w9 = 0 /* R9_w=inv0 */ r8 = *(u32 *)(r1 + 80) /* __sk_buff->data_end */ r7 = *(u32 *)(r1 + 76) /* __sk_buff->data */ ...... r6 = r7 /* R6_w=pkt(id=0,off=0,r=0,imm=0) */ r6 += r9 /* R6_w=pkt(id=0,off=0,r=0,imm=0) */ r3 = r6 /* R3_w=pkt(id=0,off=0,r=0,imm=0) */ r3 += 14 /* R3_w=pkt(id=0,off=14,r=0,imm=0) */ if r3 > r8 goto end ... To fix the above issue, we can include zero in the test condition for assigning the s32_max_value and s32_min_value to their 64-bit equivalents smax_value and smin_value. Further, fix the condition to avoid doing zero extension bounds checks when s32_min_value <= 0. This could allow for the case where bounds 32-bit bounds (-1,1) get incorrectly translated to (0,1) 64-bit bounds. When in-fact the -1 min value needs to force U32_MAX bound. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159077331983.6014.5758956193749002737.stgit@john-Precision-5820-Tower --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bfea3f9a972f..efe14cf24bc6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1168,14 +1168,14 @@ static void __reg_assign_32_into_64(struct bpf_reg_state *reg) * but must be positive otherwise set to worse case bounds * and refine later from tnum. */ - if (reg->s32_min_value > 0) - reg->smin_value = reg->s32_min_value; - else - reg->smin_value = 0; - if (reg->s32_max_value > 0) + if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0) reg->smax_value = reg->s32_max_value; else reg->smax_value = U32_MAX; + if (reg->s32_min_value >= 0) + reg->smin_value = reg->s32_min_value; + else + reg->smin_value = 0; } static void __reg_combine_32_into_64(struct bpf_reg_state *reg) -- cgit v1.2.3