author     Linus Torvalds <torvalds@linux-foundation.org>  2015-11-03 18:03:50 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-11-03 18:03:50 -0800
commit     53528695ff6d8b77011bc818407c13e30914a946 (patch)
tree       04acd099c5759bf6f1d728c5415f574d572c6872
parent     b831ef2cad979912850e34f82415c0c5d59de8cb (diff)
parent     e73e85f0593832aa583b252f9a16cf90ed6d30fa (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"The main changes in this cycle were:
- sched/fair load tracking fixes and cleanups (Byungchul Park)
- Make load tracking frequency scale invariant (Dietmar Eggemann)
- sched/deadline updates (Juri Lelli)
- stop machine fixes, cleanups and enhancements for bugs triggered by
CPU hotplug stress testing (Oleg Nesterov)
- scheduler preemption code rework: remove PREEMPT_ACTIVE and related
cleanups (Peter Zijlstra)
- Rework the sched_info::run_delay code to fix races (Peter Zijlstra)
- Optimize per entity utilization tracking (Peter Zijlstra)
- ... misc other fixes, cleanups and smaller updates"
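
Among the deadline updates pulled here, the diff below adds a dl_time_before() helper (include/linux/sched/deadline.h, used by rt_mutex_waiter_less()) so that deadline comparisons stay correct across u64 clock wraparound. A self-contained userspace sketch of that pattern -- the main() driver and variable names are illustrative, only the comparison itself mirrors the kernel helper:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Wraparound-safe "a is earlier than b" for a free-running u64 clock. */
static bool dl_time_before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t before_wrap = UINT64_MAX - 5;  /* deadline just before the clock wraps */
        uint64_t after_wrap  = 10;              /* deadline just after the wrap */

        /* A naive "<" inverts the ordering across the wrap point: */
        printf("naive <        : %d\n", before_wrap < after_wrap ? 1 : 0);               /* 0 - wrong */
        printf("dl_time_before : %d\n", dl_time_before(before_wrap, after_wrap) ? 1 : 0); /* 1 - right */
        return 0;
}

Casting the u64 difference to a signed value keeps the ordering correct whenever the two timestamps are less than 2^63 apart -- the same trick time_before() uses for jiffies.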
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS
sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop()
sched: Start stopper early
stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark()
stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark()
stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled
stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works()
stop_machine: Ensure that a queued callback will be called before cpu_stop_park()
sched/x86: Fix typo in __switch_to() comments
sched/core: Remove a parameter in the migrate_task_rq() function
sched/core: Drop unlikely behind BUG_ON()
sched/core: Fix task and run queue sched_info::run_delay inconsistencies
sched/numa: Fix task_tick_fair() from disabling numa_balancing
sched/core: Add preempt_count invariant check
sched/core: More notrace annotations
sched/core: Kill PREEMPT_ACTIVE
sched/core, sched/x86: Kill thread_info::saved_preempt_count
sched/core: Simplify preempt_count tests
sched/core: Robustify preemption leak checks
sched/core: Stop setting PREEMPT_ACTIVE
...
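
The "Add preempt_count invariant check" commit listed above enforces that every task suspends inside __schedule() holding exactly two preemption levels: one from schedule()'s preempt_disable() and one standing in for rq->lock. A toy userspace model of that invariant (all names here are hypothetical; only the counting mirrors the WARN_ONCE() added to finish_task_switch() in the diff below):

#include <assert.h>
#include <stdio.h>

static int preempt_count;               /* stand-in for the per-task counter */

static void preempt_disable(void) { preempt_count++; }
static void preempt_enable(void)  { preempt_count--; }

/* Models the check added to finish_task_switch(): across a context
 * switch, preempt_count must be exactly 2. */
static void finish_task_switch_check(void)
{
        assert(preempt_count == 2);
        printf("invariant holds: preempt_count == %d\n", preempt_count);
}

int main(void)
{
        preempt_disable();              /* 1: schedule() disables preemption */
        preempt_disable();              /* 2: models raw_spin_lock_irq(&rq->lock) */
        finish_task_switch_check();     /* where the next task runs the check */
        preempt_enable();
        preempt_enable();
        return 0;
}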
 arch/x86/include/asm/preempt.h     |   5
 arch/x86/include/asm/thread_info.h |   2
 arch/x86/kernel/process_32.c       |   8
 arch/x86/kernel/process_64.c       |  10
 include/asm-generic/preempt.h      |   2
 include/linux/preempt.h            |  20
 include/linux/sched.h              |  36
 include/linux/sched/deadline.h     |   5
 include/linux/smpboot.h            |   4
 include/linux/stop_machine.h       |   2
 include/trace/events/sched.h       |  22
 kernel/cpu.c                       |  10
 kernel/exit.c                      |   4
 kernel/locking/rtmutex.c           |   3
 kernel/sched/core.c                | 203
 kernel/sched/cpudeadline.c         |   5
 kernel/sched/cpudeadline.h         |   1
 kernel/sched/fair.c                | 419
 kernel/sched/features.h            |  21
 kernel/sched/rt.c                  |  22
 kernel/sched/sched.h               |  55
 kernel/smpboot.c                   |   5
 kernel/stop_machine.c              |  90
 kernel/trace/ftrace.c              |   2
 kernel/trace/trace_sched_switch.c  |   3
 kernel/trace/trace_sched_wakeup.c  |   2
26 files changed, 492 insertions, 469 deletions
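
The common thread in the diff that follows is killing PREEMPT_ACTIVE: instead of encoding "this switch is a preemption" as a bit inside preempt_count, __schedule() now takes an explicit boolean and hands it on to the sched_switch tracepoint. A compressed standalone sketch of the new control flow (my_schedule() and the enum are hypothetical; only the "!preempt && state" test mirrors __schedule() below):

#include <stdbool.h>
#include <stdio.h>

enum task_state { RUNNING, SLEEPING };

/* After the rework, callers state explicitly why they are scheduling. */
static void my_schedule(bool preempt, enum task_state state)
{
        /*
         * Mirrors "if (!preempt && prev->state)" in __schedule(): only a
         * voluntary schedule() with a non-running state dequeues the task;
         * a preempted task stays runnable whatever its state says.
         */
        if (!preempt && state != RUNNING)
                printf("dequeue: voluntary sleep honours task state\n");
        else
                printf("stay runnable: %s\n",
                       preempt ? "preempted, state ignored" : "voluntary but RUNNING");
}

int main(void)
{
        my_schedule(false, SLEEPING);   /* kernel: schedule() -> __schedule(false) */
        my_schedule(true,  SLEEPING);   /* kernel: preempt_schedule*() -> __schedule(true) */
        return 0;
}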
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index b12f81022a6b..01bcde84d3e4 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc)
 /*
  * must be macros to avoid header recursion hell
  */
-#define init_task_preempt_count(p) do { \
-        task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
-} while (0)
+#define init_task_preempt_count(p) do { } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
-        task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
         per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
 } while (0)
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8afdc3e44247..809877e9030b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,7 +57,6 @@ struct thread_info {
         __u32                   flags;          /* low level flags */
         __u32                   status;         /* thread synchronous flags */
         __u32                   cpu;            /* current CPU */
-        int                     saved_preempt_count;
         mm_segment_t            addr_limit;
         void __user             *sysenter_return;
         unsigned int            sig_on_uaccess_error:1;
@@ -69,7 +68,6 @@ struct thread_info {
         .task           = &tsk,                 \
         .flags          = 0,                    \
         .cpu            = 0,                    \
-        .saved_preempt_count = INIT_PREEMPT_COUNT,      \
         .addr_limit     = KERNEL_DS,            \
 }
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 737527b40e5b..9f950917528b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -280,14 +280,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
                 set_iopl_mask(next->iopl);
 
         /*
-         * If it were not for PREEMPT_ACTIVE we could guarantee that the
-         * preempt_count of all tasks was equal here and this would not be
-         * needed.
-         */
-        task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-        this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
-        /*
          * Now maybe handle debug registers and/or IO bitmaps
          */
         if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b35921a670b2..e835d263a33b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         /*
          * Switch FS and GS.
          *
-         * These are even more complicated than FS and GS: they have
+         * These are even more complicated than DS and ES: they have
          * 64-bit bases are that controlled by arch_prctl.  Those bases
          * only differ from the values in the GDT or LDT if the selector
          * is 0.
@@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
          */
         this_cpu_write(current_task, next_p);
 
-        /*
-         * If it were not for PREEMPT_ACTIVE we could guarantee that the
-         * preempt_count of all tasks was equal here and this would not be
-         * needed.
-         */
-        task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-        this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
         /* Reload esp0 and ss1.  This changes current_thread_info(). */
         load_sp0(tss, next);
 
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index 0bec580a4885..5d8ffa3e6f8c 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc)
  * must be macros to avoid header recursion hell
  */
 #define init_task_preempt_count(p) do { \
-        task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
+        task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \
 } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index bea8dd8ff5e0..75e4e30677f1 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -26,7 +26,6 @@
  * SOFTIRQ_MASK:        0x0000ff00
  * HARDIRQ_MASK:        0x000f0000
  *     NMI_MASK:        0x00100000
- * PREEMPT_ACTIVE:      0x00200000
  * PREEMPT_NEED_RESCHED:        0x80000000
  */
 #define PREEMPT_BITS    8
@@ -53,10 +52,6 @@
 
 #define SOFTIRQ_DISABLE_OFFSET  (2 * SOFTIRQ_OFFSET)
 
-#define PREEMPT_ACTIVE_BITS     1
-#define PREEMPT_ACTIVE_SHIFT    (NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE  (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
-
 /* We use the MSB mostly because its available */
 #define PREEMPT_NEED_RESCHED    0x80000000
 
@@ -126,8 +121,7 @@
  * Check whether we were atomic before we did preempt_disable():
  * (used by the scheduler)
  */
-#define in_atomic_preempt_off() \
-                ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
+#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
 
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
@@ -146,18 +140,6 @@ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
 
-#define preempt_active_enter() \
-do { \
-        preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-        barrier(); \
-} while (0)
-
-#define preempt_active_exit() \
-do { \
-        barrier(); \
-        preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-} while (0)
-
 #ifdef CONFIG_PREEMPT_COUNT
 
 #define preempt_disable() \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 56667292d1e4..9e1e06c3ce05 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -599,20 +599,26 @@ struct task_cputime_atomic {
                 .sum_exec_runtime = ATOMIC64_INIT(0),           \
         }
 
-#ifdef CONFIG_PREEMPT_COUNT
-#define PREEMPT_DISABLED        (1 + PREEMPT_ENABLED)
-#else
-#define PREEMPT_DISABLED        PREEMPT_ENABLED
-#endif
+#define PREEMPT_DISABLED        (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+
+/*
+ * Disable preemption until the scheduler is running -- use an unconditional
+ * value so that it also works on !PREEMPT_COUNT kernels.
+ *
+ * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
+ */
+#define INIT_PREEMPT_COUNT      PREEMPT_OFFSET
 
 /*
- * Disable preemption until the scheduler is running.
- * Reset by start_kernel()->sched_init()->init_idle().
+ * Initial preempt_count value; reflects the preempt_count schedule invariant
+ * which states that during context switches:
  *
- * We include PREEMPT_ACTIVE to avoid cond_resched() from working
- * before the scheduler is active -- see should_resched().
+ *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+ *
+ * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+ * Note: See finish_task_switch().
  */
-#define INIT_PREEMPT_COUNT      (PREEMPT_DISABLED + PREEMPT_ACTIVE)
+#define FORK_PREEMPT_COUNT      (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 
 /**
  * struct thread_group_cputimer - thread group interval timer counts
@@ -1142,8 +1148,6 @@ struct sched_domain_topology_level {
 #endif
 };
 
-extern struct sched_domain_topology_level *sched_domain_topology;
-
 extern void set_sched_topology(struct sched_domain_topology_level *tl);
 
 extern void wake_up_if_idle(int cpu);
@@ -1192,10 +1196,10 @@ struct load_weight {
 
 /*
  * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors the amount of time that a sched_entity is
- * runnable on a rq into its weight. For cfs_rq, it is the aggregated
- * such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency scaling into the amount of time
+ * 1) load_avg factors frequency scaling into the amount of time that a
+ * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
+ * aggregated such weights of all runnable and blocked sched_entities.
+ * 2) util_avg factors frequency and cpu scaling into the amount of time
  * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
  * For cfs_rq, it is the aggregated such times of all runnable and
  * blocked sched_entities.
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9d303b8847df..9089a2ae913d 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p)
         return dl_prio(p->prio);
 }
 
+static inline bool dl_time_before(u64 a, u64 b)
+{
+        return (s64)(a - b) < 0;
+}
+
 #endif /* _SCHED_DEADLINE_H */
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index e6109a6cd8f6..12910cf19869 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -24,9 +24,6 @@ struct smpboot_thread_data;
  *                      parked (cpu offline)
  * @unpark:             Optional unpark function, called when the thread is
  *                      unparked (cpu online)
- * @pre_unpark:         Optional unpark function, called before the thread is
- *                      unparked (cpu online). This is not guaranteed to be
- *                      called on the target cpu of the thread. Careful!
  * @cpumask:            Internal state.  To update which threads are unparked,
  *                      call smpboot_update_cpumask_percpu_thread().
  * @selfparking:        Thread is not parked by the park function.
@@ -42,7 +39,6 @@ struct smp_hotplug_thread {
         void                            (*cleanup)(unsigned int cpu, bool online);
         void                            (*park)(unsigned int cpu);
         void                            (*unpark)(unsigned int cpu);
-        void                            (*pre_unpark)(unsigned int cpu);
         cpumask_var_t                   cpumask;
         bool                            selfparking;
         const char                      *thread_comm;
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 414d924318ce..0adedca24c5b 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -33,6 +33,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
                          struct cpu_stop_work *work_buf);
 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+void stop_machine_park(int cpu);
+void stop_machine_unpark(int cpu);
 
 #else   /* CONFIG_SMP */
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 539d6bc3216a..9b90c57517a9 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
              TP_ARGS(p));
 
 #ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
 {
-        long state = p->state;
-
-#ifdef CONFIG_PREEMPT
 #ifdef CONFIG_SCHED_DEBUG
         BUG_ON(p != current);
 #endif /* CONFIG_SCHED_DEBUG */
+
         /*
-         * For all intents and purposes a preempted task is a running task.
+         * Preemption ignores task state, therefore preempted tasks are always
+         * RUNNING (we will not have dequeued if state != RUNNING).
          */
-        if (preempt_count() & PREEMPT_ACTIVE)
-                state = TASK_RUNNING | TASK_STATE_MAX;
-#endif /* CONFIG_PREEMPT */
-
-        return state;
+        return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
 }
 #endif /* CREATE_TRACE_POINTS */
 
@@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
  */
 TRACE_EVENT(sched_switch,
 
-        TP_PROTO(struct task_struct *prev,
+        TP_PROTO(bool preempt,
+                 struct task_struct *prev,
                  struct task_struct *next),
 
-        TP_ARGS(prev, next),
+        TP_ARGS(preempt, prev, next),
 
         TP_STRUCT__entry(
                 __array(        char,   prev_comm,      TASK_COMM_LEN   )
@@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch,
                 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                 __entry->prev_pid       = prev->pid;
                 __entry->prev_prio      = prev->prio;
-                __entry->prev_state     = __trace_sched_switch_state(prev);
+                __entry->prev_state     = __trace_sched_switch_state(preempt, prev);
                 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                 __entry->next_pid       = next->pid;
                 __entry->next_prio      = next->prio;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 14a9cdf8abe9..85ff5e26e23b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -291,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
 {
         struct task_struct *g, *p;
 
-        read_lock_irq(&tasklist_lock);
-        do_each_thread(g, p) {
+        read_lock(&tasklist_lock);
+        for_each_process_thread(g, p) {
                 if (!p->on_rq)
                         continue;
                 /*
@@ -307,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
                 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
                         p->comm, task_pid_nr(p), dead_cpu,
                         p->state, p->flags);
-        } while_each_thread(g, p);
-        read_unlock_irq(&tasklist_lock);
+        }
+        read_unlock(&tasklist_lock);
 }
 
 struct take_cpu_down_param {
@@ -331,7 +331,7 @@ static int take_cpu_down(void *_param)
         /* Give up timekeeping duties */
         tick_handover_do_timer();
         /* Park the stopper thread */
-        kthread_park(current);
+        stop_machine_park((long)param->hcpu);
         return 0;
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e93b63bbc59..07110c6020a0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -706,10 +706,12 @@ void do_exit(long code)
         smp_mb();
         raw_spin_unlock_wait(&tsk->pi_lock);
 
-        if (unlikely(in_atomic()))
+        if (unlikely(in_atomic())) {
                 pr_info("note: %s[%d] exited with preempt_count %d\n",
                         current->comm, task_pid_nr(current),
                         preempt_count());
+                preempt_count_set(PREEMPT_ENABLED);
+        }
 
         /* sync mm's RSS info before statistics gathering */
         if (tsk->mm)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index bbb72b4f64a1..8251e75dd9c0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -170,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
          * then right waiter has a dl_prio() too.
          */
         if (dl_prio(left->prio))
-                return (left->task->dl.deadline < right->task->dl.deadline);
+                return dl_time_before(left->task->dl.deadline,
+                                      right->task->dl.deadline);
 
         return 0;
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f7402f7eb448..aa5973220ad2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
-        if (p->policy == SCHED_IDLE) {
+        if (idle_policy(p->policy)) {
                 load->weight = scale_load(WEIGHT_IDLEPRIO);
                 load->inv_weight = WMULT_IDLEPRIO;
                 return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
         load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
         update_rq_clock(rq);
-        sched_info_queued(rq, p);
+        if (!(flags & ENQUEUE_RESTORE))
+                sched_info_queued(rq, p);
         p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
         update_rq_clock(rq);
-        sched_info_dequeued(rq, p);
+        if (!(flags & DEQUEUE_SAVE))
+                sched_info_dequeued(rq, p);
         p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                  * holding rq->lock.
                  */
                 lockdep_assert_held(&rq->lock);
-                dequeue_task(rq, p, 0);
+                dequeue_task(rq, p, DEQUEUE_SAVE);
         }
         if (running)
                 put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued)
-                enqueue_task(rq, p, 0);
+                enqueue_task(rq, p, ENQUEUE_RESTORE);
 }
 
 /*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
         if (task_cpu(p) != new_cpu) {
                 if (p->sched_class->migrate_task_rq)
-                        p->sched_class->migrate_task_rq(p, new_cpu);
+                        p->sched_class->migrate_task_rq(p);
                 p->se.nr_migrations++;
                 perf_event_task_migrate(p);
         }
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
         struct rq *src_rq, *dst_rq;
         int ret = -EAGAIN;
 
+        if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+                return -EAGAIN;
+
         src_rq = cpu_rq(arg->src_cpu);
         dst_rq = cpu_rq(arg->dst_cpu);
 
         double_raw_lock(&arg->src_task->pi_lock,
                         &arg->dst_task->pi_lock);
         double_rq_lock(src_rq, dst_rq);
+
         if (task_cpu(arg->dst_task) != arg->dst_cpu)
                 goto unlock;
 
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                         goto out;
                 }
 
+                /* No more Mr. Nice Guy. */
                 switch (state) {
                 case cpuset:
-                        /* No more Mr. Nice Guy. */
-                        cpuset_cpus_allowed_fallback(p);
-                        state = possible;
-                        break;
-
+                        if (IS_ENABLED(CONFIG_CPUSETS)) {
+                                cpuset_cpus_allowed_fallback(p);
+                                state = possible;
+                                break;
+                        }
+                        /* fall-through */
                 case possible:
                         do_set_cpus_allowed(p, cpu_possible_mask);
                         state = fail;
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 #endif /* CONFIG_SCHEDSTATS */
 }
 
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
         activate_task(rq, p, en_flags);
         p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
+
 void set_numabalancing_state(bool enabled)
 {
         if (enabled)
-                sched_feat_set("NUMA");
+                static_branch_enable(&sched_numa_balancing);
         else
-                sched_feat_set("NO_NUMA");
+                static_branch_disable(&sched_numa_balancing);
 }
-#else
-__read_mostly bool numabalancing_enabled;
-
-void set_numabalancing_state(bool enabled)
-{
-        numabalancing_enabled = enabled;
-}
-#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 {
         struct ctl_table t;
         int err;
-        int state = numabalancing_enabled;
+        int state = static_branch_likely(&sched_numa_balancing);
 
         if (write && !capable(CAP_SYS_ADMIN))
                 return -EPERM;
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
         struct rq *rq;
 
         raw_spin_lock_irqsave(&p->pi_lock, flags);
+        /* Initialize new task's runnable average */
+        init_entity_runnable_average(&p->se);
 #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p)
         set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 
-        /* Initialize new task's runnable average */
-        init_entity_runnable_average(&p->se);
         rq = __task_rq_lock(p);
         activate_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2483,7 +2485,6 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
                     struct task_struct *next)
 {
-        trace_sched_switch(prev, next);
         sched_info_switch(rq, prev, next);
         perf_event_task_sched_out(prev, next);
         fire_sched_out_preempt_notifiers(prev, next);
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
         struct mm_struct *mm = rq->prev_mm;
         long prev_state;
 
+        /*
+         * The previous task will have left us with a preempt_count of 2
+         * because it left us after:
+         *
+         *      schedule()
+         *        preempt_disable();                    // 1
+         *        __schedule()
+         *          raw_spin_lock_irq(&rq->lock)        // 2
+         *
+         * Also, see FORK_PREEMPT_COUNT.
+         */
+        if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+                      "corrupted preempt_count: %s/%d/0x%x\n",
+                      current->comm, current->pid, preempt_count()))
+                preempt_count_set(FORK_PREEMPT_COUNT);
+
         rq->prev_mm = NULL;
 
         /*
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 {
         struct rq *rq;
 
-        /* finish_task_switch() drops rq->lock and enables preemtion */
-        preempt_disable();
+        /*
+         * New tasks start with FORK_PREEMPT_COUNT, see there and
+         * finish_task_switch() for details.
+         *
+         * finish_task_switch() will drop rq->lock() and lower preempt_count
+         * and the preempt_enable() will end up enabling preemption (on
+         * PREEMPT_COUNT kernels).
+         */
+
         rq = finish_task_switch(prev);
         balance_callback(rq);
         preempt_enable();
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
-        BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+        BUG_ON(task_stack_end_corrupted(prev));
 #endif
-        /*
-         * Test if we are atomic. Since do_exit() needs to call into
-         * schedule() atomically, we ignore that path. Otherwise whine
-         * if we are scheduling when we should not.
-         */
-        if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+        if (unlikely(in_atomic_preempt_off())) {
                 __schedule_bug(prev);
+                preempt_count_set(PREEMPT_DISABLED);
+        }
         rcu_sleep_check();
 
         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3054,7 +3076,7 @@ again:
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
 {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void)
         rcu_note_context_switch();
         prev = rq->curr;
 
+        /*
+         * do_exit() calls schedule() with preemption disabled as an exception;
+         * however we must fix that up, otherwise the next task will see an
+         * inconsistent (higher) preempt count.
+         *
+         * It also avoids the below schedule_debug() test from complaining
+         * about this.
+         */
+        if (unlikely(prev->state == TASK_DEAD))
+                preempt_enable_no_resched_notrace();
+
         schedule_debug(prev);
 
         if (sched_feat(HRTICK))
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void)
         rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
         switch_count = &prev->nivcsw;
-        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+        if (!preempt && prev->state) {
                 if (unlikely(signal_pending_state(prev->state, prev))) {
                         prev->state = TASK_RUNNING;
                 } else {
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void)
                 rq->curr = next;
                 ++*switch_count;
 
+                trace_sched_switch(preempt, prev, next);
                 rq = context_switch(rq, prev, next); /* unlocks the rq */
                 cpu = cpu_of(rq);
         } else {
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
         sched_submit_work(tsk);
         do {
                 preempt_disable();
-                __schedule();
+                __schedule(false);
                 sched_preempt_enable_no_resched();
         } while (need_resched());
 }
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
         do {
-                preempt_active_enter();
-                __schedule();
-                preempt_active_exit();
+                preempt_disable_notrace();
+                __schedule(true);
+                preempt_enable_no_resched_notrace();
 
                 /*
                  * Check again in case we missed a preemption opportunity
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                 return;
 
         do {
-                /*
-                 * Use raw __prempt_count() ops that don't call function.
-                 * We can't call functions before disabling preemption which
-                 * disarm preemption tracing recursions.
-                 */
-                __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-                barrier();
+                preempt_disable_notrace();
                 /*
                  * Needs preempt disabled in case user_exit() is traced
                  * and the tracer calls preempt_enable_notrace() causing
                  * an infinite recursion.
                  */
                 prev_ctx = exception_enter();
-                __schedule();
+                __schedule(true);
                 exception_exit(prev_ctx);
 
-                barrier();
-                __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+                preempt_enable_no_resched_notrace();
         } while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
         prev_state = exception_enter();
 
         do {
-                preempt_active_enter();
+                preempt_disable();
                 local_irq_enable();
-                __schedule();
+                __schedule(true);
                 local_irq_disable();
-                preempt_active_exit();
+                sched_preempt_enable_no_resched();
         } while (need_resched());
 
         exception_exit(prev_state);
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-        int oldprio, queued, running, enqueue_flag = 0;
+        int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
         struct rq *rq;
         const struct sched_class *prev_class;
 
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
-                dequeue_task(rq, p, 0);
+                dequeue_task(rq, p, DEQUEUE_SAVE);
         if (running)
                 put_prev_task(rq, p);
 
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (!dl_prio(p->normal_prio) ||
                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
-                        enqueue_flag = ENQUEUE_REPLENISH;
+                        enqueue_flag |= ENQUEUE_REPLENISH;
                 } else
                         p->dl.dl_boosted = 0;
                 p->sched_class = &dl_sched_class;
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (dl_prio(oldprio))
                         p->dl.dl_boosted = 0;
                 if (oldprio < prio)
-                        enqueue_flag = ENQUEUE_HEAD;
+                        enqueue_flag |= ENQUEUE_HEAD;
                 p->sched_class = &rt_sched_class;
         } else {
                 if (dl_prio(oldprio))
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
         }
         queued = task_on_rq_queued(p);
         if (queued)
-                dequeue_task(rq, p, 0);
+                dequeue_task(rq, p, DEQUEUE_SAVE);
 
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
         delta = p->prio - old_prio;
 
         if (queued) {
-                enqueue_task(rq, p, 0);
+                enqueue_task(rq, p, ENQUEUE_RESTORE);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@ -3753,10 +3780,7 @@ recheck:
         } else {
                 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 
-                if (policy != SCHED_DEADLINE &&
-                    policy != SCHED_FIFO && policy != SCHED_RR &&
-                    policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                    policy != SCHED_IDLE)
+                if (!valid_policy(policy))
                         return -EINVAL;
         }
 
@@ -3812,7 +3836,7 @@ recheck:
          * Treat SCHED_IDLE as nice 20. Only allow a switch to
          * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
          */
-        if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+        if (idle_policy(p->policy) && !idle_policy(policy)) {
                 if (!can_nice(p, task_nice(p)))
                         return -EPERM;
         }
@@ -3937,7 +3961,7 @@ change:
 
         queued = task_on_rq_queued(p);
         running = task_