summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-03 18:03:50 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-03 18:03:50 -0800
commit53528695ff6d8b77011bc818407c13e30914a946 (patch)
tree04acd099c5759bf6f1d728c5415f574d572c6872
parentb831ef2cad979912850e34f82415c0c5d59de8cb (diff)
parente73e85f0593832aa583b252f9a16cf90ed6d30fa (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar: "The main changes in this cycle were: - sched/fair load tracking fixes and cleanups (Byungchul Park) - Make load tracking frequency scale invariant (Dietmar Eggemann) - sched/deadline updates (Juri Lelli) - stop machine fixes, cleanups and enhancements for bugs triggered by CPU hotplug stress testing (Oleg Nesterov) - scheduler preemption code rework: remove PREEMPT_ACTIVE and related cleanups (Peter Zijlstra) - Rework the sched_info::run_delay code to fix races (Peter Zijlstra) - Optimize per entity utilization tracking (Peter Zijlstra) - ... misc other fixes, cleanups and smaller updates" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits) sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop() sched: Start stopper early stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark() stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark() stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works() stop_machine: Ensure that a queued callback will be called before cpu_stop_park() sched/x86: Fix typo in __switch_to() comments sched/core: Remove a parameter in the migrate_task_rq() function sched/core: Drop unlikely behind BUG_ON() sched/core: Fix task and run queue sched_info::run_delay inconsistencies sched/numa: Fix task_tick_fair() from disabling numa_balancing sched/core: Add preempt_count invariant check sched/core: More notrace annotations sched/core: Kill PREEMPT_ACTIVE sched/core, sched/x86: Kill thread_info::saved_preempt_count sched/core: Simplify preempt_count tests sched/core: Robustify preemption leak checks sched/core: Stop setting PREEMPT_ACTIVE ...
-rw-r--r--arch/x86/include/asm/preempt.h5
-rw-r--r--arch/x86/include/asm/thread_info.h2
-rw-r--r--arch/x86/kernel/process_32.c8
-rw-r--r--arch/x86/kernel/process_64.c10
-rw-r--r--include/asm-generic/preempt.h2
-rw-r--r--include/linux/preempt.h20
-rw-r--r--include/linux/sched.h36
-rw-r--r--include/linux/sched/deadline.h5
-rw-r--r--include/linux/smpboot.h4
-rw-r--r--include/linux/stop_machine.h2
-rw-r--r--include/trace/events/sched.h22
-rw-r--r--kernel/cpu.c10
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/locking/rtmutex.c3
-rw-r--r--kernel/sched/core.c203
-rw-r--r--kernel/sched/cpudeadline.c5
-rw-r--r--kernel/sched/cpudeadline.h1
-rw-r--r--kernel/sched/fair.c419
-rw-r--r--kernel/sched/features.h21
-rw-r--r--kernel/sched/rt.c22
-rw-r--r--kernel/sched/sched.h55
-rw-r--r--kernel/smpboot.c5
-rw-r--r--kernel/stop_machine.c90
-rw-r--r--kernel/trace/ftrace.c2
-rw-r--r--kernel/trace/trace_sched_switch.c3
-rw-r--r--kernel/trace/trace_sched_wakeup.c2
26 files changed, 492 insertions, 469 deletions
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index b12f81022a6b..01bcde84d3e4 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc)
/*
* must be macros to avoid header recursion hell
*/
-#define init_task_preempt_count(p) do { \
- task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
-} while (0)
+#define init_task_preempt_count(p) do { } while (0)
#define init_idle_preempt_count(p, cpu) do { \
- task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
} while (0)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8afdc3e44247..809877e9030b 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -57,7 +57,6 @@ struct thread_info {
__u32 flags; /* low level flags */
__u32 status; /* thread synchronous flags */
__u32 cpu; /* current CPU */
- int saved_preempt_count;
mm_segment_t addr_limit;
void __user *sysenter_return;
unsigned int sig_on_uaccess_error:1;
@@ -69,7 +68,6 @@ struct thread_info {
.task = &tsk, \
.flags = 0, \
.cpu = 0, \
- .saved_preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 737527b40e5b..9f950917528b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -280,14 +280,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
set_iopl_mask(next->iopl);
/*
- * If it were not for PREEMPT_ACTIVE we could guarantee that the
- * preempt_count of all tasks was equal here and this would not be
- * needed.
- */
- task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
- this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
- /*
* Now maybe handle debug registers and/or IO bitmaps
*/
if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b35921a670b2..e835d263a33b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch FS and GS.
*
- * These are even more complicated than FS and GS: they have
+ * These are even more complicated than DS and ES: they have
* 64-bit bases are that controlled by arch_prctl. Those bases
* only differ from the values in the GDT or LDT if the selector
* is 0.
@@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
this_cpu_write(current_task, next_p);
- /*
- * If it were not for PREEMPT_ACTIVE we could guarantee that the
- * preempt_count of all tasks was equal here and this would not be
- * needed.
- */
- task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
- this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
/* Reload esp0 and ss1. This changes current_thread_info(). */
load_sp0(tss, next);
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index 0bec580a4885..5d8ffa3e6f8c 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc)
* must be macros to avoid header recursion hell
*/
#define init_task_preempt_count(p) do { \
- task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
+ task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \
} while (0)
#define init_idle_preempt_count(p, cpu) do { \
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index bea8dd8ff5e0..75e4e30677f1 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -26,7 +26,6 @@
* SOFTIRQ_MASK: 0x0000ff00
* HARDIRQ_MASK: 0x000f0000
* NMI_MASK: 0x00100000
- * PREEMPT_ACTIVE: 0x00200000
* PREEMPT_NEED_RESCHED: 0x80000000
*/
#define PREEMPT_BITS 8
@@ -53,10 +52,6 @@
#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
-#define PREEMPT_ACTIVE_BITS 1
-#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
-
/* We use the MSB mostly because its available */
#define PREEMPT_NEED_RESCHED 0x80000000
@@ -126,8 +121,7 @@
* Check whether we were atomic before we did preempt_disable():
* (used by the scheduler)
*/
-#define in_atomic_preempt_off() \
- ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
+#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
extern void preempt_count_add(int val);
@@ -146,18 +140,6 @@ extern void preempt_count_sub(int val);
#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)
-#define preempt_active_enter() \
-do { \
- preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
- barrier(); \
-} while (0)
-
-#define preempt_active_exit() \
-do { \
- barrier(); \
- preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-} while (0)
-
#ifdef CONFIG_PREEMPT_COUNT
#define preempt_disable() \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 56667292d1e4..9e1e06c3ce05 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -599,20 +599,26 @@ struct task_cputime_atomic {
.sum_exec_runtime = ATOMIC64_INIT(0), \
}
-#ifdef CONFIG_PREEMPT_COUNT
-#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
-#else
-#define PREEMPT_DISABLED PREEMPT_ENABLED
-#endif
+#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+
+/*
+ * Disable preemption until the scheduler is running -- use an unconditional
+ * value so that it also works on !PREEMPT_COUNT kernels.
+ *
+ * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
+ */
+#define INIT_PREEMPT_COUNT PREEMPT_OFFSET
/*
- * Disable preemption until the scheduler is running.
- * Reset by start_kernel()->sched_init()->init_idle().
+ * Initial preempt_count value; reflects the preempt_count schedule invariant
+ * which states that during context switches:
*
- * We include PREEMPT_ACTIVE to avoid cond_resched() from working
- * before the scheduler is active -- see should_resched().
+ * preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+ *
+ * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+ * Note: See finish_task_switch().
*/
-#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE)
+#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
/**
* struct thread_group_cputimer - thread group interval timer counts
@@ -1142,8 +1148,6 @@ struct sched_domain_topology_level {
#endif
};
-extern struct sched_domain_topology_level *sched_domain_topology;
-
extern void set_sched_topology(struct sched_domain_topology_level *tl);
extern void wake_up_if_idle(int cpu);
@@ -1192,10 +1196,10 @@ struct load_weight {
/*
* The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors the amount of time that a sched_entity is
- * runnable on a rq into its weight. For cfs_rq, it is the aggregated
- * such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency scaling into the amount of time
+ * 1) load_avg factors frequency scaling into the amount of time that a
+ * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
+ * aggregated such weights of all runnable and blocked sched_entities.
+ * 2) util_avg factors frequency and cpu scaling into the amount of time
* that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
* For cfs_rq, it is the aggregated such times of all runnable and
* blocked sched_entities.
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9d303b8847df..9089a2ae913d 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p)
return dl_prio(p->prio);
}
+static inline bool dl_time_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
#endif /* _SCHED_DEADLINE_H */
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index e6109a6cd8f6..12910cf19869 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -24,9 +24,6 @@ struct smpboot_thread_data;
* parked (cpu offline)
* @unpark: Optional unpark function, called when the thread is
* unparked (cpu online)
- * @pre_unpark: Optional unpark function, called before the thread is
- * unparked (cpu online). This is not guaranteed to be
- * called on the target cpu of the thread. Careful!
* @cpumask: Internal state. To update which threads are unparked,
* call smpboot_update_cpumask_percpu_thread().
* @selfparking: Thread is not parked by the park function.
@@ -42,7 +39,6 @@ struct smp_hotplug_thread {
void (*cleanup)(unsigned int cpu, bool online);
void (*park)(unsigned int cpu);
void (*unpark)(unsigned int cpu);
- void (*pre_unpark)(unsigned int cpu);
cpumask_var_t cpumask;
bool selfparking;
const char *thread_comm;
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 414d924318ce..0adedca24c5b 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -33,6 +33,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf);
int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+void stop_machine_park(int cpu);
+void stop_machine_unpark(int cpu);
#else /* CONFIG_SMP */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 539d6bc3216a..9b90c57517a9 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
TP_ARGS(p));
#ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
{
- long state = p->state;
-
-#ifdef CONFIG_PREEMPT
#ifdef CONFIG_SCHED_DEBUG
BUG_ON(p != current);
#endif /* CONFIG_SCHED_DEBUG */
+
/*
- * For all intents and purposes a preempted task is a running task.
+ * Preemption ignores task state, therefore preempted tasks are always
+ * RUNNING (we will not have dequeued if state != RUNNING).
*/
- if (preempt_count() & PREEMPT_ACTIVE)
- state = TASK_RUNNING | TASK_STATE_MAX;
-#endif /* CONFIG_PREEMPT */
-
- return state;
+ return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
}
#endif /* CREATE_TRACE_POINTS */
@@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
*/
TRACE_EVENT(sched_switch,
- TP_PROTO(struct task_struct *prev,
+ TP_PROTO(bool preempt,
+ struct task_struct *prev,
struct task_struct *next),
- TP_ARGS(prev, next),
+ TP_ARGS(preempt, prev, next),
TP_STRUCT__entry(
__array( char, prev_comm, TASK_COMM_LEN )
@@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch,
memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
__entry->prev_pid = prev->pid;
__entry->prev_prio = prev->prio;
- __entry->prev_state = __trace_sched_switch_state(prev);
+ __entry->prev_state = __trace_sched_switch_state(preempt, prev);
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
__entry->next_pid = next->pid;
__entry->next_prio = next->prio;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 14a9cdf8abe9..85ff5e26e23b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -291,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
{
struct task_struct *g, *p;
- read_lock_irq(&tasklist_lock);
- do_each_thread(g, p) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(g, p) {
if (!p->on_rq)
continue;
/*
@@ -307,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
- } while_each_thread(g, p);
- read_unlock_irq(&tasklist_lock);
+ }
+ read_unlock(&tasklist_lock);
}
struct take_cpu_down_param {
@@ -331,7 +331,7 @@ static int take_cpu_down(void *_param)
/* Give up timekeeping duties */
tick_handover_do_timer();
/* Park the stopper thread */
- kthread_park(current);
+ stop_machine_park((long)param->hcpu);
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e93b63bbc59..07110c6020a0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -706,10 +706,12 @@ void do_exit(long code)
smp_mb();
raw_spin_unlock_wait(&tsk->pi_lock);
- if (unlikely(in_atomic()))
+ if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
/* sync mm's RSS info before statistics gathering */
if (tsk->mm)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index bbb72b4f64a1..8251e75dd9c0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -170,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
* then right waiter has a dl_prio() too.
*/
if (dl_prio(left->prio))
- return (left->task->dl.deadline < right->task->dl.deadline);
+ return dl_time_before(left->task->dl.deadline,
+ right->task->dl.deadline);
return 0;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f7402f7eb448..aa5973220ad2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
/*
* SCHED_IDLE tasks get minimal weight:
*/
- if (p->policy == SCHED_IDLE) {
+ if (idle_policy(p->policy)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
load->inv_weight = prio_to_wmult[prio];
}
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_queued(rq, p);
+ if (!(flags & ENQUEUE_RESTORE))
+ sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_dequeued(rq, p);
+ if (!(flags & DEQUEUE_SAVE))
+ sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* holding rq->lock.
*/
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
}
if (running)
put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
}
/*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
- p->sched_class->migrate_task_rq(p, new_cpu);
+ p->sched_class->migrate_task_rq(p);
p->se.nr_migrations++;
perf_event_task_migrate(p);
}
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
struct rq *src_rq, *dst_rq;
int ret = -EAGAIN;
+ if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+ return -EAGAIN;
+
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
double_raw_lock(&arg->src_task->pi_lock,
&arg->dst_task->pi_lock);
double_rq_lock(src_rq, dst_rq);
+
if (task_cpu(arg->dst_task) != arg->dst_cpu)
goto unlock;
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
goto out;
}
+ /* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- /* No more Mr. Nice Guy. */
- cpuset_cpus_allowed_fallback(p);
- state = possible;
- break;
-
+ if (IS_ENABLED(CONFIG_CPUSETS)) {
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
+ }
+ /* fall-through */
case possible:
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#endif /* CONFIG_SCHEDSTATS */
}
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif /* CONFIG_NUMA_BALANCING */
}
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
#ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
+
void set_numabalancing_state(bool enabled)
{
if (enabled)
- sched_feat_set("NUMA");
+ static_branch_enable(&sched_numa_balancing);
else
- sched_feat_set("NO_NUMA");
+ static_branch_disable(&sched_numa_balancing);
}
-#else
-__read_mostly bool numabalancing_enabled;
-
-void set_numabalancing_state(bool enabled)
-{
- numabalancing_enabled = enabled;
-}
-#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
{
struct ctl_table t;
int err;
- int state = numabalancing_enabled;
+ int state = static_branch_likely(&sched_numa_balancing);
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, flags);
+ /* Initialize new task's runnable average */
+ init_entity_runnable_average(&p->se);
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p)
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2483,7 +2485,6 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
- trace_sched_switch(prev, next);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
struct mm_struct *mm = rq->prev_mm;
long prev_state;
+ /*
+ * The previous task will have left us with a preempt_count of 2
+ * because it left us after:
+ *
+ * schedule()
+ * preempt_disable(); // 1
+ * __schedule()
+ * raw_spin_lock_irq(&rq->lock) // 2
+ *
+ * Also, see FORK_PREEMPT_COUNT.
+ */
+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+ "corrupted preempt_count: %s/%d/0x%x\n",
+ current->comm, current->pid, preempt_count()))
+ preempt_count_set(FORK_PREEMPT_COUNT);
+
rq->prev_mm = NULL;
/*
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
{
struct rq *rq;
- /* finish_task_switch() drops rq->lock and enables preemtion */
- preempt_disable();
+ /*
+ * New tasks start with FORK_PREEMPT_COUNT, see there and
+ * finish_task_switch() for details.
+ *
+ * finish_task_switch() will drop rq->lock() and lower preempt_count
+ * and the preempt_enable() will end up enabling preemption (on
+ * PREEMPT_COUNT kernels).
+ */
+
rq = finish_task_switch(prev);
balance_callback(rq);
preempt_enable();
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
static inline void schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
- BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+ BUG_ON(task_stack_end_corrupted(prev));
#endif
- /*
- * Test if we are atomic. Since do_exit() needs to call into
- * schedule() atomically, we ignore that path. Otherwise whine
- * if we are scheduling when we should not.
- */
- if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+ if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
+ preempt_count_set(PREEMPT_DISABLED);
+ }
rcu_sleep_check();
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3054,7 +3076,7 @@ again:
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void)
rcu_note_context_switch();
prev = rq->curr;
+ /*
+ * do_exit() calls schedule() with preemption disabled as an exception;
+ * however we must fix that up, otherwise the next task will see an
+ * inconsistent (higher) preempt count.
+ *
+ * It also avoids the below schedule_debug() test from complaining
+ * about this.
+ */
+ if (unlikely(prev->state == TASK_DEAD))
+ preempt_enable_no_resched_notrace();
+
schedule_debug(prev);
if (sched_feat(HRTICK))
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void)
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
switch_count = &prev->nivcsw;
- if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+ if (!preempt && prev->state) {
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void)
rq->curr = next;
++*switch_count;
+ trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
} else {
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
preempt_disable();
- __schedule();
+ __schedule(false);
sched_preempt_enable_no_resched();
} while (need_resched());
}
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
- preempt_active_enter();
- __schedule();
- preempt_active_exit();
+ preempt_disable_notrace();
+ __schedule(true);
+ preempt_enable_no_resched_notrace();
/*
* Check again in case we missed a preemption opportunity
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
return;
do {
- /*
- * Use raw __prempt_count() ops that don't call function.
- * We can't call functions before disabling preemption which
- * disarm preemption tracing recursions.
- */
- __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
- barrier();
+ preempt_disable_notrace();
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
* an infinite recursion.
*/
prev_ctx = exception_enter();
- __schedule();
+ __schedule(true);
exception_exit(prev_ctx);
- barrier();
- __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+ preempt_enable_no_resched_notrace();
} while (need_resched());
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
prev_state = exception_enter();
do {
- preempt_active_enter();
+ preempt_disable();
local_irq_enable();
- __schedule();
+ __schedule(true);
local_irq_disable();
- preempt_active_exit();
+ sched_preempt_enable_no_resched();
} while (need_resched());
exception_exit(prev_state);
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, queued, running, enqueue_flag = 0;
+ int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
struct rq *rq;
const struct sched_class *prev_class;
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- enqueue_flag = ENQUEUE_REPLENISH;
+ enqueue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
- enqueue_flag = ENQUEUE_HEAD;
+ enqueue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
}
queued = task_on_rq_queued(p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
delta = p->prio - old_prio;
if (queued) {
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -3753,10 +3780,7 @@ recheck:
} else {
reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
- if (policy != SCHED_DEADLINE &&
- policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- policy != SCHED_IDLE)
+ if (!valid_policy(policy))
return -EINVAL;
}
@@ -3812,7 +3836,7 @@ recheck:
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+ if (idle_policy(p->policy) && !idle_policy(policy)) {
if (!can_nice(p, task_nice(p)))
return -EPERM;
}
@@ -3937,7 +3961,7 @@ change:
queued = task_on_rq_queued(p);
running = task_