summaryrefslogtreecommitdiffstats
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c2333
1 files changed, 377 insertions, 1956 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c56fb57f2991..34e2291a9a6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1,88 +1,28 @@
/*
* kernel/sched/core.c
*
- * Kernel scheduler and related syscalls
+ * Core kernel scheduler code and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
- *
- * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
- * make semaphores SMP safe
- * 1998-11-19 Implemented schedule_timeout() and related stuff
- * by Andrea Arcangeli
- * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
- * hybrid priority-list and round-robin design with
- * an array-switch method of distributing timeslices
- * and per-CPU runqueues. Cleanups and useful suggestions
- * by Davide Libenzi, preemptible kernel bits by Robert Love.
- * 2003-09-03 Interactivity tuning by Con Kolivas.
- * 2004-04-02 Scheduler domains code by Nick Piggin
- * 2007-04-15 Work begun on replacing all interactivity tuning with a
- * fair scheduling design by Con Kolivas.
- * 2007-05-05 Load balancing (smp-nice) and other improvements
- * by Peter Williams
- * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
- * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
- * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
- * Thomas Gleixner, Mike Kravetz
*/
-
-#include <linux/kasan.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/init.h>
-#include <linux/uaccess.h>
-#include <linux/highmem.h>
-#include <linux/mmu_context.h>
-#include <linux/interrupt.h>
-#include <linux/capability.h>
-#include <linux/completion.h>
-#include <linux/kernel_stat.h>
-#include <linux/debug_locks.h>
-#include <linux/perf_event.h>
-#include <linux/security.h>
-#include <linux/notifier.h>
-#include <linux/profile.h>
-#include <linux/freezer.h>
-#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
-#include <linux/delay.h>
-#include <linux/pid_namespace.h>
-#include <linux/smp.h>
-#include <linux/threads.h>
-#include <linux/timer.h>
-#include <linux/rcupdate.h>
-#include <linux/cpu.h>
+#include <linux/sched.h>
#include <linux/cpuset.h>
-#include <linux/percpu.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/sysctl.h>
-#include <linux/syscalls.h>
-#include <linux/times.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <linux/unistd.h>
-#include <linux/pagemap.h>
-#include <linux/hrtimer.h>
-#include <linux/tick.h>
-#include <linux/ctype.h>
-#include <linux/ftrace.h>
-#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
-#include <linux/compiler.h>
-#include <linux/frame.h>
+
+#include <linux/blkdev.h>
+#include <linux/kprobes.h>
+#include <linux/mmu_context.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
#include <linux/prefetch.h>
-#include <linux/mutex.h>
+#include <linux/profile.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#include <asm/irq_regs.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
#include "sched.h"
#include "../workqueue_internal.h"
@@ -91,27 +31,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static void update_rq_clock_task(struct rq *rq, s64 delta);
-
-void update_rq_clock(struct rq *rq)
-{
- s64 delta;
-
- lockdep_assert_held(&rq->lock);
-
- if (rq->clock_skip_update & RQCF_ACT_SKIP)
- return;
-
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
- if (delta < 0)
- return;
- rq->clock += delta;
- update_rq_clock_task(rq, delta);
-}
-
/*
* Debugging: various feature bits
*/
@@ -140,7 +61,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
/*
- * period over which we measure -rt task cpu usage in us.
+ * period over which we measure -rt task CPU usage in us.
* default: 1s
*/
unsigned int sysctl_sched_rt_period = 1000000;
@@ -153,7 +74,7 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/* cpus with isolated domains */
+/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;
/*
@@ -185,7 +106,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
return rq;
}
raw_spin_unlock(&rq->lock);
@@ -221,11 +142,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
* If we observe the old cpu in task_rq_lock, the acquire of
* the old rq->lock will fully serialize against the stores.
*
- * If we observe the new cpu in task_rq_lock, the acquire will
+ * If we observe the new CPU in task_rq_lock, the acquire will
* pair with the WMB to ensure we must then also see migrating.
*/
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
- rf->cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, rf);
return rq;
}
raw_spin_unlock(&rq->lock);
@@ -236,6 +157,84 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
}
}
+/*
+ * RQ-clock updating methods:
+ */
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}irq region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}irq
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
+
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ if (static_key_false((&paravirt_steal_rq_enabled))) {
+ steal = paravirt_steal_clock(cpu_of(rq));
+ steal -= rq->prev_steal_time_rq;
+
+ if (unlikely(steal > delta))
+ steal = delta;
+
+ rq->prev_steal_time_rq += steal;
+ delta -= steal;
+ }
+#endif
+
+ rq->clock_task += delta;
+
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
+ sched_rt_avg_update(rq, irq_delta + steal);
+#endif
+}
+
+void update_rq_clock(struct rq *rq)
+{
+ s64 delta;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (rq->clock_update_flags & RQCF_ACT_SKIP)
+ return;
+
+#ifdef CONFIG_SCHED_DEBUG
+ rq->clock_update_flags |= RQCF_UPDATED;
+#endif
+ delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ if (delta < 0)
+ return;
+ rq->clock += delta;
+ update_rq_clock_task(rq, delta);
+}
+
+
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
@@ -458,7 +457,7 @@ void wake_up_q(struct wake_q_head *head)
task = container_of(node, struct task_struct, wake_q);
BUG_ON(!task);
- /* task can safely be re-inserted now */
+ /* Task can safely be re-inserted now: */
node = node->next;
task->wake_q.next = NULL;
@@ -516,12 +515,12 @@ void resched_cpu(int cpu)
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
- * In the semi idle case, use the nearest busy cpu for migrating timers
- * from an idle cpu. This is good for power-savings.
+ * In the semi idle case, use the nearest busy CPU for migrating timers
+ * from an idle CPU. This is good for power-savings.
*
* We don't do similar optimization for completely idle system, as
- * selecting an idle cpu will add more delays to the timers than intended
- * (as that cpu's timer base may not be uptodate wrt jiffies etc).
+ * selecting an idle CPU will add more delays to the timers than intended
+ * (as that CPU's timer base may not be uptodate wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
@@ -550,6 +549,7 @@ unlock:
rcu_read_unlock();
return cpu;
}
+
/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
@@ -784,60 +784,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-/*
- * In theory, the compile should just see 0 here, and optimize out the call
- * to sched_rt_avg_update. But I don't trust it...
- */
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- s64 steal = 0, irq_delta = 0;
-#endif
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
-
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}irq region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}irq
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
-
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
- steal -= rq->prev_steal_time_rq;
-
- if (unlikely(steal > delta))
- steal = delta;
-
- rq->prev_steal_time_rq += steal;
- delta -= steal;
- }
-#endif
-
- rq->clock_task += delta;
-
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
- sched_rt_avg_update(rq, irq_delta + steal);
-#endif
-}
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1018,7 +964,7 @@ struct migration_arg {
};
/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * Move (not current) task off this CPU, onto the destination CPU. We're doing
* this because either it can't run here any more (set_cpus_allowed()
* away from this CPU, or CPU going down), or because we're
* attempting to rebalance this task on exec (sched_exec).
@@ -1052,8 +998,8 @@ static int migration_cpu_stop(void *data)
struct rq *rq = this_rq();
/*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
+ * The original target CPU might have gone down and we might
+ * be on another CPU but it doesn't matter.
*/
local_irq_disable();
/*
@@ -1171,7 +1117,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (p->flags & PF_KTHREAD) {
/*
* For kernel threads that do indeed end up on online &&
- * !active we want to ensure they are strict per-cpu threads.
+ * !active we want to ensure they are strict per-CPU threads.
*/
WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
!cpumask_intersects(new_mask, cpu_active_mask) &&
@@ -1195,9 +1141,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
* OK, since we're going to drop the lock immediately
* afterwards anyway.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq_unpin_lock(rq, &rf);
rq = move_queued_task(rq, p, dest_cpu);
- lockdep_repin_lock(&rq->lock, rf.cookie);
+ rq_repin_lock(rq, &rf);
}
out:
task_rq_unlock(rq, p, &rf);
@@ -1276,7 +1222,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
/*
* Task isn't running anymore; make it appear like we migrated
* it before it went to sleep. This means on wakeup we make the
- * previous cpu our target instead of where it really is.
+ * previous CPU our target instead of where it really is.
*/
p->wake_cpu = cpu;
}
@@ -1508,12 +1454,12 @@ EXPORT_SYMBOL_GPL(kick_process);
*
* - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
* see __set_cpus_allowed_ptr(). At this point the newly online
- * cpu isn't yet part of the sched domains, and balancing will not
+ * CPU isn't yet part of the sched domains, and balancing will not
* see it.
*
- * - on cpu-down we clear cpu_active() to mask the sched domains and
+ * - on CPU-down we clear cpu_active() to mask the sched domains and
* avoid the load balancer to place new tasks on the to be removed
- * cpu. Existing tasks will remain running there and will be taken
+ * CPU. Existing tasks will remain running there and will be taken
* off.
*
* This means that fallback selection must not select !active CPUs.
@@ -1529,9 +1475,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
int dest_cpu;
/*
- * If the node that the cpu is on has been offlined, cpu_to_node()
- * will return -1. There is no cpu on the node, and we should
- * select the cpu on the other node.
+ * If the node that the CPU is on has been offlined, cpu_to_node()
+ * will return -1. There is no CPU on the node, and we should
+ * select the CPU on the other node.
*/
if (nid != -1) {
nodemask = cpumask_of_node(nid);
@@ -1563,7 +1509,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
state = possible;
break;
}
- /* fall-through */
+ /* Fall-through */
case possible:
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
@@ -1607,7 +1553,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
/*
* In order not to call set_task_cpu() on a blocking task we need
* to rely on ttwu() to place the task on a valid ->cpus_allowed
- * cpu.
+ * CPU.
*
* Since this is common to all placement strategies, this lives here.
*
@@ -1681,7 +1627,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
activate_task(rq, p, en_flags);
p->on_rq = TASK_ON_RQ_QUEUED;
- /* if a worker is waking up, notify workqueue */
+ /* If a worker is waking up, notify the workqueue: */
if (p->flags & PF_WQ_WORKER)
wq_worker_waking_up(p, cpu_of(rq));
}
@@ -1690,7 +1636,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
* Mark the task runnable and perform wakeup-preemption.
*/
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct pin_cookie cookie)
+ struct rq_flags *rf)
{
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -1702,9 +1648,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
* Our task @p is fully woken up and running; so its safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
p->sched_class->task_woken(rq, p);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
}
if (rq->idle_stamp) {
@@ -1723,7 +1669,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct pin_cookie cookie)
+ struct rq_flags *rf)
{
int en_flags = ENQUEUE_WAKEUP;
@@ -1738,7 +1684,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
ttwu_activate(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, cookie);
+ ttwu_do_wakeup(rq, p, wake_flags, rf);
}
/*
@@ -1757,7 +1703,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
if (task_on_rq_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
+ ttwu_do_wakeup(rq, p, wake_flags, &rf);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -1770,15 +1716,15 @@ void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
- struct pin_cookie cookie;
struct task_struct *p;
unsigned long flags;
+ struct rq_flags rf;
if (!llist)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
- cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, &rf);
while (llist) {
int wake_flags = 0;
@@ -1789,10 +1735,10 @@ void sched_ttwu_pending(void)
if (p->sched_remote_wakeup)
wake_flags = WF_MIGRATED;
- ttwu_do_activate(rq, p, wake_flags, cookie);
+ ttwu_do_activate(rq, p, wake_flags, &rf);
}
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1864,7 +1810,7 @@ void wake_up_if_idle(int cpu)
raw_spin_lock_irqsave(&rq->lock, flags);
if (is_idle_task(rq->curr))
smp_send_reschedule(cpu);
- /* Else cpu is not in idle, do nothing here */
+ /* Else CPU is not idle, do nothing here: */
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1881,20 +1827,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
- struct pin_cookie cookie;
+ struct rq_flags rf;
#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
- sched_clock_cpu(cpu); /* sync clocks x-cpu */
+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */
ttwu_queue_remote(p, cpu, wake_flags);
return;
}
#endif
raw_spin_lock(&rq->lock);
- cookie = lockdep_pin_lock(&rq->lock);
- ttwu_do_activate(rq, p, wake_flags, cookie);
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_pin_lock(rq, &rf);
+ ttwu_do_activate(rq, p, wake_flags, &rf);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock(&rq->lock);
}
@@ -1904,8 +1850,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* MIGRATION
*
* The basic program-order guarantee on SMP systems is that when a task [t]
- * migrates, all its activity on its old cpu [c0] happens-before any subsequent
- * execution on its new cpu [c1].
+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent
+ * execution on its new CPU [c1].
*
* For migration (of runnable tasks) this is provided by the following means:
*
@@ -1916,7 +1862,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
*
* Transitivity guarantees that B happens after A and C after B.
* Note: we only require RCpc transitivity.
- * Note: the cpu doing B need not be c0 or c1
+ * Note: the CPU doing B need not be c0 or c1
*
* Example:
*
@@ -2024,7 +1970,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
trace_sched_waking(p);
- success = 1; /* we're going to change ->state */
+ /* We're going to change ->state: */
+ success = 1;
cpu = task_cpu(p);
/*
@@ -2073,7 +2020,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
smp_rmb();
/*
- * If the owning (remote) cpu is still in the middle of schedule() with
+ * If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
* Pairs with the smp_store_release() in finish_lock_switch().
@@ -2086,11 +2033,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
+ if (p->in_iowait) {
+ delayacct_blkio_end();
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+
+#else /* CONFIG_SMP */
+
+ if (p->in_iowait) {
+ delayacct_blkio_end();
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
@@ -2111,7 +2071,7 @@ out:
* ensure that this_rq() is locked, @p is bound to this_rq() and not
* the current task.
*/
-static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
+static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
{
struct rq *rq = task_rq(p);
@@ -2128,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
* disabled avoiding further scheduler activity on it and we've
* not yet picked a replacement task.
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
- lockdep_repin_lock(&rq->lock, cookie);
+ rq_repin_lock(rq, rf);
}
if (!(p->state & TASK_NORMAL))
@@ -2140,10 +2100,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
trace_sched_waking(p);
- if (!task_on_rq_queued(p))
+ if (!task_on_rq_queued(p)) {
+ if (p->in_iowait) {
+ delayacct_blkio_end();
+ atomic_dec(&rq->nr_iowait);
+ }
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ }
- ttwu_do_wakeup(rq, p, 0, cookie);
+ ttwu_do_wakeup(rq, p, 0, rf);
ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
@@ -2427,7 +2392,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
/*
- * We're setting the cpu for the first time, we don't migrate,
+ * We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
*/
__set_task_cpu(p, cpu);
@@ -2570,7 +2535,7 @@ void wake_up_new_task(struct task_struct *p)
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
- * - any previously selected cpu might disappear through hotplug
+ * - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
@@ -2578,6 +2543,7 @@ void wake_up_new_task(struct task_struct *p)
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
activate_task(rq, p, 0);
@@ -2590,9 +2556,9 @@ void wake_up_new_task(struct task_struct *p)
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
- lockdep_unpin_lock(&rq->lock, rf.cookie);
+ rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
- lockdep_repin_lock(&rq->lock, rf.cookie);
+ rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
@@ -2861,7 +2827,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
*/
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
- struct task_struct *next, struct pin_cookie cookie)
+ struct task_struct *next, struct rq_flags *rf)
{
struct mm_struct *mm, *oldmm;
@@ -2887,13 +2853,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
+
+ rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+
/*
* Since the runqueue lock will be released by the next
* task (which is an invalid locking op but in the case
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq_unpin_lock(rq, rf);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack. */
@@ -2920,7 +2889,7 @@ unsigned long nr_running(void)
}
/*
- * Check if only the current task is running on the cpu.
+ * Check if only the current task is running on the CPU.
*
* Caution: this function does not check that the caller has disabled
* preemption, thus the result might have a time-of-check-to-time-of-use
@@ -2949,6 +2918,36 @@ unsigned long long nr_context_switches(void)
return sum;
}
+/*
+ * IO-wait accounting, and how its mostly bollocks (on SMP).
+ *
+ * The idea behind IO-wait account is to account the idle time that we could
+ * have spend running if it were not for IO. That is, if we were to improve the
+ * storage performance, we'd have a proportional reduction in IO-wait time.
+ *
+ * This all works nicely on UP, where, when a task blocks on IO, we account
+ * idle time as IO-wait, because if the storage were faster, it could've been
+ * running and we'd not be idle.
+ *
+ * This has been extended to SMP, by doing the same for each CPU. This however
+ * is broken.
+ *
+ * Imagine for instance the case where two tasks block on one CPU, only the one
+ * CPU will have IO-wait accounted, while the other has regular idle. Even
+ * though, if the storage were faster, both could've ran at the same time,
+ * utilising both CPUs.
+ *
+ * This means, that when looking globally, the current IO-wait accounting on
+ * SMP is a lower bound, by reason of under accounting.
+ *
+ * Worse, since the numbers are provided per CPU, they are sometimes
+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
+ * associated with any one particular CPU, it can wake to another CPU than it
+ * blocked on. This means the per CPU IO-wait number is meaningless.
+ *
+ * Task CPU affinities can make all that even more 'interesting'.
+ */
+
unsigned long nr_iowait(void)
{
unsigned long i, sum = 0;
@@ -2959,6 +2958,13 @@ unsigned long nr_iowait(void)
return sum;
}
+/*
+ * Consumers of these two interfaces, like for example the cpufreq menu
+ * governor are using nonsensical data. Boosting frequency for a CPU that has
+ * IO-wait which might not even end up running the task when it does become
+ * runnable.
+ */
+
unsigned long nr_iowait_cpu(int cpu)
{
struct rq *this = cpu_rq(cpu);
@@ -3042,8 +3048,8 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* So we have a optimization chance when the task's delta_exec is 0.
* Reading ->on_cpu is racy, but this is ok.
*
- * If we race with it leaving cpu, we'll take a lock. So we're correct.
- * If we race with it entering cpu, unaccounted time is 0. This is
+ * If we race with it leaving CPU, we'll take a lock. So we're correct.
+ * If we race with it entering CPU, unaccounted time is 0. This is
* indistinguishable from the read occurring a few cycles earlier.
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
@@ -3257,31 +3263,30 @@ static inline void schedule_debug(struct task_struct *prev)
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- const struct sched_class *class = &fair_sched_class;
+ const struct sched_class *class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(prev->sched_class == class &&
- rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev, cookie);
+ if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+ p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto again;
- /* assumes fair_sched_class->next == idle_sched_class */
+ /* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev, cookie);
+ p = idle_sched_class.pick_next_task(rq, prev, rf);
return p;
}
again:
for_each_class(class) {
- p = class->pick_next_task(rq, prev, cookie);
+ p = class->pick_next_task(rq, prev, rf);
if (p) {
if (unlikely(p == RETRY_TASK))
goto again;
@@ -3289,7 +3294,8 @@ again:
}
}
- BUG(); /* the idle class will always have a runnable task */
+ /* The idle class should always have a runnable task: */
+ BUG();
}
/*
@@ -3335,7 +3341,7 @@ static void __sched notrace __schedule(bool preempt)
{
struct task_struct *prev, *next;
unsigned long *switch_count;
- struct pin_cookie cookie;
+ struct rq_flags rf;
struct rq *rq;
int cpu;
@@ -3358,9 +3364,10 @@ static void __sched notrace __schedule(bool preempt)
*/
smp_mb__before_spinlock();
raw_spin_lock(&rq->lock);
- cookie = lockdep_pin_lock(&rq->lock);
+ rq_pin_lock(rq, &rf);
- rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+ /* Promote REQ to ACT */
+ rq->clock_update_flags <<= 1;
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
@@ -3370,6 +3377,11 @@ static void __sched notrace __schedule(bool preempt)
deactivate_task(rq, prev, DEQUEUE_SLEEP);
prev->on_rq = 0;
+ if (prev->in_iowait) {
+ atomic_inc(&rq->nr_iowait);
+ delayacct_blkio_start();
+ }
+
/*
* If a worker went to sleep, notify and ask workqueue
* whether it wants to wake up a task to maintain
@@ -3380,7 +3392,7 @@ static void __sched notrace __schedule(bool preempt)
to_wakeup = wq_worker_sleeping(prev);
if (to_wakeup)
- try_to_wake_up_local(to_wakeup, cookie);
+ try_to_wake_up_local(to_wakeup, &rf);
}
}
switch_count = &prev->nvcsw;
@@ -3389,10 +3401,9 @@ static void __sched notrace __schedule(bool preempt)
if (task_on_rq_queued(prev))
update_rq_clock(rq);
- next = pick_next_task(rq, prev, cookie);
+ next = pick_next_task(rq, prev, &rf);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
- rq->clock_skip_update = 0;
if (likely(prev != next)) {
rq->nr_switches++;
@@ -3400,9 +3411,12 @@ static void __sched notrace __schedule(bool preempt)
++*switch_count;
trace_sched_switch(preempt, prev, next);
- rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
+
+ /* Also unlocks the rq: */
+ rq = context_switch(rq, prev, next, &rf);
} else {
- lockdep_unpin_lock(&rq->lock, cookie);
+ rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
+ rq_unpin_lock(rq, &rf);
raw_spin_unlock_irq(&rq->lock);
}
@@ -3426,14 +3440,18 @@ void __noreturn do_task_dead(void)
smp_mb();
raw_spin_unlock_wait(&current->pi_lock);
- /* causes final put_task_struct in finish_task_switch(). */
+ /* Causes final put_task_struct in finish_task_switch(): */
__set_current_state(TASK_DEAD);
- current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
+
+ /* Tell freezer to ignore us: */
+ current->flags |= PF_NOFREEZE;
+
__schedule(false);
BUG();
- /* Avoid "noreturn function does return". */
+
+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
for (;;)
- cpu_relax(); /* For when BUG is null */
+ cpu_relax();
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -3651,6 +3669,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
BUG_ON(prio > MAX_PRIO);
rq = __task_rq_lock(p, &rf);
+ update_rq_clock(rq);
/*
* Idle task boosting is a nono in general. There is one
@@ -3725,7 +3744,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
- preempt_disable(); /* avoid rq from going away on us */
+ /* Avoid rq from going away on us: */
+ preempt_disable();
__task_rq_unlock(rq, &rf);
balance_callback(rq);
@@ -3747,6 +3767,8 @@ void set_user_nice(struct task_struct *p, long nice)
* the task might be in the middle of scheduling on another CPU.
*/
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -3793,7 +3815,7 @@ EXPORT_SYMBOL(set_user_nice);
*/
int can_nice(const struct task_struct *p, const int nice)
{
- /* convert nice value [19,-20] to rlimit style value [1,40] */
+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);
return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
@@ -3849,7 +3871,7 @@ int task_prio(const struct task_struct *p)
}
/**
- * idle_cpu - is a given cpu idle currently?
+ * idle_cpu - is a given CPU idle currently?
* @cpu: the processor in question.
*
* Return: 1 if the CPU is currently idle. 0 otherwise.
@@ -3873,10 +3895,10 @@ int idle_cpu(int cpu)
}
/**
- * idle_task - return the idle task for a given cpu.
+ * idle_task - return the idle task for a given CPU.
* @cpu: the processor in question.
*
- * Return: The idle task for the cpu @cpu.
+ * Return: The idle task for the CPU @cpu.
*/
struct task_struct *idle_task(int cpu)
{
@@ -4042,7 +4064,7 @@ __checkparam_dl(const struct sched_attr *attr)
}
/*
- * check the target process has a UID that matches the current process's
+ * Check the target process has a UID that matches the current process's:
*/
static bool check_same_owner(struct task_struct *p)
{
@@ -4057,8 +4079,7 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
-static bool dl_param_changed(struct task_struct *p,
- const struct sched_attr *attr)
+static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -4085,10 +4106,10 @@ static int __sched_setscheduler(struct task_struct *p,
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq;
- /* may grab non-irq protected spin_locks */
+ /* May grab non-irq protected spin_locks: */
BUG_ON(in_interrupt());
recheck:
- /* double check policy once rq lock held */
+ /* Double check policy once rq lock held: */
if (policy < 0) {
reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
@@ -4128,11 +4149,11 @@ recheck:
unsigned long rlim_rtprio =
task_rlimit(p, RLIMIT_RTPRIO);
- /* can't set/change the rt policy */
+ /* Can't set/change the rt policy: */
if (policy != p->policy && !rlim_rtprio)
return -EPERM;
- /* can't increase priority */
+ /* Can't increase priority: */
if (attr->sched_priority > p->rt_priority &&
attr->sched_priority > rlim_rtprio)
return -EPERM;
@@ -4156,11 +4177,11 @@ recheck: