summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c109
-rw-r--r--kernel/sched/cputime.c101
-rw-r--r--kernel/sched/deadline.c40
-rw-r--r--kernel/sched/debug.c48
-rw-r--r--kernel/sched/fair.c937
-rw-r--r--kernel/sched/features.h18
-rw-r--r--kernel/sched/idle.c14
-rw-r--r--kernel/sched/idle_task.c1
-rw-r--r--kernel/sched/rt.c42
-rw-r--r--kernel/sched/sched.h39
-rw-r--r--kernel/sched/stop_task.c1
11 files changed, 566 insertions, 784 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9673433cc01..7819725e9da8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1151,15 +1151,45 @@ static int migration_cpu_stop(void *data)
return 0;
}
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+/*
+ * sched_class::set_cpus_allowed must do the below, but is not required to
+ * actually call this function.
+ */
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
cpumask_copy(&p->cpus_allowed, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ struct rq *rq = task_rq(p);
+ bool queued, running;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+
+ if (queued) {
+ /*
+ * Because __kthread_bind() calls this on blocked tasks without
+ * holding rq->lock.
+ */
+ lockdep_assert_held(&rq->lock);
+ dequeue_task(rq, p, 0);
+ }
+ if (running)
+ put_prev_task(rq, p);
+
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (queued)
+ enqueue_task(rq, p, 0);
+}
+
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
@@ -1169,7 +1199,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
{
unsigned long flags;
struct rq *rq;
@@ -1178,6 +1209,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
rq = task_rq_lock(p, &flags);
+ /*
+ * Must re-check here, to close a race against __kthread_bind(),
+ * sched_setaffinity() is not guaranteed to observe the flag.
+ */
+ if (check && (p->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (cpumask_equal(&p->cpus_allowed, new_mask))
goto out;
@@ -1214,6 +1254,11 @@ out:
return ret;
}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ return __set_cpus_allowed_ptr(p, new_mask, false);
+}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1595,6 +1640,15 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg;
*avg += diff >> 3;
}
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, bool check)
+{
+ return set_cpus_allowed_ptr(p, new_mask);
+}
+
#endif /* CONFIG_SMP */
static void
@@ -1654,9 +1708,9 @@ static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
check_preempt_curr(rq, p, wake_flags);
- trace_sched_wakeup(p, true);
-
p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
@@ -1874,6 +1928,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (!(p->state & state))
goto out;
+ trace_sched_waking(p);
+
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
@@ -1949,6 +2005,8 @@ static void try_to_wake_up_local(struct task_struct *p)
if (!(p->state & TASK_NORMAL))
goto out;
+ trace_sched_waking(p);
+
if (!task_on_rq_queued(p))
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
@@ -2016,9 +2074,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
p->se.vruntime = 0;
-#ifdef CONFIG_SMP
- p->se.avg.decay_count = 0;
-#endif
INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
@@ -2200,8 +2255,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
#ifdef CONFIG_SMP
inline struct dl_bw *dl_bw_of(int i)
{
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
return &cpu_rq(i)->rd->dl_bw;
}
@@ -2210,8 +2265,8 @@ static inline int dl_bw_cpus(int i)
struct root_domain *rd = cpu_rq(i)->rd;
int cpus = 0;
- rcu_lockdep_assert(rcu_read_lock_sched_held(),
- "sched RCU must be held");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
for_each_cpu_and(i, rd->span, cpu_active_mask)
cpus++;
@@ -2303,11 +2358,11 @@ void wake_up_new_task(struct task_struct *p)
#endif
/* Initialize new task's runnable average */
- init_task_runnable_average(p);
+ init_entity_runnable_average(&p->se);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
- trace_sched_wakeup_new(p, true);
+ trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
@@ -2469,7 +2524,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
*/
prev_state = prev->state;
vtime_task_switch(prev);
- finish_arch_switch(prev);
perf_event_task_sched_in(prev, current);
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -4340,7 +4394,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
}
#endif
again:
- retval = set_cpus_allowed_ptr(p, new_mask);
+ retval = __set_cpus_allowed_ptr(p, new_mask, true);
if (!retval) {
cpuset_cpus_allowed(p, cpus_allowed);
@@ -4492,7 +4546,7 @@ SYSCALL_DEFINE0(sched_yield)
int __sched _cond_resched(void)
{
- if (should_resched()) {
+ if (should_resched(0)) {
preempt_schedule_common();
return 1;
}
@@ -4510,7 +4564,7 @@ EXPORT_SYMBOL(_cond_resched);
*/
int __cond_resched_lock(spinlock_t *lock)
{
- int resched = should_resched();
+ int resched = should_resched(PREEMPT_LOCK_OFFSET);
int ret = 0;
lockdep_assert_held(lock);
@@ -4532,7 +4586,7 @@ int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
- if (should_resched()) {
+ if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
local_bh_enable();
preempt_schedule_common();
local_bh_disable();
@@ -4865,7 +4919,8 @@ void init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock_irqsave(&idle->pi_lock, flags);
+ raw_spin_lock(&rq->lock);
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
@@ -4891,7 +4946,8 @@ void init_idle(struct task_struct *idle, int cpu)
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
init_idle_preempt_count(idle, cpu);
@@ -5311,8 +5367,7 @@ static void register_sched_domain_sysctl(void)
/* may be called multiple times per register */
static void unregister_sched_domain_sysctl(void)
{
- if (sd_sysctl_header)
- unregister_sysctl_table(sd_sysctl_header);
+ unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
if (sd_ctl_dir[0].child)
sd_free_ctl_entry(&sd_ctl_dir[0].child);
@@ -6453,8 +6508,10 @@ static void init_numa_topology_type(void)
n = sched_max_numa_distance;
- if (n <= 1)
+ if (sched_domains_numa_levels <= 1) {
sched_numa_topology_type = NUMA_DIRECT;
+ return;
+ }
for_each_online_node(a) {
for_each_online_node(b) {
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..8cbc3db671df 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,48 +555,43 @@ drop_precision:
}
/*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
*
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
- cputime_t old;
-
- while (new > (old = READ_ONCE(*counter)))
- cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depend on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ * stime + utime == rtime
+ * stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
*/
static void cputime_adjust(struct task_cputime *curr,
- struct cputime *prev,
+ struct prev_cputime *prev,
cputime_t *ut, cputime_t *st)
{
cputime_t rtime, stime, utime;
+ unsigned long flags;
- /*
- * Tick based cputime accounting depend on random scheduling
- * timeslices of a task to be interrupted or not by the timer.
- * Depending on these circumstances, the number of these interrupts
- * may be over or under-optimistic, matching the real user and system
- * cputime with a variable precision.
- *
- * Fix this by scaling these tick based values against the total
- * runtime accounted by the CFS scheduler.
- */
+ /* Serialize concurrent callers such that we can honour our guarantees */
+ raw_spin_lock_irqsave(&prev->lock, flags);
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
/*
- * Update userspace visible utime/stime values only if actual execution
- * time is bigger than already exported. Note that can happen, that we
- * provided bigger values due to scaling inaccuracy on big numbers.
+ * This is possible under two circumstances:
+ * - rtime isn't monotonic after all (a bug);
+ * - we got reordered by the lock.
+ *
+ * In both cases this acts as a filter such that the rest of the code
+ * can assume it is monotonic regardless of anything else.
*/
if (prev->stime + prev->utime >= rtime)
goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
if (utime == 0) {
stime = rtime;
- } else if (stime == 0) {
- utime = rtime;
- } else {
- cputime_t total = stime + utime;
+ goto update;
+ }
- stime = scale_stime((__force u64)stime,
- (__force u64)rtime, (__force u64)total);
- utime = rtime - stime;
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
}
- cputime_advance(&prev->stime, stime);
- cputime_advance(&prev->utime, utime);
+ stime = scale_stime((__force u64)stime, (__force u64)rtime,
+ (__force u64)(stime + utime));
+
+ /*
+ * Make sure stime doesn't go backwards; this preserves monotonicity
+ * for utime because rtime is monotonic.
+ *
+ * utime_i+1 = rtime_i+1 - stime_i
+ * = rtime_i+1 - (rtime_i - utime_i)
+ * = (rtime_i+1 - rtime_i) + utime_i
+ * >= utime_i
+ */
+ if (stime < prev->stime)
+ stime = prev->stime;
+ utime = rtime - stime;
+
+ /*
+ * Make sure utime doesn't go backwards; this still preserves
+ * monotonicity for stime, analogous argument to above.
+ */
+ if (utime < prev->utime) {
+ utime = prev->utime;
+ stime = rtime - utime;
+ }
+update:
+ prev->stime = stime;
+ prev->utime = utime;
out:
*ut = prev->utime;
*st = prev->stime;
+ raw_spin_unlock_irqrestore(&prev->lock, flags);
}
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0a17af35670a..fc8f01083527 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -953,7 +953,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
/*
* Use the scheduling parameters of the top pi-waiter
- * task if we have one and its (relative) deadline is
+ * task if we have one and its (absolute) deadline is
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
@@ -1563,7 +1563,7 @@ out:
static void push_dl_tasks(struct rq *rq)
{
- /* Terminates as it moves a -deadline task */
+ /* push_dl_task() will return true if it moved a -deadline task */
while (push_dl_task(rq))
;
}
@@ -1657,7 +1657,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
- has_pushable_dl_tasks(rq) &&
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
@@ -1669,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
{
- struct rq *rq;
struct root_domain *src_rd;
- int weight;
+ struct rq *rq;
BUG_ON(!dl_task(p));
@@ -1697,37 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
raw_spin_unlock(&src_dl_b->lock);
}
- /*
- * Update only if the task is actually running (i.e.,
- * it is on the rq AND it is not throttled).
- */
- if (!on_dl_rq(&p->dl))
- return;
-
- weight = cpumask_weight(new_mask);
-
- /*
- * Only update if the process changes its state from whether it
- * can migrate or not.
- */
- if ((p->nr_cpus_allowed > 1) == (weight > 1))
- return;
-
- /*
- * The process used to be able to migrate OR it can now migrate
- */
- if (weight <= 1) {
- if (!task_current(rq, p))
- dequeue_pushable_dl_task(rq, p);
- BUG_ON(!rq->dl.dl_nr_migratory);
- rq->dl.dl_nr_migratory--;
- } else {
- if (!task_current(rq, p))
- enqueue_pushable_dl_task(rq, p);
- rq->dl.dl_nr_migratory++;
- }
-
- update_dl_migration(&rq->dl);
+ set_cpus_allowed_common(p, new_mask);
}
/* Assumes rq->lock is held */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4222ec50ab88..641511771ae6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#define PN(F) \
SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
- if (!se) {
- struct sched_avg *avg = &cpu_rq(cpu)->avg;
- P(avg->runnable_avg_sum);
- P(avg->avg_period);
+ if (!se)
return;
- }
-
PN(se->exec_start);
PN(se->vruntime);
@@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
- P(se->avg.runnable_avg_sum);
- P(se->avg.running_avg_sum);
- P(se->avg.avg_period);
- P(se->avg.load_avg_contrib);
- P(se->avg.utilization_avg_contrib);
- P(se->avg.decay_count);
+ P(se->avg.load_avg);
+ P(se->avg.util_avg);
#endif
#undef PN
#undef P
@@ -214,21 +205,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg",
+ SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
+ cfs_rq->avg.load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg",
- cfs_rq->blocked_load_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg",
- cfs_rq->utilization_load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
+ cfs_rq->avg.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
+ atomic_long_read(&cfs_rq->removed_load_avg));
+ SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
+ atomic_long_read(&cfs_rq->removed_util_avg));
#ifdef CONFIG_FAIR_GROUP_SCHED
- SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib",
- cfs_rq->tg_load_contrib);
- SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
- cfs_rq->tg_runnable_contrib);
+ SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
+ cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
atomic_long_read(&cfs_rq->tg->load_avg));
- SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
- atomic_read(&cfs_rq->tg->runnable_avg));
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -636,12 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.load.weight);
#ifdef CONFIG_SMP
- P(se.avg.runnable_avg_sum);
- P(se.avg.running_avg_sum);
- P(se.avg.avg_period);
- P(se.avg.load_avg_contrib);
- P(se.avg.utilization_avg_contrib);
- P(se.avg.decay_count);
+ P(se.avg.load_sum);
+ P(se.avg.util_sum);
+ P(se.avg.load_avg);
+ P(se.avg.util_avg);
+ P(se.avg.last_update_time);
#endif
P(policy);
P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d113c3ba8bc4..6e2e3483b1ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
- int force_update);
-
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
if (!cfs_rq->on_list) {
@@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
cfs_rq->on_list = 1;
- /* We should have no load, but we need to update last_decay. */
- update_cfs_rq_blocked_load(cfs_rq, 0);
}
}
@@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
*/
static u64 __sched_period(unsigned long nr_running)
{
- u64 period = sysctl_sched_latency;
- unsigned long nr_latency = sched_nr_latency;
-
- if (unlikely(nr_running > nr_latency)) {
- period = sysctl_sched_min_granularity;
- period *= nr_running;
- }
-
- return period;
+ if (unlikely(nr_running > sched_nr_latency))
+ return nr_running * sysctl_sched_min_granularity;
+ else
+ return sysctl_sched_latency;
}
/*
@@ -669,22 +659,37 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int select_idle_sibling(struct task_struct *p, int cpu);
static unsigned long task_h_load(struct task_struct *p);
-static inline void __update_task_entity_contrib(struct sched_entity *se);
-static inline void __update_task_entity_utilization(struct sched_entity *se);
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
-/* Give new task start runnable values to heavy its load in infant time */
-void init_task_runnable_average(struct task_struct *p)
+/* Give new sched_entity start runnable values to heavy its load in infant time */
+void init_entity_runnable_average(struct sched_entity *se)
{
- u32 slice;
+ struct sched_avg *sa = &se->avg;
- slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
- p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
- p->se.avg.avg_period = slice;
- __update_task_entity_contrib(&p->se);
- __update_task_entity_utilization(&p->se);
+ sa->last_update_time = 0;
+ /*
+ * sched_avg's period_contrib should be strictly less then 1024, so
+ * we give it 1023 to make sure it is almost a period (1024us), and
+ * will definitely be update (after enqueue).
+ */
+ sa->period_contrib = 1023;
+ sa->load_avg = scale_load_down(se->load.weight);
+ sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+ sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+ sa->util_sum = LOAD_AVG_MAX;
+ /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
-void init_task_runnable_average(struct task_struct *p)
+void init_entity_runnable_average(struct sched_entity *se)
{
}
#endif
@@ -1415,8 +1420,9 @@ static bool numa_has_capacity(struct task_numa_env *env)
* --------------------- vs ---------------------
* src->compute_capacity dst->compute_capacity
*/
- if (src->load * dst->compute_capacity >
- dst->load * src->compute_capacity)
+ if (src->load * dst->compute_capacity * env->imbalance_pct >
+
+ dst->load * src->compute_capacity * 100)
return true;
return false;
@@ -1702,8 +1708,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.runnable_avg_sum;
- *period = p->se.avg.avg_period;
+ delta = p->se.avg.load_sum / p->se.load.weight;
+ *period = LOAD_AVG_MAX;
}
p->last_sum_exec_runtime = runtime;
@@ -2351,13 +2357,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
long tg_weight;
/*
- * Use this CPU's actual weight instead of the last load_contribution
- * to gain a more accurate current total weight. See
- * update_cfs_rq_load_contribution().
+ * Use this CPU's real-time load instead of the last load contribution
+ * as the updating of the contribution is delayed, and we will use the
+ * the real-time load to calc the share. See update_tg_load_avg().
*/
tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_contrib;
- tg_weight += cfs_rq->load.weight;
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += cfs_rq_load_avg(cfs_rq);
return tg_weight;
}
@@ -2367,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
long tg_weight, load, shares;
tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ load = cfs_rq_load_avg(cfs_rq);
shares = (tg->shares * load);
if (tg_weight)
@@ -2429,14 +2435,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
-
/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
@@ -2485,9 +2483,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
local_n %= LOAD_AVG_PERIOD;
}
- val *= runnable_avg_yN_inv[local_n];
- /* We don't use SRR here since we always want to round down. */
- return val >> 32;
+ val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
+ return val;
}
/*
@@ -2546,23 +2543,22 @@ static u32 __compute_runnable_contrib(u64 n)
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
-static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
- struct sched_avg *sa,
- int runnable,
- int running)
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
u64 delta, periods;
- u32 runnable_contrib;
+ u32 contrib;
int delta_w, decayed = 0;
unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
- delta = now - sa->last_runnable_update;
+ delta = now - sa->last_update_time;
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
*/
if ((s64)delta < 0) {
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
return 0;
}
@@ -2573,26 +2569,29 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
delta >>= 10;
if (!delta)
return 0;
- sa->last_runnable_update = now;
+ sa->last_update_time = now;
/* delta_w is the amount already accumulated against our next period */
- delta_w = sa->avg_period % 1024;
+ delta_w = sa->period_contrib;
if (delta + delta_w >= 1024) {
- /* period roll-over */
decayed = 1;
+ /* how much left for next period will start over, we don't know yet */
+ sa->period_contrib = 0;
+
/*
* Now that we know we're crossing a period boundary, figure
* out how much from delta we need to complete the current
* period and accrue it.
*/
delta_w = 1024 - delta_w;
- if (runnable)
- sa->runnable_avg_sum += delta_w;
+ if (weight) {
+ sa->load_sum += weight * delta_w;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * delta_w;
+ }
if (running)
- sa->running_avg_sum += delta_w * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += delta_w;
+ sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
delta -= delta_w;
@@ -2600,341 +2599,186 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
periods = delta / 1024;
delta %= 1024;
- sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
- periods + 1);
- sa->running_avg_sum = decay_load(sa->running_avg_sum,
- periods + 1);
- sa->avg_period = decay_load(sa->avg_period,
- periods + 1);
+ sa->load_sum = decay_load(sa->load_sum, periods + 1);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods + 1);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
- runnable_contrib = __compute_runnable_contrib(periods);
- if (runnable)
- sa->runnable_avg_sum += runnable_contrib;
+ contrib = __compute_runnable_contrib(periods);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
if (running)
- sa->running_avg_sum += runnable_contrib * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += runnable_contrib;
+ sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
}
/* Remainder of delta accrued against u_0` */
- if (runnable)
- sa->runnable_avg_sum += delta;
+ if (weight) {
+ sa->load_sum += weight * delta;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * delta;
+ }
if (running)
- sa->running_avg_sum += delta * scale_freq
- >> SCHED_CAPACITY_SHIFT;
- sa->avg_period += delta;
-
- return decayed;
-}
+ sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
-/* Synchronize an entity's decay with its parenting cfs_rq.*/
-static inline u64 __synchronize_entity_decay(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 decays = atomic64_read(&cfs_rq->decay_counter);
-
- decays -= se->avg.decay_count;
- se->avg.decay_count = 0;
- if (!decays)
- return 0;
+ sa->period_contrib += delta;
- se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
- se->avg.utilization_avg_contrib =
- decay_load(se->avg.utilization_avg_contrib, decays);
+ if (decayed) {
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+ }
+ sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
+ }
- return decays;
+ return decayed;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update)
-{
- struct task_group *tg = cfs_rq->tg;
- long tg_contrib;
-
- tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
- tg_contrib -= cfs_rq->tg_load_contrib;
-
- if (!tg_contrib)
- return;
-
- if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
- atomic_long_add(tg_contrib, &tg->load_avg);
- cfs_rq->tg_load_contrib += tg_contrib;
- }
-}
-
/*
- * Aggregate cfs_rq runnable averages into an equivalent task_group
- * representation for computing load contributions.
+ * Updating tg's load_avg is necessary before update_cfs_share (which is done)
+ * and effective_load (which is not done because it is too costly).
*/
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
- struct task_group *tg = cfs_rq->tg;
- long contrib;
+ long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
- /* The fraction of a cpu used by this cfs_rq */
- contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
- sa->avg_period + 1);
- contrib -= cfs_rq->tg_runnable_contrib;
-
- if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
- atomic_add(contrib, &tg->runnable_avg);
- cfs_rq->tg_runnable_contrib += contrib;
+ if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
-static inline void __update_group_entity_contrib(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
- struct task_group *tg = cfs_rq->tg;
- int runnable_avg;
-
- u64 contrib;
-
- contrib = cfs_rq->tg_load_contrib * tg->shares;
- se->avg.load_avg_contrib = div_u64(contrib,
- atomic_long_read(&tg->load_avg) + 1);
-
- /*
- * For group entities we need to compute a correction term in the case
- * that they are consuming <1 cpu so that we would contribute the same
- * load as a task of equal weight.
- *
- * Explicitly co-ordinating this measurement would be expensive, but
- * fortunately the sum of each cpus contribution forms a usable
- * lower-bound on the true value.
- *
- * Consider the aggregate of 2 contributions. Either they are disjoint
- * (and the sum represents true value) or they are disjoint and we are
- * understating by the aggregate of their overlap.
- *
- * Extending this to N cpus, for a given overlap, the maximum amount we
- * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
- * cpus that overlap for this interval and w_i is the interval width.
- *
- * On a small machine; the first term is well-bounded which bounds the
- * total error since w_i is a subset of the period. Whereas on a
- * larger machine, while this first term can be larger, if w_i is the
- * of consequential size guaranteed to see n_i*w_i quickly converge to
- * our upper bound of 1-cpu.
- */
- runnable_avg = atomic_read(&tg->runnable_avg);
- if (runnable_avg < NICE_0_LOAD) {
- se->avg.load_avg_contrib *= runnable_avg;
- se->avg.load_avg_contrib >>= NICE_0_SHIFT;
- }
-}
-
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
-{
- __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
- runnable, runnable);
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
-}
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
- int force_update) {}
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
- struct cfs_rq *cfs_rq) {}
-static inline void __update_group_entity_contrib(struct sched_entity *se) {}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void __update_task_entity_contrib(struct sched_entity *se)
-{
- u32 contri