summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-11-26 15:23:14 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-11-26 15:23:14 -0800
commit77a05940eee7e9891cd6add7a690a3e762ee21b0 (patch)
tree74f4f9b315659093059261a2da8adf7c1aa2a92f /kernel/sched
parent3f59dbcace56fae7e4ed303bab90f1bedadcfdf4 (diff)
parentde881a341c4143650fa50ce95cf450a5c94faa9f (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The biggest changes in this cycle were: - Make kcpustat vtime aware (Frederic Weisbecker) - Rework the CFS load_balance() logic (Vincent Guittot) - Misc cleanups, smaller enhancements, fixes. The load-balancing rework is the most intrusive change: it replaces the old heuristics that have become less meaningful after the introduction of the PELT metrics, with a grounds-up load-balancing algorithm. As such it's not really an iterative series, but replaces the old load-balancing logic with the new one. We hope there are no performance regressions left - but statistically it's highly probable that there *is* going to be some workload that is hurting from these chnages. If so then we'd prefer to have a look at that workload and fix its scheduling, instead of reverting the changes" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits) rackmeter: Use vtime aware kcpustat accessor leds: Use all-in-one vtime aware kcpustat accessor cpufreq: Use vtime aware kcpustat accessors for user time procfs: Use all-in-one vtime aware kcpustat accessor sched/vtime: Bring up complete kcpustat accessor sched/cputime: Support other fields on kcpustat_field() sched/cpufreq: Move the cfs_rq_util_change() call to cpufreq_update_util() sched/fair: Add comments for group_type and balancing at SD_NUMA level sched/fair: Fix rework of find_idlest_group() sched/uclamp: Fix overzealous type replacement sched/Kconfig: Fix spelling mistake in user-visible help text sched/core: Further clarify sched_class::set_next_task() sched/fair: Use mul_u32_u32() sched/core: Simplify sched_class::pick_next_task() sched/core: Optimize pick_next_task() sched/core: Make pick_next_task_idle() more consistent sched/fair: Better document newidle_balance() leds: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM cpufreq: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM procfs: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM ...
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c18
-rw-r--r--kernel/sched/cputime.c288
-rw-r--r--kernel/sched/deadline.c12
-rw-r--r--kernel/sched/fair.c1427
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/idle.c10
-rw-r--r--kernel/sched/rt.c12
-rw-r--r--kernel/sched/sched.h25
-rw-r--r--kernel/sched/stop_task.c9
-rw-r--r--kernel/sched/topology.c9
10 files changed, 1172 insertions, 639 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 80b60ca7767f..d82e2f6ac41d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -811,7 +811,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
-static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -854,7 +854,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
}
static inline
-enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
@@ -919,7 +919,7 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}
-enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
+unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
@@ -3918,13 +3918,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev, rf);
+ p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
- if (unlikely(!p))
- p = idle_sched_class.pick_next_task(rq, prev, rf);
+ if (!p) {
+ put_prev_task(rq, prev);
+ p = pick_next_task_idle(rq);
+ }
return p;
}
@@ -3948,7 +3950,7 @@ restart:
put_prev_task(rq, prev);
for_each_class(class) {
- p = class->pick_next_task(rq, NULL, NULL);
+ p = class->pick_next_task(rq);
if (p)
return p;
}
@@ -6217,7 +6219,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
struct task_struct *next;
for_each_class(class) {
- next = class->pick_next_task(rq, NULL, NULL);
+ next = class->pick_next_task(rq);
if (next) {
next->sched_class->put_prev_task(rq, next);
return next;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 46ed4e1383e2..d43318a489f2 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -405,27 +405,25 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
/*
* Use precise platform statistics if available:
*/
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
+void vtime_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
vtime_account_idle(prev);
else
- vtime_account_system(prev);
+ vtime_account_kernel(prev);
vtime_flush(prev);
arch_vtime_task_switch(prev);
}
# endif
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
+ * vtime_account_kernel() and vtime_account_idle(). Archs that
* have other meaning of the idle time (s390 only includes the
* time spent by the CPU when it's in low power mode) must override
* vtime_account().
@@ -436,7 +434,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
if (!in_interrupt() && is_idle_task(tsk))
vtime_account_idle(tsk);
else
- vtime_account_system(tsk);
+ vtime_account_kernel(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@@ -477,7 +475,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
u64 cputime, steal;
struct rq *rq = this_rq();
- if (vtime_accounting_cpu_enabled())
+ if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
@@ -711,8 +709,8 @@ static u64 get_vtime_delta(struct vtime *vtime)
return delta - other;
}
-static void __vtime_account_system(struct task_struct *tsk,
- struct vtime *vtime)
+static void vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
{
vtime->stime += get_vtime_delta(vtime);
if (vtime->stime >= TICK_NSEC) {
@@ -731,7 +729,17 @@ static void vtime_account_guest(struct task_struct *tsk,
}
}
-void vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_kernel(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ /* We might have scheduled out from guest path */
+ if (vtime->state == VTIME_GUEST)
+ vtime_account_guest(tsk, vtime);
+ else
+ vtime_account_system(tsk, vtime);
+}
+
+void vtime_account_kernel(struct task_struct *tsk)
{
struct vtime *vtime = &tsk->vtime;
@@ -739,11 +747,7 @@ void vtime_account_system(struct task_struct *tsk)
return;
write_seqcount_begin(&vtime->seqcount);
- /* We might have scheduled out from guest path */
- if (tsk->flags & PF_VCPU)
- vtime_account_guest(tsk, vtime);
- else
- __vtime_account_system(tsk, vtime);
+ __vtime_account_kernel(tsk, vtime);
write_seqcount_end(&vtime->seqcount);
}
@@ -752,7 +756,7 @@ void vtime_user_enter(struct task_struct *tsk)
struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
+ vtime_account_system(tsk, vtime);
vtime->state = VTIME_USER;
write_seqcount_end(&vtime->seqcount);
}
@@ -782,8 +786,9 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta.
*/
write_seqcount_begin(&vtime->seqcount);
- __vtime_account_system(tsk, vtime);
+ vtime_account_system(tsk, vtime);
tsk->flags |= PF_VCPU;
+ vtime->state = VTIME_GUEST;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@@ -795,6 +800,7 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
tsk->flags &= ~PF_VCPU;
+ vtime->state = VTIME_SYS;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@@ -804,19 +810,30 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(get_vtime_delta(&tsk->vtime));
}
-void arch_vtime_task_switch(struct task_struct *prev)
+void vtime_task_switch_generic(struct task_struct *prev)
{
struct vtime *vtime = &prev->vtime;
write_seqcount_begin(&vtime->seqcount);
+ if (vtime->state == VTIME_IDLE)
+ vtime_account_idle(prev);
+ else
+ __vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
+ vtime->cpu = -1;
write_seqcount_end(&vtime->seqcount);
vtime = &current->vtime;
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ if (is_idle_task(current))
+ vtime->state = VTIME_IDLE;
+ else if (current->flags & PF_VCPU)
+ vtime->state = VTIME_GUEST;
+ else
+ vtime->state = VTIME_SYS;
vtime->starttime = sched_clock();
+ vtime->cpu = smp_processor_id();
write_seqcount_end(&vtime->seqcount);
}
@@ -827,8 +844,9 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount);
- vtime->state = VTIME_SYS;
+ vtime->state = VTIME_IDLE;
vtime->starttime = sched_clock();
+ vtime->cpu = cpu;
write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
@@ -846,7 +864,7 @@ u64 task_gtime(struct task_struct *t)
seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ if (vtime->state == VTIME_GUEST)
gtime += vtime->gtime + vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq));
@@ -877,20 +895,230 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
*utime = t->utime;
*stime = t->stime;
- /* Task is sleeping, nothing to add */
- if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
+ /* Task is sleeping or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
- * Task runs either in user or kernel space, add pending nohz time to
- * the right place.
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
*/
- if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
- *utime += vtime->utime + delta;
- else if (vtime->state == VTIME_SYS)
+ if (vtime->state == VTIME_SYS)
*stime += vtime->stime + delta;
+ else
+ *utime += vtime->utime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+}
+
+static int vtime_state_check(struct vtime *vtime, int cpu)
+{
+ /*
+ * We raced against a context switch, fetch the
+ * kcpustat task again.
+ */
+ if (vtime->cpu != cpu && vtime->cpu != -1)
+ return -EAGAIN;
+
+ /*
+ * Two possible things here:
+ * 1) We are seeing the scheduling out task (prev) or any past one.
+ * 2) We are seeing the scheduling in task (next) but it hasn't
+ * passed though vtime_task_switch() yet so the pending
+ * cputime of the prev task may not be flushed yet.
+ *
+ * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+ */
+ if (vtime->state == VTIME_INACTIVE)
+ return -EAGAIN;
+
+ return 0;
+}
+
+static u64 kcpustat_user_vtime(struct vtime *vtime)
+{
+ if (vtime->state == VTIME_USER)
+ return vtime->utime + vtime_delta(vtime);
+ else if (vtime->state == VTIME_GUEST)
+ return vtime->gtime + vtime_delta(vtime);
+ return 0;
+}
+
+static int kcpustat_field_vtime(u64 *cpustat,
+ struct task_struct *tsk,
+ enum cpu_usage_stat usage,
+ int cpu, u64 *val)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+ int err;
+
+ do {
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ err = vtime_state_check(vtime, cpu);
+ if (err < 0)
+ return err;
+
+ *val = cpustat[usage];
+
+ /*
+ * Nice VS unnice cputime accounting may be inaccurate if
+ * the nice value has changed since the last vtime update.
+ * But proper fix would involve interrupting target on nice
+ * updates which is a no go on nohz_full (although the scheduler
+ * may still interrupt the target if rescheduling is needed...)
+ */
+ switch (usage) {
+ case CPUTIME_SYSTEM:
+ if (vtime->state == VTIME_SYS)
+ *val += vtime->stime + vtime_delta(vtime);
+ break;
+ case CPUTIME_USER:
+ if (task_nice(tsk) <= 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_NICE:
+ if (task_nice(tsk) > 0)
+ *val += kcpustat_user_vtime(vtime);
+ break;
+ case CPUTIME_GUEST:
+ if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ case CPUTIME_GUEST_NICE:
+ if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
+ *val += vtime->gtime + vtime_delta(vtime);
+ break;
+ default:
+ break;
+ }
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return 0;
+}
+
+u64 kcpustat_field(struct kernel_cpustat *kcpustat,
+ enum cpu_usage_stat usage, int cpu)
+{
+ u64 *cpustat = kcpustat->cpustat;
+ struct rq *rq;
+ u64 val;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu))
+ return cpustat[usage];
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ return cpustat[usage];
+ }
+
+ err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
+ rcu_read_unlock();
+
+ if (!err)
+ return val;
+
+ cpu_relax();
+ }
+}
+EXPORT_SYMBOL_GPL(kcpustat_field);
+
+static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
+ const struct kernel_cpustat *src,
+ struct task_struct *tsk, int cpu)
+{
+ struct vtime *vtime = &tsk->vtime;
+ unsigned int seq;
+ int err;
+
+ do {
+ u64 *cpustat;
+ u64 delta;
+
+ seq = read_seqcount_begin(&vtime->seqcount);
+
+ err = vtime_state_check(vtime, cpu);
+ if (err < 0)
+ return err;
+
+ *dst = *src;
+ cpustat = dst->cpustat;
+
+ /* Task is sleeping, dead or idle, nothing to add */
+ if (vtime->state < VTIME_SYS)
+ continue;
+
+ delta = vtime_delta(vtime);
+
+ /*
+ * Task runs either in user (including guest) or kernel space,
+ * add pending nohz time to the right place.
+ */
+ if (vtime->state == VTIME_SYS) {
+ cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+ } else if (vtime->state == VTIME_USER) {
+ if (task_nice(tsk) > 0)
+ cpustat[CPUTIME_NICE] += vtime->utime + delta;
+ else
+ cpustat[CPUTIME_USER] += vtime->utime + delta;
+ } else {
+ WARN_ON_ONCE(vtime->state != VTIME_GUEST);
+ if (task_nice(tsk) > 0) {
+ cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
+ cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+ } else {
+ cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
+ cpustat[CPUTIME_USER] += vtime->gtime + delta;
+ }
+ }
} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+ return err;
+}
+
+void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
+{
+ const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
+ struct rq *rq;
+ int err;
+
+ if (!vtime_accounting_enabled_cpu(cpu)) {
+ *dst = *src;
+ return;
+ }
+
+ rq = cpu_rq(cpu);
+
+ for (;;) {
+ struct task_struct *curr;
+
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (WARN_ON_ONCE(!curr)) {
+ rcu_read_unlock();
+ *dst = *src;
+ return;
+ }
+
+ err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
+ rcu_read_unlock();
+
+ if (!err)
+ return;
+
+ cpu_relax();
+ }
}
+EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
+
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a8a08030a8f7..43323f875cb9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1743,13 +1743,16 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
-static void set_next_task_dl(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+ if (!first)
+ return;
+
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
@@ -1770,22 +1773,19 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-static struct task_struct *
-pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static struct task_struct *pick_next_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
- WARN_ON_ONCE(prev || rf);
-
if (!sched_dl_runnable(rq))
return NULL;
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task_dl(rq, p);
+ set_next_task_dl(rq, p, true);
return p;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69a81a5709ff..08a233e97a01 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
}
}
- /* hint to use a 32x32->64 mul */
- fact = (u64)(u32)fact * lw->inv_weight;
+ fact = mul_u32_u32(fact, lw->inv_weight);
while (fact >> 32) {
fact >>= 1;
@@ -1474,7 +1473,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long cpu_runnable_load(struct rq *rq);
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+
+static unsigned long cpu_runnable_load(struct rq *rq)
+{
+ return cfs_rq_runnable_load_avg(&rq->cfs);
+}
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -3504,9 +3508,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (decayed)
- cfs_rq_util_change(cfs_rq, 0);
-
return decayed;
}
@@ -3616,8 +3617,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
- } else if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ } else if (decayed) {
+ cfs_rq_util_change(cfs_rq, 0);
+
+ if (flags & UPDATE_TG)
+ update_tg_load_avg(cfs_rq, 0);
+ }
}
#ifndef CONFIG_64BIT
@@ -3764,10 +3769,21 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
return;
/*
+ * Reset EWMA on utilization increases, the moving average is used only
+ * to smooth utilization decreases.
+ */
+ ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ if (sched_feat(UTIL_EST_FASTUP)) {
+ if (ue.ewma < ue.enqueued) {
+ ue.ewma = ue.enqueued;
+ goto done;
+ }
+ }
+
+ /*
* Skip update of task's estimated utilization when its EWMA is
* already ~1% close to its last activation value.
*/
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
last_ewma_diff = ue.enqueued - ue.ewma;
if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
return;
@@ -3800,6 +3816,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
ue.ewma += last_ewma_diff;
ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
WRITE_ONCE(p->se.avg.util_est, ue);
}
@@ -5370,26 +5387,45 @@ static int sched_idle_cpu(int cpu)
rq->nr_running);
}
-static unsigned long cpu_runnable_load(struct rq *rq)
+static unsigned long cpu_load(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&rq->cfs);
+ return cfs_rq_load_avg(&rq->cfs);
}
-static unsigned long capacity_of(int cpu)
+/*
+ * cpu_load_without - compute CPU load without any contributions from *p
+ * @cpu: the CPU which load is requested
+ * @p: the task which load should be discounted
+ *
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
+ * CPU as well as tasks which are currently sleeping after an execution on that
+ * CPU.
+ *
+ * This method returns the load of the specified CPU by discounting the load of
+ * the specified task, whenever the task is currently contributing to the CPU
+ * load.
+ */
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
{
- return cpu_rq(cpu)->cpu_capacity;
-}
+ struct cfs_rq *cfs_rq;
+ unsigned int load;
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = cpu_runnable_load(rq);
+ /* Task has no contribution or is new */
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_load(rq);
- if (nr_running)
- return load_avg / nr_running;
+ cfs_rq = &rq->cfs;
+ load = READ_ONCE(cfs_rq->avg.load_avg);
- return 0;
+ /* Discount task's util from CPU's util */
+ lsub_positive(&load, task_h_load(p));
+
+ return load;
+}
+
+static unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
}
static void record_wakee(struct task_struct *p)
@@ -5482,7 +5518,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
@@ -5500,7 +5536,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5538,149 +5574,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
-
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
-{
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
-}
-
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- *
- * Assumes p is allowed on at least one CPU in sd.
- */
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
-{
- struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *most_spare_sg = NULL;
- unsigned long min_runnable_load = ULONG_MAX;
- unsigned long this_runnable_load = ULONG_MAX;
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
- unsigned long most_spare = 0, this_spare = 0;
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
- (sd->imbalance_pct-100) / 100;
-
- do {
- unsigned long load, avg_load, runnable_load;
- unsigned long spare_cap, max_spare_cap;
- int local_group;
- int i;
-
- /* Skip over this group if it has no CPUs allowed */
- if (!cpumask_intersects(sched_group_span(group),
- p->cpus_ptr))
- continue;
-
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_span(group));
-
- /*
- * Tally up the load of all CPUs in the group and find
- * the group containing the CPU with most spare capacity.
- */
- avg_load = 0;
- runnable_load = 0;
- max_spare_cap = 0;
-
- for_each_cpu(i, sched_group_span(group)) {
- load = cpu_runnable_load(cpu_rq(i));
- runnable_load += load;
-
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
-
- spare_cap = capacity_spare_without(i, p);
-
- if (spare_cap > max_spare_cap)
- max_spare_cap = spare_cap;
- }
-
- /* Adjust by relative CPU capacity of the group */
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
- group->sgc->capacity;
-
- if (local_group) {
- this_runnable_load = runnable_load;
- this_avg_load = avg_load;
- this_spare = max_spare_cap;
- } else {
- if (min_runnable_load > (runnable_load + imbalance)) {
- /*
- * The runnable load is significantly smaller
- * so we can pick this new CPU:
- */
- min_runnable_load = runnable_load;
- min_avg_load = avg_load;
- idlest = group;
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
- (100*min_avg_load > imbalance_scale*avg_load)) {
- /*
- * The runnable loads are close so take the
- * blocked load into account through avg_load:
- */
- min_avg_load = avg_load;
- idlest = group;
- }
-
- if (most_spare < max_spare_cap) {
- most_spare = max_spare_cap;
- most_spare_sg = group;
- }
- }
- } while (group = group->next, group != sd->groups);
-
- /*
- * The cross-over point between using spare capacity or least load
- * is too conservative for high utilization tasks on partially
- * utilized systems if we require spare_capacity > task_util(p),
- * so we allow for some task stuffing by using
- * spare_capacity > task_util(p)/2.
- *
- * Spare capacity can't be used for fork because the utilization has
- * not been set yet, we must first select a rq to compute the initial
- * utilization.
- */
- if (sd_flag & SD_BALANCE_FORK)
- goto skip_spare;
-
- if (this_spare > task_util(p) / 2 &&
- imbalance_scale*this_spare > 100*most_spare)
- return NULL;
-
- if (most_spare > task_util(p) / 2)
- return most_spare_sg;
-
-skip_spare:
- if (!idlest)
- return NULL;
-
- /*
- * When comparing groups across NUMA domains, it's possible for the
- * local domain to be very lightly loaded relative to the remote
- * domains but "imbalance" skews the comparison making remote CPUs
- * look much more favourable. When considering cross-domain, add
- * imbalance to the runnable load on the remote node and consider
- * staying local.
- */
- if ((sd->flags & SD_NUMA) &&
- min_runnable_load + imbalance >= this_runnable_load)
- return NULL;
-
- if (min_runnable_load > (this_runnable_load + imbalance))
- return NULL;
-
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
- (100*this_avg_load < imbalance_scale*min_avg_load))
- return NULL;
-
- return idlest;
-}
+ int this_cpu, int sd_flag);
/*
* find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
@@ -5729,7 +5625,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
continue;
}
- load = cpu_runnable_load(cpu_rq(i));
+ load = cpu_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
@@ -5753,7 +5649,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
return prev_cpu;
/*
- * We need task's util for capacity_spare_without, sync it up to
+ * We need task's util for cpu_util_without, sync it up to
* prev_cpu's last_update_time.
*/
if (!(sd_flag & SD_BALANCE_FORK))
@@ -6746,7 +6642,7 @@ preempt:
set_last_buddy(se);
}
-static struct task_struct *
+struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -6890,6 +6786,11 @@ idle:
return NULL;
}
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
+{
+ return pick_next_task_fair(rq, NULL, NULL);
+}
+
/*
* Account for a descheduled task:
*/
@@ -7079,11 +6980,49 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+/*
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
+ *
+ * The enum is ordered by pulling priority, with the group with lowest priority
+ * first so the group_type can simply be compared when selecting the busiest
+ * group. See update_sd_pick_busiest().
+ */
enum group_type {
- group_other = 0,
+ /* The group has spare capacity that can be used to run more tasks. */
+ group_has_spare = 0,
+ /*
+ * The group is fully used and the tasks don't compete for more CPU
+ * cycles. Nevertheless, some tasks might wait before running.
+ */
+ group_fully_busy,
+ /*
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
+ * and must be migrated to a more powerful CPU.
+ */
group_misfit_task,
+ /*
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
+ * and the task should be migrated to it instead of running on the
+ * current CPU.
+ */
+ group_asym_packing,
+ /*
+ * The tasks' affinity constraints previously prevented the scheduler
+ * from balancing the load across the system.
+ */
group_imbalanced,
- group_overloaded,
+ /*
+ * The CPU is overloaded and can't provide expected CPU cycles to all
+ * tasks.
+ */
+ group_overloaded
+};
+
+enum migration_type {
+ migrate_load = 0,
+ migrate_util,
+ migrate_task,
+ migrate_misfit
};
#define LBF_ALL_PINNED 0x01
@@ -7116,7 +7055,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
- enum group_type src_grp_type;
+ enum migration_type migration_type;
struct list_head tasks;
};
@@ -7339,7 +7278,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;
/*
- * detach_tasks() -- tries to detach up to imbalance runnable load from
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
@@ -7347,8 +7286,8 @@ static const unsigned int sched_nr_migrate_break = 32;
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
+ unsigned long util, load;
struct task_struct *p;
- unsigned long load;
int detached = 0;
lockdep_assert_held(&env->src_rq->lock);
@@ -7381,19 +7320,46 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ switch (env->migration_type) {
+ case migrate_load:
+ load = task_h_load(p);
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
- goto next;
+ if (sched_feat(LB_MIN) &&
+ load < 16 && !env->sd->nr_balance_failed)
+ goto next;
- if ((load / 2) > env->imbalance)
- goto next;
+ if (load/2 > env->imbalance)
+ goto next;
+
+ env->imbalance -= load;
+ break;
+
+ case migrate_util:
+ util = task_util_est(p);
+
+ if (util > env->imbalance)
+ goto next;
+
+ env->imbalance -= util;
+ break;
+
+ case migrate_task:
+ env->imbalance--;
+ break;
+
+ case migrate_misfit:
+ /* This is not a misfit task */
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ goto next;
+
+ env->imbalance = 0;
+ break;
+ }
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
- env->imbalance -= load;
#ifdef CONFIG_PREEMPTION
/*
@@ -7407,7 +7373,7 @@ static int detach_tasks(struct lb_env *env)
/*
* We only want to steal up to the prescribed amou