From a75a6068dac25d4022ebcd82192ed6345407843c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 10 Sep 2015 15:07:50 +0200 Subject: cpu/hotplug: Read_lock(tasklist_lock) doesn't need to disable irqs check_for_tasks() doesn't need to disable irqs, recursive read_lock() from interrupt is fine. While at it, s/do_each_thread/for_each_process_thread/. Signed-off-by: Oleg Nesterov Reviewed-by: Kirill Tkhai Reviewed-by: Srikar Dronamraju Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150910130750.GA20055@redhat.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 82cf9dff4295..050c63472f03 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -304,8 +304,8 @@ static inline void check_for_tasks(int dead_cpu) { struct task_struct *g, *p; - read_lock_irq(&tasklist_lock); - do_each_thread(g, p) { + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { if (!p->on_rq) continue; /* @@ -320,8 +320,8 @@ static inline void check_for_tasks(int dead_cpu) pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); - } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + } + read_unlock(&tasklist_lock); } struct take_cpu_down_param { -- cgit v1.2.3 From a05e8c51ff097ff73ec2947631d9102283545f7c Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:56 +0900 Subject: sched/fair: Factor out the {at,de}taching of the per entity load {to,from} the runqueue Currently we open-code the addition/subtraction of the per entity load to/from the runqueue, factor this out into helper functions. Signed-off-by: Byungchul Park [ Rewrote the changelog. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 77 ++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e2e3483b1ec..a72a71b501de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2664,8 +2664,8 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { - int decayed; struct sched_avg *sa = &cfs_rq->avg; + int decayed; if (atomic_long_read(&cfs_rq->removed_load_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); @@ -2695,33 +2695,52 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - int cpu = cpu_of(rq_of(cfs_rq)); u64 now = cfs_rq_clock_task(cfs_rq); + int cpu = cpu_of(rq_of(cfs_rq)); /* * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); } +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + se->avg.last_update_time = cfs_rq->avg.last_update_time; + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se->avg.load_sum; + cfs_rq->avg.util_avg += se->avg.util_avg; + cfs_rq->avg.util_sum += se->avg.util_sum; +} + +static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); + + cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); + cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); + cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); + cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); +} + /* Add the load generated by se into cfs_rq's load average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; u64 now = cfs_rq_clock_task(cfs_rq); - int migrated = 0, decayed; + int migrated, decayed; - if (sa->last_update_time == 0) { - sa->last_update_time = now; - migrated = 1; - } - else { + migrated = !sa->last_update_time; + if (!migrated) { __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); @@ -2732,12 +2751,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) { - cfs_rq->avg.load_avg += sa->load_avg; - cfs_rq->avg.load_sum += sa->load_sum; - cfs_rq->avg.util_avg += sa->util_avg; - cfs_rq->avg.util_sum += sa->util_sum; - } + if (migrated) + 
attach_entity_load_avg(cfs_rq, se); if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); @@ -2752,7 +2767,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = - max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); + max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } /* @@ -2820,6 +2835,11 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void remove_entity_load_avg(struct sched_entity *se) {} +static inline void +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} + static inline int idle_balance(struct rq *rq) { return 0; @@ -7909,25 +7929,10 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } -#ifdef CONFIG_SMP /* Catch up with the cfs_rq and remove our load when we leave */ - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg, - se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - - cfs_rq->avg.load_avg = - max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = - max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = - max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = - max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); -#endif + detach_entity_load_avg(cfs_rq, se); } -/* - * We switched to the sched_fair class. - */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -8040,14 +8045,8 @@ static void task_move_group_fair(struct task_struct *p, int queued) cfs_rq = cfs_rq_of(se); se->vruntime += cfs_rq->min_vruntime; -#ifdef CONFIG_SMP /* Virtually synchronize task with its new cfs_rq */ - p->se.avg.last_update_time = cfs_rq->avg.last_update_time; - cfs_rq->avg.load_avg += p->se.avg.load_avg; - cfs_rq->avg.load_sum += p->se.avg.load_sum; - cfs_rq->avg.util_avg += p->se.avg.util_avg; - cfs_rq->avg.util_sum += p->se.avg.util_sum; -#endif + attach_entity_load_avg(cfs_rq, se); } } -- cgit v1.2.3 From 50a2a3b246149d041065a67ccb3e98145f780a2f Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:57 +0900 Subject: sched/fair: Have task_move_group_fair() unconditionally add the entity load to the runqueue Currently we conditionally add the entity load to the rq when moving the task between cgroups. This doesn't make sense as we always 'migrate' the task between cgroups, so we should always migrate the load too. [ The history here is that we used to only migrate the blocked load which was only meaningfull when !queued. ] Signed-off-by: Byungchul Park [ Rewrote the changelog. 
] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-3-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a72a71b501de..959b2ea386b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8041,13 +8041,12 @@ static void task_move_group_fair(struct task_struct *p, int queued) se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; - if (!queued) { - cfs_rq = cfs_rq_of(se); + cfs_rq = cfs_rq_of(se); + if (!queued) se->vruntime += cfs_rq->min_vruntime; - /* Virtually synchronize task with its new cfs_rq */ - attach_entity_load_avg(cfs_rq, se); - } + /* Virtually synchronize task with its new cfs_rq */ + attach_entity_load_avg(cfs_rq, se); } void free_fair_sched_group(struct task_group *tg) -- cgit v1.2.3 From 1746babbb15594ba2d8d8196589bbbc2b5ff51c9 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:58 +0900 Subject: sched/fair: Have task_move_group_fair() also detach entity load from the old runqueue Since we attach the entity load to the new runqueue, we should also detach the entity load from the old runqueue, otherwise load can accumulate. Signed-off-by: Byungchul Park [ Rewrote the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-4-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 959b2ea386b3..1e1fe7f796e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8037,8 +8037,12 @@ static void task_move_group_fair(struct task_struct *p, int queued) if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) queued = 1; + cfs_rq = cfs_rq_of(se); if (!queued) - se->vruntime -= cfs_rq_of(se)->min_vruntime; + se->vruntime -= cfs_rq->min_vruntime; + + /* Synchronize task with its prev cfs_rq */ + detach_entity_load_avg(cfs_rq, se); set_task_rq(p, task_cpu(p)); se->depth = se->parent ? se->parent->depth + 1 : 0; cfs_rq = cfs_rq_of(se); -- cgit v1.2.3 From 6efdb105d392da3ad5cb4ef951aed373cd049813 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:21:59 +0900 Subject: sched/fair: Fix switched_to_fair()'s per entity load tracking Where switched_from_fair() will remove the entity's load from the runqueue, switched_to_fair() does not currently add it back. This means that when a task leaves the fair class for a short duration; say because of PI; we lose its load contribution. This can ripple forward and disturb the load tracking because other operations (enqueue, dequeue) assume it's factored in. Only once the runqueue empties will the load tracking recover. When we add it back in, age the per entity average to match up with the runqueue age. This has the obvious problem that if the task leaves the fair class for a significant time, the load will age to 0. Employ the normal migration rule for inter-runqueue moves in task_move_group_fair(). Again, there is the obvious problem of the task migrating while not in the fair class.
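In code, the aging step amounts to decaying the entity's stale average forward to the runqueue's current clock before splicing it back in; a simplified sketch of the chunk this patch adds to attach_entity_load_avg() (the full hunk is in the diff below):

	if (se->avg.last_update_time) {
		/* decay se->avg up to the cfs_rq's timestamp; no weight/running time is accrued */
		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
				  &se->avg, 0, 0, NULL);
	}
	se->avg.last_update_time = cfs_rq->avg.last_update_time;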
The alternative solution would be to omit the chunk in attach_entity_load_avg(), which would effectively reset the timestamp and use whatever avg there was. Signed-off-by: Byungchul Park [ Rewrote the changelog and comments. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-5-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e1fe7f796e9..5143ea0cb55b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2712,6 +2712,20 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* + * If we got migrated (either between CPUs or between cgroups) we'll + * have aged the average right before clearing @last_update_time. + */ + if (se->avg.last_update_time) { + __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), + &se->avg, 0, 0, NULL); + + /* + * XXX: we could have just aged the entire load away if we've been + * absent from the fair class for too long. + */ + } + se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; @@ -7945,6 +7959,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) se->depth = se->parent ? se->parent->depth + 1 : 0; #endif + /* Synchronize task with its cfs_rq */ + attach_entity_load_avg(cfs_rq_of(&p->se), &p->se); + if (!task_on_rq_queued(p)) { /* @@ -8044,6 +8061,12 @@ static void task_move_group_fair(struct task_struct *p, int queued) /* Synchronize task with its prev cfs_rq */ detach_entity_load_avg(cfs_rq, se); set_task_rq(p, task_cpu(p)); + +#ifdef CONFIG_SMP + /* Tell se's cfs_rq has been changed -- migrated */ + p->se.avg.last_update_time = 0; +#endif + se->depth = se->parent ? se->parent->depth + 1 : 0; cfs_rq = cfs_rq_of(se); if (!queued) -- cgit v1.2.3 From a9280514bf1e54775b8d7cd93d87c05c2b5273e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Sep 2015 16:10:59 +0200 Subject: sched/fair: Make the entity load aging on attaching tunable In case there are problems with the aging on attach, provide a debug knob to turn it off. Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Cc: yuyang.du@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++++ kernel/sched/features.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5143ea0cb55b..5cd7054aac85 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2712,6 +2712,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + if (!sched_feat(ATTACH_AGE_LOAD)) + goto skip_aging; + /* * If we got migrated (either between CPUs or between cgroups) we'll + * have aged the average right before clearing @last_update_time.
@@ -2726,6 +2729,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ } +skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 83a50e7ca533..e6fd23b7459b 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -73,6 +73,8 @@ SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) +SCHED_FEAT(ATTACH_AGE_LOAD, true) + /* * Apply the automatic NUMA scheduling policy. Enabled automatically * at runtime if running on a NUMA machine. Can be controlled via -- cgit v1.2.3 From daa59407b558e6e621e9081a308d5db3ef991fb6 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 20 Aug 2015 20:22:00 +0900 Subject: sched/fair: Unify switched_{from,to}_fair() and task_move_group_fair() By observing that switched_from_fair() detaches from a runqueue, and switched_to_fair() attaches to a runqueue, we can see that task_move_group_fair() is one followed by the other with flipping the runqueue in between. Therefore extract all the common bits and implement all three functions in terms of them. This should fix a few corner cases wrt. vruntime normalization; where, when we take a task off of a runqueue we convert to an approximation of lag by subtracting min_vruntime, and when placing a task on the a runqueue to the reverse. Suggested-by: Peter Zijlstra Signed-off-by: Byungchul Park [peterz: Changelog] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1440069720-27038-6-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 129 +++++++++++++++++++++------------------------------- 1 file changed, 52 insertions(+), 77 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5cd7054aac85..b96d8dd65331 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7924,21 +7924,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static void switched_from_fair(struct rq *rq, struct task_struct *p) +static inline bool vruntime_normalized(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when it's - * switched back to the fair class the enqueue_entity(.flags=0) will - * do the right thing. + * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, + * the dequeue_entity(.flags=0) will already have normalized the + * vruntime. + */ + if (p->on_rq) + return true; + + /* + * When !on_rq, vruntime of the task has usually NOT been normalized. + * But there are some cases where it has already been normalized: * - * If it's queued, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it's !queued, then only when - * the task is sleeping will it still have non-normalized vruntime. + * - A forked child which is waiting for being woken up by + * wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() and + * waiting for actually being woken up by sched_ttwu_pending(). 
*/ - if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) { + if (!se->sum_exec_runtime || p->state == TASK_WAKING) + return true; + + return false; +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (!vruntime_normalized(p)) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. @@ -7951,9 +7969,10 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) detach_entity_load_avg(cfs_rq, se); } -static void switched_to_fair(struct rq *rq, struct task_struct *p) +static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -7964,33 +7983,32 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - attach_entity_load_avg(cfs_rq_of(&p->se), &p->se); + attach_entity_load_avg(cfs_rq, se); + + if (!vruntime_normalized(p)) + se->vruntime += cfs_rq->min_vruntime; +} - if (!task_on_rq_queued(p)) { +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + detach_task_cfs_rq(p); +} + +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ + attach_task_cfs_rq(p); + if (task_on_rq_queued(p)) { /* - * Ensure the task has a non-normalized vruntime when it is switched - * back to the fair class with !queued, so that enqueue_entity() at - * wake-up time will do the right thing. - * - * If it's queued, then the enqueue_entity(.flags=0) makes the task - * has non-normalized vruntime, if it's !queued, then it still has - * normalized vruntime. + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. */ - if (p->state != TASK_RUNNING) - se->vruntime += cfs_rq_of(se)->min_vruntime; - return; + if (rq->curr == p) + resched_curr(rq); + else + check_preempt_curr(rq, p, 0); } - - /* - * We were most likely switched from sched_rt, so - * kick off the schedule if running, otherwise just see - * if we can still preempt the current task. - */ - if (rq->curr == p) - resched_curr(rq); - else - check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. @@ -8027,57 +8045,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int queued) { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq; - - /* - * If the task was not on the rq at the time of this cgroup movement - * it must have been asleep, sleeping tasks keep their ->vruntime - * absolute on their old rq until wakeup (needed for the fair sleeper - * bonus in place_entity()). - * - * If it was on the rq, we've just 'preempted' it, which does convert - * ->vruntime to a relative base. - * - * Make sure both cases convert their relative position when migrating - * to another cgroup's rq. This does somewhat interfere with the - * fair sleeper stuff for the first placement, but who cares. - */ - /* - * When !queued, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - Moving a forked child which is waiting for being woken up by - * wake_up_new_task(). - * - Moving a task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). 
- * - * To prevent boost or penalty in the new cfs_rq caused by delta - * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. - */ - if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING)) - queued = 1; - - cfs_rq = cfs_rq_of(se); - if (!queued) - se->vruntime -= cfs_rq->min_vruntime; - - /* Synchronize task with its prev cfs_rq */ - detach_entity_load_avg(cfs_rq, se); + detach_task_cfs_rq(p); set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif - - se->depth = se->parent ? se->parent->depth + 1 : 0; - cfs_rq = cfs_rq_of(se); - if (!queued) - se->vruntime += cfs_rq->min_vruntime; - - /* Virtually synchronize task with its new cfs_rq */ - attach_entity_load_avg(cfs_rq, se); + attach_task_cfs_rq(p); } void free_fair_sched_group(struct task_group *tg) -- cgit v1.2.3 From bc54da2176cd38cedea767eff637229a191a2383 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 31 Aug 2015 17:13:55 +0200 Subject: sched/core: Remove unused argument from sched_class::task_move_group The previous patches made the second argument go unused, remove it. Signed-off-by: Peter Zijlstra (Intel) Cc: Byungchul Park Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 97d276ff1edb..7c099e69fd5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7721,7 +7721,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk, queued); + tsk->sched_class->task_move_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b96d8dd65331..4e305d174b55 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8043,7 +8043,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_move_group_fair(struct task_struct *p, int queued) +static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); set_task_rq(p, task_cpu(p)); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 68cda117574c..637d5aeaa55a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1226,7 +1226,7 @@ struct sched_class { void (*update_curr) (struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p, int on_rq); + void (*task_move_group) (struct task_struct *p); #endif }; -- cgit v1.2.3 From 446685e9bfa11174332fbb0b3218b37015fbf4ff Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 31 Aug 2015 15:12:56 +0300 Subject: sched/core: Delete PF_EXITING checks from cpu_cgroup_exit() callback cgroup_exit() is not called from copy_process() after commit: e8604cb43690 ("cgroup: fix spurious lockdep warning in cgroup_exit()") from do_exit(). So this check is useless and the comment is obsolete. 
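For reference, once the stale check and its comment are removed the callback is a plain forward into the scheduler; the function left behind by the diff below is simply:

	static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
				    struct cgroup_subsys_state *old_css,
				    struct task_struct *task)
	{
		sched_move_task(task);
	}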
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55E444C8.3020402@odin.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c099e69fd5e..37ab6f9f0d1e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8193,14 +8193,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css, struct cgroup_subsys_state *old_css, struct task_struct *task) { - /* - * cgroup_exit() is called in the copy_process() failure path. - * Ignore this case since the task hasn't ran yet, this avoids - * trying to poke a half freed task state from generic code. - */ - if (!(task->flags & PF_EXITING)) - return; - sched_move_task(task); } -- cgit v1.2.3 From c5afb6a87f2386bcf09fa051e6ca390d43e2222e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 3 Aug 2015 11:55:50 +0200 Subject: sched/fair: Fix nohz.next_balance update Since commit: d4573c3e1c99 ("sched: Improve load balancing in the presence of idle CPUs") the ILB CPU starts with the idle load balancing of other idle CPUs and finishes with itself in order to speed up the spread of tasks in all idle CPUs. this_rq->next_balance is still used in nohz_idle_balance() as an intermediate step to gather the shortest next balance before updating nohz.next_balance. But the former has not been updated yet and is likely to be set with the current jiffies. As a result, the nohz.next_balance will be set with current jiffies instead of the real next balance date. This generates spurious kicks of the nohz idle balance. nohz_idle_balance() must set nohz.next_balance without taking into account this_rq->next_balance, which is not updated yet. Then, this_rq will update nohz.next_balance with its own next_balance once it has been updated, if necessary. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jason Low Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1438595750-20455-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e305d174b55..36774e5dab01 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7647,8 +7647,22 @@ out: * When the cpu is attached to null domain for ex, it will not be * updated. */ - if (likely(update_next_balance)) + if (likely(update_next_balance)) { rq->next_balance = next_balance; + +#ifdef CONFIG_NO_HZ_COMMON + /* + * If this CPU has been elected to perform the nohz idle + * balance. Other idle CPUs have already rebalanced with + * nohz_idle_balance() and nohz.next_balance has been + * updated accordingly. This CPU is now running the idle load + * balance for itself and we need to update the + * nohz.next_balance accordingly.
+ */ + if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) + nohz.next_balance = rq->next_balance; +#endif + } } #ifdef CONFIG_NO_HZ_COMMON @@ -7661,6 +7675,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) int this_cpu = this_rq->cpu; struct rq *rq; int balance_cpu; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; if (idle != CPU_IDLE || !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) @@ -7692,10 +7709,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) rebalance_domains(rq, CPU_IDLE); } - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; + if (time_after(next_balance, rq->next_balance)) { + next_balance = rq->next_balance; + update_next_balance = 1; + } } - nohz.next_balance = this_rq->next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; end: clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } -- cgit v1.2.3 From 78a9c54649ea220065aad9902460a1d137c7eafd Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 16:30:11 +0530 Subject: sched/numa: Rename numabalancing_enabled to sched_numa_balancing Simple rename of the 'numabalancing_enabled' variable to 'sched_numa_balancing'. No functional changes. Suggested-by: Ingo Molnar Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439290813-6683-2-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 37ab6f9f0d1e..2656af00ea5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2124,11 +2124,11 @@ void set_numabalancing_state(bool enabled) sched_feat_set("NO_NUMA"); } #else -__read_mostly bool numabalancing_enabled; +__read_mostly bool sched_numa_balancing; void set_numabalancing_state(bool enabled) { - numabalancing_enabled = enabled; + sched_numa_balancing = enabled; } #endif /* CONFIG_SCHED_DEBUG */ @@ -2138,7 +2138,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, { struct ctl_table t; int err; - int state = numabalancing_enabled; + int state = sched_numa_balancing; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 36774e5dab01..3a6ac5598bff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2069,7 +2069,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) int local = !!(flags & TNF_FAULT_LOCAL); int priv; - if (!numabalancing_enabled) + if (!sched_numa_balancing) return; /* for example, ksmd faulting in a user's mm */ @@ -7874,7 +7874,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (numabalancing_enabled) + if (sched_numa_balancing) task_tick_numa(rq, curr); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 637d5aeaa55a..d0b303d675d6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1006,13 +1006,13 @@ extern struct 
static_key sched_feat_keys[__SCHED_FEAT_NR]; #ifdef CONFIG_NUMA_BALANCING #define sched_feat_numa(x) sched_feat(x) #ifdef CONFIG_SCHED_DEBUG -#define numabalancing_enabled sched_feat_numa(NUMA) +#define sched_numa_balancing sched_feat_numa(NUMA) #else -extern bool numabalancing_enabled; +extern bool sched_numa_balancing; #endif /* CONFIG_SCHED_DEBUG */ #else #define sched_feat_numa(x) (0) -#define numabalancing_enabled (0) +#define sched_numa_balancing (0) #endif /* CONFIG_NUMA_BALANCING */ static inline u64 global_rt_period(void) -- cgit v1.2.3 From c3b9bc5bbfc3750570d788afffd431263ef695c6 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 16:30:12 +0530 Subject: sched/numa: Disable sched_numa_balancing on UMA systems Commit 2a1ed24 ("sched/numa: Prefer NUMA hotness over cache hotness") sets sched feature NUMA to true. However this can enable NUMA hinting faults on a UMA system. This commit ensures that NUMA hinting faults occur only on a NUMA system by setting/resetting sched_numa_balancing. This commit: - Makes sched_numa_balancing common to CONFIG_SCHED_DEBUG and !CONFIG_SCHED_DEBUG. Earlier it was only in !CONFIG_SCHED_DEBUG. - Checks for sched_numa_balancing instead of sched_feat(NUMA). Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439290813-6683-3-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 +++++--------- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 6 ------ 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2656af00ea5e..ca665f8bec98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2115,22 +2115,18 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) } #ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_SCHED_DEBUG +__read_mostly bool sched_numa_balancing; + void set_numabalancing_state(bool enabled) { + sched_numa_balancing = enabled; +#ifdef CONFIG_SCHED_DEBUG if (enabled) sched_feat_set("NUMA"); else sched_feat_set("NO_NUMA"); -} -#else -__read_mostly bool sched_numa_balancing; - -void set_numabalancing_state(bool enabled) -{ - sched_numa_balancing = enabled; -} #endif /* CONFIG_SCHED_DEBUG */ +} #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a6ac5598bff..e8f0828f4137 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5562,10 +5562,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!sched_numa_balancing) return -1; - if (!sched_feat(NUMA)) + if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) return -1; src_nid = cpu_to_node(env->src_cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d0b303d675d6..0d8f885b215b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1004,14 +1004,8 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ #ifdef CONFIG_NUMA_BALANCING -#define sched_feat_numa(x) sched_feat(x) -#ifdef CONFIG_SCHED_DEBUG -#define sched_numa_balancing sched_feat_numa(NUMA) -#else extern bool sched_numa_balancing; -#endif /* CONFIG_SCHED_DEBUG */ #else -#define sched_feat_numa(x) (0) #define 
sched_numa_balancing (0) #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.2.3 From 2b49d84b259fc18e131026e5d38e7855352f71b9 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 16:30:13 +0530 Subject: sched/numa: Remove the NUMA sched_feature Variable sched_numa_balancing is available for both CONFIG_SCHED_DEBUG and !CONFIG_SCHED_DEBUG. All code paths now check for sched_numa_balancing. Hence remove sched_feat(NUMA). Suggested-by: Ingo Molnar Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439290813-6683-4-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ------ kernel/sched/features.h | 16 ---------------- 2 files changed, 22 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca665f8bec98..e0bd88b26a27 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2120,12 +2120,6 @@ __read_mostly bool sched_numa_balancing; void set_numabalancing_state(bool enabled) { sched_numa_balancing = enabled; -#ifdef CONFIG_SCHED_DEBUG - if (enabled) - sched_feat_set("NUMA"); - else - sched_feat_set("NO_NUMA"); -#endif /* CONFIG_SCHED_DEBUG */ } #ifdef CONFIG_PROC_SYSCTL diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e6fd23b7459b..edf5902d5e57 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -72,21 +72,5 @@ SCHED_FEAT(RT_PUSH_IPI, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) - SCHED_FEAT(ATTACH_AGE_LOAD, true) -/* - * Apply the automatic NUMA scheduling policy. Enabled automatically - * at runtime if running on a NUMA machine. Can be controlled via - * numa_balancing= - */ -#ifdef CONFIG_NUMA_BALANCING - -/* - * NUMA will favor moving tasks towards nodes where a higher number of - * hinting faults are recorded during active load balancing. It will - * resist moving tasks towards nodes where a lower number of hinting - * faults have been recorded. - */ -SCHED_FEAT(NUMA, true) -#endif -- cgit v1.2.3 From 2a595721a1fa6b684c1c818f379bef834ac3d65e Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 11 Aug 2015 21:54:21 +0530 Subject: sched/numa: Convert sched_numa_balancing to a static_branch Variable sched_numa_balancing toggles numa_balancing feature. Hence moving from a simple read mostly variable to a more apt static_branch. 
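The resulting pattern, condensed from the diff below: the flag becomes a static key that is flipped from the (cold) sysctl path and tested via a patched jump label in the hot paths, e.g.:

	DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);

	void set_numabalancing_state(bool enabled)
	{
		if (enabled)
			static_branch_enable(&sched_numa_balancing);
		else
			static_branch_disable(&sched_numa_balancing);
	}

	/* hot path, e.g. task_numa_fault(): */
	if (!static_branch_likely(&sched_numa_balancing))
		return;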
Suggested-by: Peter Zijlstra Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Mike Galbraith Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439310261-16124-1-git-send-email-srikar@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++--- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 6 +----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e0bd88b26a27..b62127118fc8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2114,12 +2114,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif /* CONFIG_NUMA_BALANCING */ } +DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); + #ifdef CONFIG_NUMA_BALANCING -__read_mostly bool sched_numa_balancing; void set_numabalancing_state(bool enabled) { - sched_numa_balancing = enabled; + if (enabled) + static_branch_enable(&sched_numa_balancing); + else + static_branch_disable(&sched_numa_balancing); } #ifdef CONFIG_PROC_SYSCTL @@ -2128,7 +2132,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, { struct ctl_table t; int err; - int state = sched_numa_balancing; + int state = static_branch_likely(&sched_numa_balancing); if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e8f0828f4137..47ece22e7ae1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2069,7 +2069,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) int local = !!(flags & TNF_FAULT_LOCAL); int priv; - if (!sched_numa_balancing) + if (!static_branch_likely(&sched_numa_balancing)) return; /* for example, ksmd faulting in a user's mm */ @@ -5562,7 +5562,7 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) unsigned long src_faults, dst_faults; int src_nid, dst_nid; - if (!sched_numa_balancing) + if (!static_branch_likely(&sched_numa_balancing)) return -1; if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) @@ -7874,7 +7874,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (sched_numa_balancing) + if (!static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0d8f885b215b..2e8530d02b02 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1003,11 +1003,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ -#ifdef CONFIG_NUMA_BALANCING -extern bool sched_numa_balancing; -#else -#define sched_numa_balancing (0) -#endif /* CONFIG_NUMA_BALANCING */ +extern struct static_key_false sched_numa_balancing; static inline u64 global_rt_period(void) { -- cgit v1.2.3 From e0f5f3afd2cffa96291cd852056d83ff4e2e99c7 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:09 +0100 Subject: sched/fair: Make load tracking frequency scale-invariant Apply frequency scaling correction factor to per-entity load tracking to make it frequency invariant. Currently, load appears bigger when the CPU is running slower which affects load-balancing decisions. Each segment of the sched_avg.load_sum geometric series is now scaled by the current frequency so that the sched_avg.load_avg of each sched entity will be invariant from frequency scaling. 
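Concretely, each accrued time segment is multiplied by the current frequency scale factor before it is added into the sum; roughly, as in the __update_load_avg() hunk shown below:

	unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);

	/* scale_freq is SCHED_CAPACITY_SCALE (1024) when running at max frequency */
	scaled_delta = (delta * scale_freq) >> SCHED_CAPACITY_SHIFT;
	sa->load_sum += weight * scaled_delta;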
Moreover, cfs_rq.runnable_load_sum is scaled by the current frequency as well. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 +++--- kernel/sched/fair.c | 27 +++++++++++++++++---------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a4ab9daa387c..c8d923ba429d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1177,9 +1177,9 @@ struct load_weight { /* * The load_avg/util_avg accumulates an infinite geometric series. - * 1) load_avg factors the amount of time that a sched_entity is - * runnable on a rq into its weight. For cfs_rq, it is the aggregated - * such weights of all runnable and blocked sched_entities. + * 1) load_avg factors frequency scaling into the amount of time that a + * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the + * aggregated such weights of all runnable and blocked sched_entities. * 2) util_avg factors frequency scaling into the amount of time * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. * For cfs_rq, it is the aggregated such times of all runnable and diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 47ece22e7ae1..86cb27cae4b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2515,6 +2515,8 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +#define scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2547,9 +2549,9 @@ static __always_inline int __update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, periods; + u64 delta, scaled_delta, periods; u32 contrib; - int delta_w, decayed = 0; + int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_update_time; @@ -2585,13 +2587,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, * period and accrue it. 
*/ delta_w = 1024 - delta_w; + scaled_delta_w = scale(delta_w, scale_freq); if (weight) { - sa->load_sum += weight * delta_w; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * delta_w; + sa->load_sum += weight * scaled_delta_w; + if (cfs_rq) { + cfs_rq->runnable_load_sum += + weight * scaled_delta_w; + } } if (running) - sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += scaled_delta_w; delta -= delta_w; @@ -2608,23 +2613,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, /* Efficiently calculate \sum (1..n_period) 1024*y^i */ contrib = __compute_runnable_contrib(periods); + contrib = scale(contrib, scale_freq); if (weight) { sa->load_sum += weight * contrib; if (cfs_rq) cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += contrib; } /* Remainder of delta accrued against u_0` */ + scaled_delta = scale(delta, scale_freq); if (weight) { - sa->load_sum += weight * delta; + sa->load_sum += weight * scaled_delta; if (cfs_rq) - cfs_rq->runnable_load_sum += weight * delta; + cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT; + sa->util_sum += scaled_delta; sa->period_contrib += delta; -- cgit v1.2.3 From 8cd5601c50603caa195ce86cc465cb04079ed488 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 14 Aug 2015 17:23:10 +0100 Subject: sched/fair: Convert arch_scale_cpu_capacity() from weak function to #define Bring arch_scale_cpu_capacity() in line with the recent change of its arch_scale_freq_capacity() sibling in commit dfbca41f3479 ("sched: Optimize freq invariant accounting") from weak function to #define to allow inlining of the function. While at it, remove the ARCH_CAPACITY sched_feature as well. With the change to #define there isn't a straightforward way to allow runtime switch between an arch implementation and the default implementation of arch_scale_cpu_capacity() using sched_feature. The default was to use the arch-specific implementation, but only the arm architecture provides one and that is essentially equivalent to the default implementation. 
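The mechanism, sketched: the generic default becomes a static inline guarded by #ifndef, and an architecture that wants its own version defines the macro to point at its implementation before the scheduler header is seen (the arch-side define below is illustrative only, with a hypothetical name, and is not part of this series):

	/* kernel/sched/sched.h: generic default, used unless the arch overrides it */
	#ifndef arch_scale_cpu_capacity
	static __always_inline
	unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
	{
		if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
			return sd->smt_gain / sd->span_weight;

		return SCHED_CAPACITY_SCALE;
	}
	#endif

	/* arch header (hypothetical): */
	#define arch_scale_cpu_capacity my_arch_scale_cpu_capacity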
Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 22 +--------------------- kernel/sched/features.h | 5 ----- kernel/sched/sched.h | 11 +++++++++++ 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 86cb27cae4b7..102cdf1e4e97 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6054,19 +6054,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_cpu_capacity(sd, cpu); -} - static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6096,16 +6083,9 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = SCHED_CAPACITY_SCALE; + unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_cpu_capacity(sd, cpu); - else - capacity *= default_scale_cpu_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - cpu_rq(cpu)->cpu_capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index edf5902d5e57..69631fa46c2f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) */ SCHED_FEAT(WAKEUP_PREEMPTION, true) -/* - * Use arch dependent cpu capacity functions - */ -SCHED_FEAT(ARCH_CAPACITY, true) - SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2e8530d02b02..c0726d5fd6a3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1394,6 +1394,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) } #endif +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; + + return SCHED_CAPACITY_SCALE; +} +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); -- cgit v1.2.3 From e3279a2e6d697e00e74f905851ee7cf532f72b2d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 15 Aug 2015 00:04:41 +0100 Subject: sched/fair: Make utilization tracking CPU scale-invariant Besides the existing frequency scale-invariance correction factor, apply CPU scale-invariance correction factor to utilization tracking to compensate for any differences in compute capacity. This could be due to micro-architectural differences (i.e. instructions per seconds) between cpus in HMP systems (e.g. 
big.LITTLE), and/or differences in the current maximum frequency supported by individual cpus in SMP systems. In the existing implementation utilization isn't comparable between cpus as it is relative to the capacity of each individual CPU. Each segment of the sched_avg.util_sum geometric series is now scaled by the CPU performance factor too so the sched_avg.util_avg of each sched entity will be invariant from the particular CPU of the HMP/SMP system on which the sched entity is scheduled. With this patch, the utilization of a CPU stays relative to the max CPU performance of the fastest CPU in the system. In contrast to utilization (sched_avg.util_sum), load (sched_avg.load_sum) should not be scaled by compute capacity. The utilization metric is based on running time which only makes sense when cpus are _not_ fully utilized (utilization cannot go beyond 100% even if more tasks are added), where load is runnable time which isn't limited by the capacity of the CPU and therefore is a better metric for overloaded scenarios. If we run two nice-0 busy loops on two cpus with different compute capacity their load should be similar since their compute demands are the same. We have to assume that the compute demand of any task running on a fully utilized CPU (no spare cycles = 100% utilization) is high and the same no matter of the compute capacity of its current CPU, hence we shouldn't scale load by CPU capacity. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/55CE7409.1000700@arm.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched/fair.c | 7 ++++--- kernel/sched/sched.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index c8d923ba429d..bd38b3ee9e83 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1180,7 +1180,7 @@ struct load_weight { * 1) load_avg factors frequency scaling into the amount of time that a * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the * aggregated such weights of all runnable and blocked sched_entities. - * 2) util_avg factors frequency scaling into the amount of time + * 2) util_avg factors frequency and cpu scaling into the amount of time * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. * For cfs_rq, it is the aggregated such times of all runnable and * blocked sched_entities. 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 102cdf1e4e97..573dc98c6248 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2553,6 +2553,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, u32 contrib; int delta_w, scaled_delta_w, decayed = 0; unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); delta = now - sa->last_update_time; /* @@ -2596,7 +2597,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, } } if (running) - sa->util_sum += scaled_delta_w; + sa->util_sum += scale(scaled_delta_w, scale_cpu); delta -= delta_w; @@ -2620,7 +2621,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * contrib; } if (running) - sa->util_sum += contrib; + sa->util_sum += scale(contrib, scale_cpu); } /* Remainder of delta accrued against u_0` */ @@ -2631,7 +2632,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, cfs_rq->runnable_load_sum += weight * scaled_delta; } if (running) - sa->util_sum += scaled_delta; + sa->util_sum += scale(scaled_delta, scale_cpu); sa->period_contrib += delta; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c0726d5fd6a3..167ab4844ee6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1398,7 +1398,7 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) { - if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) return sd->smt_gain / sd->span_weight; return SCHED_CAPACITY_SCALE; -- cgit v1.2.3 From 9e91d61d9b0ca8d865dbd59af8d0d5c5b68003e9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:12 +0100 Subject: sched/fair: Name utilization related data and functions consistently Use the advent of the per-entity load tracking rewrite to streamline the naming of utilization related data and functions by using {prefix_}util{_suffix} consistently. Moreover call both signals ({se,cfs}.avg.util_avg) utilization. Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org Cc: mturquette@baylibre.com Cc: pang.xunlei@zte.com.cn Cc: rjw@rjwysocki.net Cc: sgurrappadi@nvidia.com Cc: vincent.guittot@linaro.org Cc: yuyang.du@intel.com Link: http://lkml.kernel.org/r/1439569394-11974-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 573dc98c6248..1b56d63c5322 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4863,31 +4863,32 @@ done: return target; } /* - * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * cpu_util returns the amount of capacity of a CPU that is used by CFS * tasks. The unit of the return value must be the one of capacity so we can - * compare the usage with the capacity of the CPU that is available for CFS - * task (ie cpu_capacity). + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). * cfs.avg.util_avg is the sum of running time of runnable tasks on a * CPU. 
It represents the amount of utilization of a CPU in the range - * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full - * capacity of the CPU because it's about the running time on this CPU. + * [0..SCHED_LOAD_SCALE]. The utilization of a CPU can't be higher than the + * full capacity of the CPU because it's about the running time on this CPU. * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE * because of unfortunate rounding in util_avg or just * after migrating tasks until the average stabilizes with the new running - * time. So we need to check that the usage stays into the range + * time. So we need to check that the utilization stays into the range * [0..cpu_capacity_orig] and cap if necessary. - * Without capping the usage, a group could be seen as overloaded (CPU0 usage - * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + * Without capping the utilization, a group could be seen as overloaded (CPU0 + * utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. */ -static int get_cpu_usage(int cpu) +static int cpu_util(int cpu) { - unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - if (usage >= SCHED_LOAD_SCALE) + if (util >= SCHED_LOAD_SCALE) return capacity; - return (usage * capacity) >> SCHED_LOAD_SHIFT; + return (util * capacity) >> SCHED_LOAD_SHIFT; } /* @@ -5979,7 +5980,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; - unsigned long group_usage; /* Total usage of the group */ + unsigned long group_util; /* Total utilization of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int idle_cpus; unsigned int group_weight; @@ -6212,8 +6213,8 @@ static inline int sg_imbalanced(struct sched_group *group) * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. * We consider that a group has spare capacity if the * number of task is - * smaller than the number of CPUs or if the usage is lower than the available - * capacity for CFS tasks. + * smaller than the number of CPUs or if the utilization is lower than the + * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into * account the variance of the tasks' load and to return true if the available * capacity in meaningful for the load balancer. 
@@ -6227,7 +6228,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) return true; if ((sgs->group_capacity * 100) > - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; @@ -6248,7 +6249,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; if ((sgs->group_capacity * 100) < - (sgs->group_usage * env->sd->imbalance_pct)) + (sgs->group_util * env->sd->imbalance_pct)) return true; return false; @@ -6296,7 +6297,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; - sgs->group_usage += get_cpu_usage(i); + sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) -- cgit v1.2.3 From 231678b768da07d19ab5683a39eeb0c250631d02 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Aug 2015 17:23:13 +0100 Subject: sched/fair: Get rid of scaling utilization by capacity_orig Utilization is currently scaled by capacity_orig, but since we now have frequency and cpu invariant cfs_rq.avg.util_avg, frequency and cpu scaling now happens as part of the utilization tracking itself. So cfs_rq.avg.util_avg should no longer be scaled in cpu_util(). Signed-off-by: Dietmar Eggemann Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steve Muckle Cc: Thomas Gleixner Cc: daniel.lezcano@linaro.org