From fe7de49f9d4e53f24ec9ef762a503f70b562341c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 20 Oct 2010 16:01:12 -0700 Subject: sched: Make sched_param argument static in sched_setscheduler() callers Andrew Morton pointed out almost all sched_setscheduler() callers are using fixed parameters and can be converted to static. It reduces runtime memory use a little. Signed-off-by: KOSAKI Motohiro Reported-by: Andrew Morton Acked-by: James Morris Cc: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++-- kernel/irq/manage.c | 4 +++- kernel/kthread.c | 2 +- kernel/sched.c | 6 +++--- kernel/softirq.c | 4 +++- kernel/trace/trace_selftest.c | 2 +- kernel/watchdog.c | 2 +- 7 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0383601a927c..849c8670583d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1942,9 +1942,10 @@ extern int task_nice(const struct task_struct *p); extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); -extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); +extern int sched_setscheduler(struct task_struct *, int, + const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, - struct sched_param *); + const struct sched_param *); extern struct task_struct *idle_task(int cpu); extern struct task_struct *curr_task(int cpu); extern void set_curr_task(int cpu, struct task_struct *p); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 644e8d5fa367..850f030fa0c2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -573,7 +573,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } */ static int irq_thread(void *data) { - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + static struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); int wake, oneshot = desc->status & IRQ_ONESHOT; diff --git a/kernel/kthread.c b/kernel/kthread.c index 2dc3786349d1..74cf6f5e7ade 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), wait_for_completion(&create.done); if (!IS_ERR(create.result)) { - struct sched_param param = { .sched_priority = 0 }; + static struct sched_param param = { .sched_priority = 0 }; va_list args; va_start(args, namefmt); diff --git a/kernel/sched.c b/kernel/sched.c index d42992bccdfa..51944e8c38a8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4701,7 +4701,7 @@ static bool check_same_owner(struct task_struct *p) } static int __sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param, bool user) + const struct sched_param *param, bool user) { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; @@ -4856,7 +4856,7 @@ recheck: * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, true); } @@ -4874,7 +4874,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); * but our caller might not have that capability. */ int sched_setscheduler_nocheck(struct task_struct *p, int policy, - struct sched_param *param) + const struct sched_param *param) { return __sched_setscheduler(p, policy, param, false); } diff --git a/kernel/softirq.c b/kernel/softirq.c index fc978889b194..081869ed3a9f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -851,7 +851,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, cpumask_any(cpu_online_mask)); case CPU_DEAD: case CPU_DEAD_FROZEN: { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { + .sched_priority = MAX_RT_PRIO-1 + }; p = per_cpu(ksoftirqd, hotcpu); per_cpu(ksoftirqd, hotcpu) = NULL; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 155a415b3209..562c56e048fd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) static int trace_wakeup_test_thread(void *data) { /* Make this a RT thread, doesn't need to be too high */ - struct sched_param param = { .sched_priority = 5 }; + static struct sched_param param = { .sched_priority = 5 }; struct completion *x = data; sched_setscheduler(current, SCHED_FIFO, ¶m); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index bafba687a6d8..94ca779aa9c2 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -307,7 +307,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, ¶m); -- cgit v1.2.3 From 48c5ccae88dcd989d9de507e8510313c6cbd352b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 13 Nov 2010 19:32:29 +0100 Subject: sched: Simplify cpu-hot-unplug task migration While discussing the need for sched_idle_next(), Oleg remarked that since try_to_wake_up() ensures sleeping tasks will end up running on a sane cpu, we can do away with migrate_live_tasks(). If we then extend the existing hack of migrating current from CPU_DYING to migrating the full rq worth of tasks from CPU_DYING, the need for the sched_idle_next() abomination disappears as well, since idle will be the only possible thread left after the migration thread stops. This greatly simplifies the hot-unplug task migration path, as can be seen from the resulting code reduction (and about half the new lines are comments). Suggested-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: <1289851597.2109.547.camel@laptop> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 - kernel/cpu.c | 16 ++-- kernel/sched.c | 206 +++++++++++++++----------------------------------- 3 files changed, 67 insertions(+), 158 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 3cd70cf91fde..29d953abb5ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1871,14 +1871,11 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU -extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); #else static inline void idle_task_exit(void) {} #endif -extern void sched_idle_next(void); - #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) extern void wake_up_idle_cpu(int cpu); #else diff --git a/kernel/cpu.c b/kernel/cpu.c index f6e726f18491..8615aa65d927 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu) } struct take_cpu_down_param { - struct task_struct *caller; unsigned long mod; void *hcpu; }; @@ -208,11 +207,6 @@ static int __ref take_cpu_down(void *_param) cpu_notify(CPU_DYING | param->mod, param->hcpu); - if (task_cpu(param->caller) == cpu) - move_task_off_dead_cpu(cpu, param->caller); - /* Force idle task to run as soon as we yield: it should - immediately notice cpu is offline and die quickly. */ - sched_idle_next(); return 0; } @@ -223,7 +217,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { - .caller = current, .mod = mod, .hcpu = hcpu, }; @@ -253,9 +246,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) } BUG_ON(cpu_online(cpu)); - /* Wait for it to sleep (leaving idle task). */ - while (!idle_cpu(cpu)) - yield(); + /* + * The migration_call() CPU_DYING callback will have removed all + * runnable tasks from the cpu, there's only the idle task left now + * that the migration thread is done doing the stop_machine thing. + */ + BUG_ON(!idle_cpu(cpu)); /* This actually kills the CPU. */ __cpu_die(cpu); diff --git a/kernel/sched.c b/kernel/sched.c index 41f18695b730..b0d5f1b24a39 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2366,18 +2366,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) return dest_cpu; /* No more Mr. Nice Guy. */ - if (unlikely(dest_cpu >= nr_cpu_ids)) { - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); - } + dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); } return dest_cpu; @@ -5712,29 +5709,20 @@ static int migration_cpu_stop(void *data) } #ifdef CONFIG_HOTPLUG_CPU + /* - * Figure out where task on dead CPU should go, use force if necessary. + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. */ -void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void idle_task_exit(void) { - struct rq *rq = cpu_rq(dead_cpu); - int needs_cpu, uninitialized_var(dest_cpu); - unsigned long flags; + struct mm_struct *mm = current->active_mm; - local_irq_save(flags); + BUG_ON(cpu_online(smp_processor_id())); - raw_spin_lock(&rq->lock); - needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); - if (needs_cpu) - dest_cpu = select_fallback_rq(dead_cpu, p); - raw_spin_unlock(&rq->lock); - /* - * It can only fail if we race with set_cpus_allowed(), - * in the racer should migrate the task anyway. - */ - if (needs_cpu) - __migrate_task(p, dead_cpu, dest_cpu); - local_irq_restore(flags); + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); } /* @@ -5747,128 +5735,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) static void migrate_nr_uninterruptible(struct rq *rq_src) { struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); - unsigned long flags; - local_irq_save(flags); - double_rq_lock(rq_src, rq_dest); rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; rq_src->nr_uninterruptible = 0; - double_rq_unlock(rq_src, rq_dest); - local_irq_restore(flags); -} - -/* Run through task list and migrate tasks from the dead cpu. */ -static void migrate_live_tasks(int src_cpu) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - - do_each_thread(t, p) { - if (p == current) - continue; - - if (task_cpu(p) == src_cpu) - move_task_off_dead_cpu(src_cpu, p); - } while_each_thread(t, p); - - read_unlock(&tasklist_lock); } /* - * Schedules idle task to be the next runnable task on current CPU. - * It does so by boosting its priority to highest possible. - * Used by CPU offline code. + * remove the tasks which were accounted by rq from calc_load_tasks. */ -void sched_idle_next(void) +static void calc_global_load_remove(struct rq *rq) { - int this_cpu = smp_processor_id(); - struct rq *rq = cpu_rq(this_cpu); - struct task_struct *p = rq->idle; - unsigned long flags; - - /* cpu has to be offline */ - BUG_ON(cpu_online(this_cpu)); - - /* - * Strictly not necessary since rest of the CPUs are stopped by now - * and interrupts disabled on the current cpu. - */ - raw_spin_lock_irqsave(&rq->lock, flags); - - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - - activate_task(rq, p, 0); - - raw_spin_unlock_irqrestore(&rq->lock, flags); + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; } /* - * Ensures that the idle task is using init_mm right before its cpu goes - * offline. + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts. */ -void idle_task_exit(void) -{ - struct mm_struct *mm = current->active_mm; - - BUG_ON(cpu_online(smp_processor_id())); - - if (mm != &init_mm) - switch_mm(mm, &init_mm, current); - mmdrop(mm); -} - -/* called under rq->lock with disabled interrupts */ -static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) +static void migrate_tasks(unsigned int dead_cpu) { struct rq *rq = cpu_rq(dead_cpu); - - /* Must be exiting, otherwise would be on tasklist. */ - BUG_ON(!p->exit_state); - - /* Cannot have done final schedule yet: would have vanished. */ - BUG_ON(p->state == TASK_DEAD); - - get_task_struct(p); + struct task_struct *next, *stop = rq->stop; + int dest_cpu; /* - * Drop lock around migration; if someone else moves it, - * that's OK. No task can be added to this CPU, so iteration is - * fine. + * Fudge the rq selection such that the below task selection loop + * doesn't get stuck on the currently eligible stop task. + * + * We're currently inside stop_machine() and the rq is either stuck + * in the stop_machine_cpu_stop() loop, or we're executing this code, + * either way we should never end up calling schedule() until we're + * done here. */ - raw_spin_unlock_irq(&rq->lock); - move_task_off_dead_cpu(dead_cpu, p); - raw_spin_lock_irq(&rq->lock); - - put_task_struct(p); -} - -/* release_task() removes task from tasklist, so we won't find dead tasks. */ -static void migrate_dead_tasks(unsigned int dead_cpu) -{ - struct rq *rq = cpu_rq(dead_cpu); - struct task_struct *next; + rq->stop = NULL; for ( ; ; ) { - if (!rq->nr_running) + /* + * There's this thread running, bail when that's the only + * remaining thread. + */ + if (rq->nr_running == 1) break; + next = pick_next_task(rq); - if (!next) - break; + BUG_ON(!next); next->sched_class->put_prev_task(rq, next); - migrate_dead(dead_cpu, next); + /* Find suitable destination for @next, with force if needed. */ + dest_cpu = select_fallback_rq(dead_cpu, next); + raw_spin_unlock(&rq->lock); + + __migrate_task(next, dead_cpu, dest_cpu); + + raw_spin_lock(&rq->lock); } -} -/* - * remove the tasks which were accounted by rq from calc_load_tasks. - */ -static void calc_global_load_remove(struct rq *rq) -{ - atomic_long_sub(rq->calc_load_active, &calc_load_tasks); - rq->calc_load_active = 0; + rq->stop = stop; } + #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -6078,15 +6007,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned long flags; struct rq *rq = cpu_rq(cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: - case CPU_ONLINE_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { @@ -6098,30 +6025,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - migrate_live_tasks(cpu); - /* Idle task back to normal (off runqueue, low prio) */ - raw_spin_lock_irq(&rq->lock); - deactivate_task(rq, rq->idle, 0); - __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); - rq->idle->sched_class = &idle_sched_class; - migrate_dead_tasks(cpu); - raw_spin_unlock_irq(&rq->lock); - migrate_nr_uninterruptible(rq); - BUG_ON(rq->nr_running != 0); - calc_global_load_remove(rq); - break; - case CPU_DYING: - case CPU_DYING_FROZEN: /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } + migrate_tasks(cpu); + BUG_ON(rq->nr_running != 1); /* the migration thread */ raw_spin_unlock_irqrestore(&rq->lock, flags); + + migrate_nr_uninterruptible(rq); + calc_global_load_remove(rq); break; #endif } -- cgit v1.2.3 From 2069dd75c7d0f49355939e5586daf5a9ab216db7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Nov 2010 15:47:00 -0800 Subject: sched: Rewrite tg_shares_up) By tracking a per-cpu load-avg for each cfs_rq and folding it into a global task_group load on each tick we can rework tg_shares_up to be strictly per-cpu. This should improve cpu-cgroup performance for smp systems significantly. [ Paul: changed to use queueing cfs_rq + bug fixes ] Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.580480400@google.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 - kernel/sched.c | 173 ++++++++++++------------------------------------ kernel/sched_debug.c | 15 +++-- kernel/sched_fair.c | 164 +++++++++++++++++++++++++++++---------------- kernel/sched_features.h | 2 - kernel/sysctl.c | 19 ------ 6 files changed, 162 insertions(+), 213 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 29d953abb5ad..8abb8aa59664 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1885,8 +1885,6 @@ static inline void wake_up_idle_cpu(int cpu) { } extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_shares_ratelimit; -extern unsigned int sysctl_sched_shares_thresh; extern unsigned int sysctl_sched_child_runs_first; enum sched_tunable_scaling { diff --git a/kernel/sched.c b/kernel/sched.c index b0d5f1b24a39..e2f1a3024a99 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -253,6 +253,8 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; + + atomic_t load_weight; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -359,15 +361,11 @@ struct cfs_rq { */ unsigned long h_load; - /* - * this cpu's part of tg->shares - */ - unsigned long shares; + u64 load_avg; + u64 load_period; + u64 load_stamp; - /* - * load.weight at the time we set shares - */ - unsigned long rq_weight; + unsigned long load_contribution; #endif #endif }; @@ -806,20 +804,6 @@ late_initcall(sched_init_debug); */ const_debug unsigned int sysctl_sched_nr_migrate = 32; -/* - * ratelimit for updating the group shares. - * default: 0.25ms - */ -unsigned int sysctl_sched_shares_ratelimit = 250000; -unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; - -/* - * Inject some fuzzyness into changing the per-cpu group shares - * this avoids remote rq-locks at the expense of fairness. - * default: 4 - */ -unsigned int sysctl_sched_shares_thresh = 4; - /* * period over which we average the RT time consumption, measured * in ms. @@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long __percpu *update_shares_data; - -static void __set_se_shares(struct sched_entity *se, unsigned long shares); - -/* - * Calculate and set the cpu's group shares. - */ -static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, - unsigned long sd_rq_weight, - unsigned long *usd_rq_weight) -{ - unsigned long shares, rq_weight; - int boost = 0; - - rq_weight = usd_rq_weight[cpu]; - if (!rq_weight) { - boost = 1; - rq_weight = NICE_0_LOAD; - } - - /* - * \Sum_j shares_j * rq_weight_i - * shares_i = ----------------------------- - * \Sum_j rq_weight_j - */ - shares = (sd_shares * rq_weight) / sd_rq_weight; - shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - - if (abs(shares - tg->se[cpu]->load.weight) > - sysctl_sched_shares_thresh) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - __set_se_shares(tg->se[cpu], shares); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } -} +static void update_cfs_load(struct cfs_rq *cfs_rq); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* - * Re-compute the task group their per cpu shares over the given domain. - * This needs to be done in a bottom-up fashion because the rq weight of a - * parent group depends on the shares of its child groups. + * update tg->load_weight by folding this cpu's load_avg */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; - unsigned long *usd_rq_weight; - struct sched_domain *sd = data; + long load_avg; + struct cfs_rq *cfs_rq; unsigned long flags; - int i; + int cpu = (long)data; + struct rq *rq; - if (!tg->se[0]) + if (!tg->se[cpu]) return 0; - local_irq_save(flags); - usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); - - for_each_cpu(i, sched_domain_span(sd)) { - weight = tg->cfs_rq[i]->load.weight; - usd_rq_weight[i] = weight; - - rq_weight += weight; - /* - * If there are currently no tasks on the cpu pretend there - * is one of average load so that when a new task gets to - * run here it will not get delayed by group starvation. - */ - if (!weight) - weight = NICE_0_LOAD; + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; - sum_weight += weight; - shares += tg->cfs_rq[i]->shares; - } + raw_spin_lock_irqsave(&rq->lock, flags); - if (!rq_weight) - rq_weight = sum_weight; + update_rq_clock(rq); + update_cfs_load(cfs_rq); - if ((!shares && rq_weight) || shares > tg->shares) - shares = tg->shares; + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; - if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) - shares = tg->shares; + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. + */ + update_cfs_shares(cfs_rq); - local_irq_restore(flags); + raw_spin_unlock_irqrestore(&rq->lock, flags); return 0; } @@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data) load = cpu_rq(cpu)->load.weight; } else { load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->cfs_rq[cpu]->shares; + load *= tg->se[cpu]->load.weight; load /= tg->parent->cfs_rq[cpu]->load.weight + 1; } @@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data) return 0; } -static void update_shares(struct sched_domain *sd) +static void update_shares(long cpu) { - s64 elapsed; - u64 now; - if (root_task_group_empty()) return; - now = local_clock(); - elapsed = now - sd->last_update; + /* + * XXX: replace with an on-demand list + */ - if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { - sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, sd); - } + walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu); } static void update_h_load(long cpu) @@ -1699,7 +1631,7 @@ static void update_h_load(long cpu) #else -static inline void update_shares(struct sched_domain *sd) +static inline void update_shares(int cpu) { } @@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) -{ -#ifdef CONFIG_SMP - cfs_rq->shares = shares; -#endif -} -#endif - static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); @@ -5551,7 +5474,6 @@ static void update_sysctl(void) SET_SYSCTL(sched_min_granularity); SET_SYSCTL(sched_latency); SET_SYSCTL(sched_wakeup_granularity); - SET_SYSCTL(sched_shares_ratelimit); #undef SET_SYSCTL } @@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->cfs_rq = parent->my_q; se->my_q = cfs_rq; - se->load.weight = tg->shares; - se->load.inv_weight = 0; + update_load_set(&se->load, tg->shares); se->parent = parent; } #endif @@ -7881,10 +7802,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP - update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), - __alignof__(unsigned long)); -#endif for_each_possible_cpu(i) { struct rq *rq; @@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) dequeue_entity(cfs_rq, se, 0); - se->load.weight = shares; - se->load.inv_weight = 0; + update_load_set(&se->load, shares); if (on_rq) enqueue_entity(cfs_rq, se, 0); @@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* * force a rebalance */ - cfs_rq_set_shares(tg->cfs_rq[i], 0); set_se_shares(tg->se[i], shares); } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d17dd9b..e6590e7312e8 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -202,15 +202,22 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) spread0 = min_vruntime - rq0_min_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", SPLIT_NS(spread0)); - SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); - SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP - SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", + SPLIT_NS(cfs_rq->load_avg)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", + SPLIT_NS(cfs_rq->load_period)); + SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", + cfs_rq->load_contribution); + SEQ_printf(m, " .%-30s: %d\n", "load_tg", + atomic_read(&tg->load_weight)); #endif + print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a8326dd0..d86544b4151c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, WRT_SYSCTL(sched_min_granularity); WRT_SYSCTL(sched_latency); WRT_SYSCTL(sched_wakeup_granularity); - WRT_SYSCTL(sched_shares_ratelimit); #undef WRT_SYSCTL return 0; @@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_add(&se->group_node, &cfs_rq->tasks); } cfs_rq->nr_running++; - se->on_rq = 1; } static void @@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) list_del_init(&se->group_node); } cfs_rq->nr_running--; - se->on_rq = 0; } +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void update_cfs_load(struct cfs_rq *cfs_rq) +{ + u64 period = sched_avg_period(); + u64 now, delta; + + if (!cfs_rq) + return; + + now = rq_of(cfs_rq)->clock; + delta = now - cfs_rq->load_stamp; + + cfs_rq->load_stamp = now; + cfs_rq->load_period += delta; + cfs_rq->load_avg += delta * cfs_rq->load.weight; + + while (cfs_rq->load_period > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (cfs_rq->load_period)); + cfs_rq->load_period /= 2; + cfs_rq->load_avg /= 2; + } +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) + account_entity_dequeue(cfs_rq, se); + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq) +{ + struct task_group *tg; + struct sched_entity *se; + long load_weight, load, shares; + + if (!cfs_rq) + return; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se) + return; + + load = cfs_rq->load.weight; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + reweight_entity(cfs_rq_of(se), se, shares); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static inline void update_cfs_load(struct cfs_rq *cfs_rq) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SCHEDSTATS @@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_cfs_load(cfs_rq); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); + se->on_rq = 1; } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + update_cfs_load(cfs_rq); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) flags = ENQUEUE_WAKEUP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq); + } + hrtick_update(rq); } @@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; flags |= DEQUEUE_SLEEP; } + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq); + update_cfs_shares(cfs_rq); + } + hrtick_update(rq); } @@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) * Adding load to a group doesn't make a group heavier, but can cause movement * of group shares between cpus. Assuming the shares were perfectly aligned one * can calculate the shift in shares. - * - * The problem is that perfectly aligning the shares is rather expensive, hence - * we try to avoid doing that too often - see update_shares(), which ratelimits - * this change. - * - * We compensate this by not only taking the current delta into account, but - * also considering the delta between when the shares were last adjusted and - * now. - * - * We still saw a performance dip, some tracing learned us that between - * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased - * significantly. Therefore try to bias the error in direction of failing - * the affine wakeup. - * */ -static long effective_load(struct task_group *tg, int cpu, - long wl, long wg) +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; if (!tg->parent) return wl; - /* - * By not taking the decrease of shares on the other cpu into - * account our error leans towards reducing the affine wakeups. - */ - if (!wl && sched_feat(ASYM_EFF_LOAD)) - return wl; - for_each_sched_entity(se) { long S, rw, s, a, b; - long more_w; - - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; S = se->my_q->tg->shares; - s = se->my_q->shares; - rw = se->my_q->rq_weight; + s = se->load.weight; + rw = se->my_q->load.weight; a = S*(rw + wl); b = S*rw + s*wg; @@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ sd = tmp; } -#ifdef CONFIG_FAIR_GROUP_SCHED - if (sched_feat(LB_SHARES_UPDATE)) { - /* - * Pick the largest domain to update shares over - */ - tmp = sd; - if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) - tmp = affine_sd; - - if (tmp) { - raw_spin_unlock(&rq->lock); - update_shares(tmp); - raw_spin_lock(&rq->lock); - } - } -#endif - if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) return select_idle_sibling(p, cpu); @@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - update_shares(sd); group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, cpus, balance); @@ -3156,8 +3206,6 @@ out_one_pinned: else ld_moved = 0; out: - if (ld_moved) - update_shares(sd); return ld_moved; } @@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) int update_next_balance = 0; int need_serialize; + update_shares(cpu); + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 185f920ec1a2..68e69acc29b9 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_SHARES_UPDATE, 1) -SCHED_FEAT(ASYM_EFF_LOAD, 1) /* * Spin-wait on mutex acquisition when the mutex owner is running on diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b65bf634035e..3132b25193db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -static int min_sched_shares_ratelimit = 100000; /* 100 usec */ -static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif #ifdef CONFIG_COMPACTION @@ -304,15 +302,6 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, - { - .procname = "sched_shares_ratelimit", - .data = &sysctl_sched_shares_ratelimit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sched_proc_update_handler, - .extra1 = &min_sched_shares_ratelimit, - .extra2 = &max_sched_shares_ratelimit, - }, { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -322,14 +311,6 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_tunable_scaling, .extra2 = &max_sched_tunable_scaling, }, - { - .procname = "sched_shares_thresh", - .data = &sysctl_sched_shares_thresh, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - }, { .procname = "sched_migration_cost", .data = &sysctl_sched_migration_cost, -- cgit v1.2.3 From 3d4b47b4b040c9d77dd68104cfc1055d89a55afd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Nov 2010 15:47:01 -0800 Subject: sched: Implement on-demand (active) cfs_rq list Make certain load-balance actions scale per number of active cgroups instead of the number of existing cgroups. This makes wakeup/sleep paths more expensive, but is a win for systems where the vast majority of existing cgroups are idle. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.666535048@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 105 ++++++++++++++-------------------------------------- kernel/sched_fair.c | 46 ++++++++++++++++++++--- kernel/sched_rt.c | 24 ++++++++++++ 3 files changed, 92 insertions(+), 83 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index e2f1a3024a99..22436dd2e19f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -274,9 +274,7 @@ struct task_group { #define root_task_group init_task_group -/* task_group_lock serializes add/remove of task groups and also changes to - * a task group's cpu shares. - */ +/* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -344,6 +342,7 @@ struct cfs_rq { * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This * list is used during load balance. */ + int on_list; struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ @@ -1547,7 +1546,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq); +static void update_cfs_load(struct cfs_rq *cfs_rq, int lb); static void update_cfs_shares(struct cfs_rq *cfs_rq); /* @@ -1570,7 +1569,7 @@ static int tg_shares_up(struct task_group *tg, void *data) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 1); load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); load_avg -= cfs_rq->load_contribution; @@ -7688,15 +7687,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, int add, + struct sched_entity *se, int cpu, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; - if (add) - list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; /* se could be NULL for init_task_group */ @@ -7716,7 +7713,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, #ifdef CONFIG_RT_GROUP_SCHED static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent) { struct rq *rq = cpu_rq(cpu); @@ -7725,8 +7722,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - if (add) - list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; if (!rt_se) @@ -7835,7 +7830,7 @@ void __init sched_init(void) * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ - init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -7843,7 +7838,7 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); #ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); #endif #endif @@ -8119,7 +8114,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } return 1; @@ -8130,15 +8125,22 @@ err: return 0; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, - &cpu_rq(cpu)->leaf_cfs_rq_list); -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { - list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + int i; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[i]); + raw_spin_unlock_irqrestore(&rq->lock, flags); } #else /* !CONFG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) @@ -8151,10 +8153,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -static inline void register_fair_sched_group(struct task_group *tg, int cpu) -{ -} - static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) { } @@ -8209,7 +8207,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } return 1; @@ -8219,17 +8217,6 @@ err_free_rq: err: return 0; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ - list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, - &cpu_rq(cpu)->leaf_rt_rq_list); -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ - list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); -} #else /* !CONFIG_RT_GROUP_SCHED */ static inline void free_rt_sched_group(struct task_group *tg) { @@ -8240,14 +8227,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } - -static inline void register_rt_sched_group(struct task_group *tg, int cpu) -{ -} - -static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) -{ -} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED @@ -8263,7 +8242,6 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; - int i; tg = kzalloc(sizeof(*tg), GFP_KERNEL); if (!tg) @@ -8276,10 +8254,6 @@ struct task_group *sched_create_group(struct task_group *parent) goto err; spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { - register_fair_sched_group(tg, i); - register_rt_sched_group(tg, i); - } list_add_rcu(&tg->list, &task_groups); WARN_ON(!parent); /* root should already exist */ @@ -8309,11 +8283,11 @@ void sched_destroy_group(struct task_group *tg) unsigned long flags; int i; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) { + /* end participation in shares distribution */ + for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); - unregister_rt_sched_group(tg, i); - } + + spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); @@ -8391,7 +8365,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - unsigned long flags; /* * We can't change the weight of the root cgroup. @@ -8408,19 +8381,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (tg->shares == shares) goto done; - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); - list_del_rcu(&tg->siblings); - spin_unlock_irqrestore(&task_group_lock, flags); - - /* wait for any ongoing reference to this group to finish */ - synchronize_sched(); - - /* - * Now we are free to modify the group's share on each cpu - * w/o tripping rebalance_share or load_balance_fair. - */ tg->shares = shares; for_each_possible_cpu(i) { /* @@ -8429,15 +8389,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) set_se_shares(tg->se[i], shares); } - /* - * Enable load balance activity on this group, by inserting it back on - * each cpu's rq->leaf_cfs_rq_list. - */ - spin_lock_irqsave(&task_group_lock, flags); - for_each_possible_cpu(i) - register_fair_sched_group(tg, i); - list_add_rcu(&tg->siblings, &tg->parent->children); - spin_unlock_irqrestore(&task_group_lock, flags); done: mutex_unlock(&shares_mutex); return 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d86544b4151c..0560e72bd732 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -143,6 +143,24 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return cfs_rq->tg->cfs_rq[this_cpu]; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + + cfs_rq->on_list = 1; + } +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->on_list) { + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + cfs_rq->on_list = 0; + } +} + /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) @@ -246,6 +264,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) return &cpu_rq(this_cpu)->cfs; } +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) @@ -648,7 +674,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq) +static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) { u64 period = sched_avg_period(); u64 now, delta; @@ -673,6 +699,11 @@ static void update_cfs_load(struct cfs_rq *cfs_rq) cfs_rq->load_period /= 2; cfs_rq->load_avg /= 2; } + + if (lb && !cfs_rq->nr_running) { + if (cfs_rq->load_avg < (period / 8)) + list_del_leaf_cfs_rq(cfs_rq); + } } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -719,7 +750,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_load(struct cfs_rq *cfs_rq) +static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb) { } @@ -849,7 +880,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); @@ -863,6 +894,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; + + if (cfs_rq->nr_running == 1) + list_add_leaf_cfs_rq(cfs_rq); } static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -907,7 +941,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); @@ -1142,7 +1176,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } @@ -1172,7 +1206,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq); + update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bea7d79f7e9c..c914ec747ca6 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_add_rcu(&rt_rq->leaf_rt_rq_list, + &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_del_rcu(&rt_rq->leaf_rt_rq_list); +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) @@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) return ktime_to_ns(def_rt_bandwidth.rt_period); } +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + #define for_each_leaf_rt_rq(rt_rq, rq) \ for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) @@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; + if (!rt_rq->rt_nr_running) + list_add_leaf_rt_rq(rt_rq); + if (head) list_add(&rt_se->run_list, queue); else @@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); + if (!rt_rq->rt_nr_running) + list_del_leaf_rt_rq(rt_rq); } /* -- cgit v1.2.3 From 9e3081ca61147b29f52fddb4f7c6b6b82ea5eb7a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Nov 2010 15:47:02 -0800 Subject: sched: Make tg_shares_up() walk on-demand Make tg_shares_up() use the active cgroup list, this means we cannot do a strict bottom-up walk of the hierarchy, but assuming its a very wide tree with a small number of active groups it should be a win. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.754159484@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 67 ----------------------------------------------------- kernel/sched_fair.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 67 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 22436dd2e19f..6268d2dc9a91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -279,13 +279,6 @@ static DEFINE_SPINLOCK(task_group_lock); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ - return list_empty(&root_task_group.children); -} -#endif - # define INIT_TASK_GROUP_LOAD NICE_0_LOAD /* @@ -1546,48 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq, int lb); -static void update_cfs_shares(struct cfs_rq *cfs_rq); - -/* - * update tg->load_weight by folding this cpu's load_avg - */ -static int tg_shares_up(struct task_group *tg, void *data) -{ - long load_avg; - struct cfs_rq *cfs_rq; - unsigned long flags; - int cpu = (long)data; - struct rq *rq; - - if (!tg->se[cpu]) - return 0; - - rq = cpu_rq(cpu); - cfs_rq = tg->cfs_rq[cpu]; - - raw_spin_lock_irqsave(&rq->lock, flags); - - update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); - - load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); - load_avg -= cfs_rq->load_contribution; - - atomic_add(load_avg, &tg->load_weight); - cfs_rq->load_contribution += load_avg; - - /* - * We need to update shares after updating tg->load_weight in - * order to adjust the weight of groups with long running tasks. - */ - update_cfs_shares(cfs_rq); - - raw_spin_unlock_irqrestore(&rq->lock, flags); - - return 0; -} - /* * Compute the cpu's hierarchical load factor for each task group. * This needs to be done in a top-down fashion because the load of a child @@ -1611,29 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data) return 0; } -static void update_shares(long cpu) -{ - if (root_task_group_empty()) - return; - - /* - * XXX: replace with an on-demand list - */ - - walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu); -} - static void update_h_load(long cpu) { walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -#else - -static inline void update_shares(int cpu) -{ -} - #endif #ifdef CONFIG_PREEMPT diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0560e72bd732..46ff6587dc16 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2004,6 +2004,60 @@ out: } #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * update tg->load_weight by folding this cpu's load_avg + */ +static int tg_shares_up(struct task_group *tg, int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned long flags; + struct rq *rq; + long load_avg; + + if (!tg->se[cpu]) + return 0; + + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; + + raw_spin_lock_irqsave(&rq->lock, flags); + + update_rq_clock(rq); + update_cfs_load(cfs_rq, 1); + + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; + + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. + */ + update_cfs_shares(cfs_rq); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return 0; +} + +static void update_shares(int cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(cpu); + + rcu_read_lock(); + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct task_group *tg = cfs_rq->tg; + + do { + tg_shares_up(tg, cpu); + tg = tg->parent; + } while (tg); + } + rcu_read_unlock(); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -2051,6 +2105,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return max_load_move - rem_load_move; } #else +static inline void update_shares(int cpu) +{ +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, -- cgit v1.2.3 From f0d7442a5924a802b66eef79b3708f77297bfb35 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:03 -0800 Subject: sched: Fix load corruption from update_cfs_shares() As part of enqueue_entity both a new entity weight and its contribution to the queuing cfs_rq / rq are updated. Since update_cfs_shares will only update the queueing weights when the entity is on_rq (which in this case it is not yet), there's a dependency loop here: update_cfs_shares needs account_entity_enqueue to update cfs_rq->load.weight account_entity_enqueue needs the updated weight for the queuing cfs_rq load[*] Fix this and avoid spurious dequeue/enqueues by issuing update_cfs_shares as if we had accounted the enqueue already. This was also resulting in rq->load corruption previously. [*]: this dependency also exists when using the group cfs_rq w/ update_cfs_shares as the weight of the enqueued entity changes without the load being updated. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.844900206@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 46ff6587dc16..d52b97a04e7a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -718,7 +718,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { struct task_group *tg; struct sched_entity *se; @@ -732,7 +732,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) if (!se) return; - load = cfs_rq->load.weight; + load = cfs_rq->load.weight + weight_delta; load_weight = atomic_read(&tg->load_weight); load_weight -= cfs_rq->load_contribution; @@ -754,7 +754,7 @@ static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -881,8 +881,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -944,7 +944,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); /* * Normalize the entity after updating the min_vruntime because the @@ -1177,7 +1177,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); } hrtick_update(rq); @@ -1207,7 +1207,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); } hrtick_update(rq); @@ -2034,7 +2034,7 @@ static int tg_shares_up(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. */ - update_cfs_shares(cfs_rq); + update_cfs_shares(cfs_rq, 0); raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.3 From e33078baa4d30ad1d0e46d1f62b9e5a63a3e6ee3 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:04 -0800 Subject: sched: Fix update_cfs_load() synchronization Using cfs_rq->nr_running is not sufficient to synchronize update_cfs_load with the put path since nr_running accounting occurs at deactivation. It's also not safe to make the removal decision based on load_avg as this fails with both high periods and low shares. Resolve this by clipping history after 4 periods without activity. Note: the above will always occur from update_shares() since in the last-task-sleep-case that task will still be cfs_rq->curr when update_cfs_load is called. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20101115234937.933428187@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 33 +++++++++++++++++++++------------ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 6268d2dc9a91..dadab4d13875 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -355,7 +355,7 @@ struct cfs_rq { u64 load_avg; u64 load_period; - u64 load_stamp; + u64 load_stamp, load_last; unsigned long load_contribution; #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d52b97a04e7a..a543a5b202a4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -674,10 +674,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) +static void update_cfs_load(struct cfs_rq *cfs_rq) { u64 period = sched_avg_period(); u64 now, delta; + unsigned long load = cfs_rq->load.weight; if (!cfs_rq) return; @@ -685,9 +686,19 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) now = rq_of(cfs_rq)->clock; delta = now - cfs_rq->load_stamp; + /* truncate load history at 4 idle periods */ + if (cfs_rq->load_stamp > cfs_rq->load_last && + now - cfs_rq->load_last > 4 * period) { + cfs_rq->load_period = 0; + cfs_rq->load_avg = 0; + } + cfs_rq->load_stamp = now; cfs_rq->load_period += delta; - cfs_rq->load_avg += delta * cfs_rq->load.weight; + if (load) { + cfs_rq->load_last = now; + cfs_rq->load_avg += delta * load; + } while (cfs_rq->load_period > period) { /* @@ -700,10 +711,8 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int lb) cfs_rq->load_avg /= 2; } - if (lb && !cfs_rq->nr_running) { - if (cfs_rq->load_avg < (period / 8)) - list_del_leaf_cfs_rq(cfs_rq); - } + if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) + list_del_leaf_cfs_rq(cfs_rq); } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, @@ -750,7 +759,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_load(struct cfs_rq *cfs_rq, int lb) +static inline void update_cfs_load(struct cfs_rq *cfs_rq) { } @@ -880,7 +889,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); @@ -941,7 +950,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq, 0); @@ -1176,7 +1185,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); update_cfs_shares(cfs_rq, 0); } @@ -1206,7 +1215,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - update_cfs_load(cfs_rq, 0); + update_cfs_load(cfs_rq); update_cfs_shares(cfs_rq, 0); } @@ -2023,7 +2032,7 @@ static int tg_shares_up(struct task_group *tg, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_load(cfs_rq, 1); + update_cfs_load(cfs_rq); load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); load_avg -= cfs_rq->load_contribution; -- cgit v1.2.3 From 67e86250f8ea7b8f7da53ac25ea73c6bd71f5cd9 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Mon, 15 Nov 2010 15:47:05 -0800 Subject: sched: Introduce hierarchal order on shares update list Avo