summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpu.c18
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/mutex.c2
-rw-r--r--kernel/printk.c8
-rw-r--r--kernel/sched.c565
-rw-r--r--kernel/sched_autogroup.c238
-rw-r--r--kernel/sched_autogroup.h32
-rw-r--r--kernel/sched_clock.c2
-rw-r--r--kernel/sched_debug.c91
-rw-r--r--kernel/sched_fair.c322
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_rt.c24
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/sysctl.c37
-rw-r--r--kernel/trace/trace_selftest.c2
-rw-r--r--kernel/watchdog.c2
19 files changed, 783 insertions, 583 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..cb7a1efa9c2b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
}
struct take_cpu_down_param {
- struct task_struct *caller;
unsigned long mod;
void *hcpu;
};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
static int __ref take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
- unsigned int cpu = (unsigned long)param->hcpu;
int err;
/* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
cpu_notify(CPU_DYING | param->mod, param->hcpu);
- if (task_cpu(param->caller) == cpu)
- move_task_off_dead_cpu(cpu, param->caller);
- /* Force idle task to run as soon as we yield: it should
- immediately notice cpu is offline and die quickly. */
- sched_idle_next();
return 0;
}
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
void *hcpu = (void *)(long)cpu;
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
struct take_cpu_down_param tcd_param = {
- .caller = current,
.mod = mod,
.hcpu = hcpu,
};
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
}
BUG_ON(cpu_online(cpu));
- /* Wait for it to sleep (leaving idle task). */
+ /*
+ * The migration_call() CPU_DYING callback will have removed all
+ * runnable tasks from the cpu, there's only the idle task left now
+ * that the migration thread is done doing the stop_machine thing.
+ *
+ * Wait for the stop thread to go away.
+ */
while (!idle_cpu(cpu))
- yield();
+ cpu_relax();
/* This actually kills the CPU. */
__cpu_die(cpu);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5447dc7defa9..7d164e25b0f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
static inline void put_signal_struct(struct signal_struct *sig)
{
- if (atomic_dec_and_test(&sig->sigcnt))
+ if (atomic_dec_and_test(&sig->sigcnt)) {
+ sched_autogroup_exit(sig);
free_signal_struct(sig);
+ }
}
void __put_task_struct(struct task_struct *tsk)
@@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
posix_cpu_timers_init_group(sig);
tty_audit_fork(sig);
+ sched_autogroup_fork(sig);
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
}
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
- free_signal_struct(p->signal);
+ put_signal_struct(p->signal);
bad_fork_cleanup_sighand:
__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..91a5fa25054e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
*/
static int irq_thread(void *data)
{
- struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+ static struct sched_param param = {
+ .sched_priority = MAX_USER_RT_PRIO/2,
+ };
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
int wake, oneshot = desc->status & IRQ_ONESHOT;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ca61bbdd44b2..5355cfd44a3f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
wait_for_completion(&create.done);
if (!IS_ERR(create.result)) {
- struct sched_param param = { .sched_priority = 0 };
+ static struct sched_param param = { .sched_priority = 0 };
va_list args;
va_start(args, namefmt);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- cpu_relax();
+ arch_mutex_cpu_relax();
}
#endif
spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498..ab3ffc5b3b64 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
void printk_tick(void)
{
- if (__get_cpu_var(printk_pending)) {
- __get_cpu_var(printk_pending) = 0;
+ if (__this_cpu_read(printk_pending)) {
+ __this_cpu_write(printk_pending, 0);
wake_up_interruptible(&log_wait);
}
}
int printk_needs_cpu(int cpu)
{
- if (unlikely(cpu_is_offline(cpu)))
+ if (cpu_is_offline(cpu))
printk_tick();
- return per_cpu(printk_pending, cpu);
+ return __this_cpu_read(printk_pending);
}
void wake_up_klogd(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index 297d1a0eedb0..f2f914e0c47c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
#include <asm/tlb.h>
#include <asm/irq_regs.h>
+#include <asm/mutex.h>
#include "sched_cpupri.h"
#include "workqueue_sched.h"
+#include "sched_autogroup.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
/* runqueue "owned" by this group on each cpu */
struct cfs_rq **cfs_rq;
unsigned long shares;
+
+ atomic_t load_weight;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
struct task_group *parent;
struct list_head siblings;
struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
};
#define root_task_group init_task_group
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
- return list_empty(&root_task_group.children);
-}
-#endif
-
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
/*
@@ -342,6 +341,7 @@ struct cfs_rq {
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
* list is used during load balance.
*/
+ int on_list;
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
@@ -360,14 +360,17 @@ struct cfs_rq {
unsigned long h_load;
/*
- * this cpu's part of tg->shares
+ * Maintaining per-cpu shares distribution for group scheduling
+ *
+ * load_stamp is the last time we updated the load average
+ * load_last is the last time we updated the load average and saw load
+ * load_unacc_exec_time is currently unaccounted execution time
*/
- unsigned long shares;
+ u64 load_avg;
+ u64 load_period;
+ u64 load_stamp, load_last, load_unacc_exec_time;
- /*
- * load.weight at the time we set shares
- */
- unsigned long rq_weight;
+ unsigned long load_contribution;
#endif
#endif
};
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
*/
static inline struct task_group *task_group(struct task_struct *p)
{
+ struct task_group *tg;
struct cgroup_subsys_state *css;
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
lockdep_is_held(&task_rq(p)->lock));
- return container_of(css, struct task_group, css);
+ tg = container_of(css, struct task_group, css);
+
+ return autogroup_task_group(p, tg);
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -793,20 +799,6 @@ late_initcall(sched_init_debug);
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
-/*
* period over which we average the RT time consumption, measured
* in ms.
*
@@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
lw->inv_weight = 0;
}
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
#ifdef CONFIG_FAIR_GROUP_SCHED
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares,
- unsigned long sd_rq_weight,
- unsigned long *usd_rq_weight)
-{
- unsigned long shares, rq_weight;
- int boost = 0;
-
- rq_weight = usd_rq_weight[cpu];
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- /*
- * \Sum_j shares_j * rq_weight_i
- * shares_i = -----------------------------
- * \Sum_j rq_weight_j
- */
- shares = (sd_shares * rq_weight) / sd_rq_weight;
- shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
- if (abs(shares - tg->se[cpu]->load.weight) >
- sysctl_sched_shares_thresh) {
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
- tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
- __set_se_shares(tg->se[cpu], shares);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
- unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
- unsigned long *usd_rq_weight;
- struct sched_domain *sd = data;
- unsigned long flags;
- int i;
-
- if (!tg->se[0])
- return 0;
-
- local_irq_save(flags);
- usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
- for_each_cpu(i, sched_domain_span(sd)) {
- weight = tg->cfs_rq[i]->load.weight;
- usd_rq_weight[i] = weight;
-
- rq_weight += weight;
- /*
- * If there are currently no tasks on the cpu pretend there
- * is one of average load so that when a new task gets to
- * run here it will not get delayed by group starvation.
- */
- if (!weight)
- weight = NICE_0_LOAD;
-
- sum_weight += weight;
- shares += tg->cfs_rq[i]->shares;
- }
-
- if (!rq_weight)
- rq_weight = sum_weight;
-
- if ((!shares && rq_weight) || shares > tg->shares)
- shares = tg->shares;
-
- if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
- shares = tg->shares;
-
- for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
- local_irq_restore(flags);
-
- return 0;
-}
-
/*
* Compute the cpu's hierarchical load factor for each task group.
* This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
load = cpu_rq(cpu)->load.weight;
} else {
load = tg->parent->cfs_rq[cpu]->h_load;
- load *= tg->cfs_rq[cpu]->shares;
+ load *= tg->se[cpu]->load.weight;
load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
}
@@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
return 0;
}
-static void update_shares(struct sched_domain *sd)
-{
- s64 elapsed;
- u64 now;
-
- if (root_task_group_empty())
- return;
-
- now = local_clock();
- elapsed = now - sd->last_update;
-
- if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
- sd->last_update = now;
- walk_tg_tree(tg_nop, tg_shares_up, sd);
- }
-}
-
static void update_h_load(long cpu)
{
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
#endif
#ifdef CONFIG_PREEMPT
@@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
- cfs_rq->shares = shares;
-#endif
-}
-#endif
-
static void calc_load_account_idle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
@@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_autogroup.c"
#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
@@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
* The task's runqueue lock must be held.
* Returns true if you have to wait for migration thread.
*/
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool migrate_task(struct task_struct *p, struct rq *rq)
{
- struct rq *rq = task_rq(p);
-
/*
* If the task is not on a runqueue (and not running), then
* the next wake-up will properly place the task.
@@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
return dest_cpu;
/* No more Mr. Nice Guy. */
- if (unlikely(dest_cpu >= nr_cpu_ids)) {
- dest_cpu = cpuset_cpus_allowed_fallback(p);
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit()) {
- printk(KERN_INFO "process %d (%s) no "
- "longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, cpu);
- }
+ dest_cpu = cpuset_cpus_allowed_fallback(p);
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
}
return dest_cpu;
@@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
#endif
+#ifdef CONFIG_SMP
plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
put_cpu();
}
@@ -3549,7 +3418,7 @@ void sched_exec(void)
* select_task_rq() can race against ->cpus_allowed
*/
if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+ likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
task_rq_unlock(rq, &flags);
@@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
if (task_thread_info(rq->curr) != owner || need_resched())
return 0;
- cpu_relax();
+ arch_mutex_cpu_relax();
}
return 1;
@@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
}
static int __sched_setscheduler(struct task_struct *p, int policy,
- struct sched_param *param, bool user)
+ const struct sched_param *param, bool user)
{
int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
@@ -5056,7 +4925,7 @@ recheck:
* NOTE that the task may be already dead.
*/
int sched_setscheduler(struct task_struct *p, int policy,
- struct sched_param *param)
+ const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, true);
}
@@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
* but our caller might not have that capability.
*/
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- struct sched_param *param)
+ const struct sched_param *param)
{
return __sched_setscheduler(p, policy, param, false);
}
@@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
unsigned state;
state = p->state ? __ffs(p->state) + 1 : 0;
- printk(KERN_INFO "%-13.13s %c", p->comm,
+ printk(KERN_INFO "%-15.15s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
#if BITS_PER_LONG == 32
if (state == TASK_RUNNING)
@@ -5754,7 +5623,6 @@ static void update_sysctl(void)
SET_SYSCTL(sched_min_granularity);
SET_SYSCTL(sched_latency);
SET_SYSCTL(sched_wakeup_granularity);
- SET_SYSCTL(sched_shares_ratelimit);
#undef SET_SYSCTL
}
@@ -5830,7 +5698,7 @@ again:
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (migrate_task(p, dest_cpu)) {
+ if (migrate_task(p, rq)) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, &flags);
@@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data)
}
#ifdef CONFIG_HOTPLUG_CPU
+
/*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
*/
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
{
- struct rq *rq = cpu_rq(dead_cpu);
- int needs_cpu, uninitialized_var(dest_cpu);
- unsigned long flags;
+ struct mm_struct *mm = current->active_mm;
- local_irq_save(flags);
+ BUG_ON(cpu_online(smp_processor_id()));
- raw_spin_lock(&rq->lock);
- needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
- if (needs_cpu)
- dest_cpu = select_fallback_rq(dead_cpu, p);
- raw_spin_unlock(&rq->lock);
- /*
- * It can only fail if we race with set_cpus_allowed(),
- * in the racer should migrate the task anyway.
- */
- if (needs_cpu)
- __migrate_task(p, dead_cpu, dest_cpu);
- local_irq_restore(flags);
+ if (mm != &init_mm)
+ switch_mm(mm, &init_mm, current);
+ mmdrop(mm);
}
/*
@@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
- unsigned long flags;
- local_irq_save(flags);
- double_rq_lock(rq_src, rq_dest);
rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
rq_src->nr_uninterruptible = 0;
- double_rq_unlock(rq_src, rq_dest);
- local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
- struct task_struct *p, *t;
-
- read_lock(&tasklist_lock);
-
- do_each_thread(t, p) {
- if (p == current)
- continue;
-
- if (task_cpu(p) == src_cpu)
- move_task_off_dead_cpu(src_cpu, p);
- } while_each_thread(t, p);
-
- read_unlock(&tasklist_lock);
}
/*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
*/
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
{
- int this_cpu = smp_processor_id();
- struct rq *rq = cpu_rq(this_cpu);
- struct task_struct *p = rq->idle;
- unsigned long flags;
-
- /* cpu has to be offline */
- BUG_ON(cpu_online(this_cpu));
-
- /*
- * Strictly not necessary since rest of the CPUs are stopped by now
- * and interrupts disabled on the current cpu.
- */
- raw_spin_lock_irqsave(&rq->lock, flags);
-
- __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
- activate_task(rq, p, 0);
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+ rq->calc_load_active = 0;
}
/*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
*/
-void idle_task_exit(void)
-{
- struct mm_struct *mm = current->active_mm;
-
- BUG_ON(cpu_online(smp_processor_id()));
-
- if (mm != &init_mm)
- switch_mm(mm, &init_mm, current);
- mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
{
struct rq *rq = cpu_rq(dead_cpu);
-
- /* Must be exiting, otherwise would be on tasklist. */
- BUG_ON(!p->exit_state);
-
- /* Cannot have done final schedule yet: would have vanished. */
- BUG_ON(p->state == TASK_DEAD);
-
- get_task_struct(p);
+ struct task_struct *next, *stop = rq->stop;
+ int dest_cpu;
/*
- * Drop lock around migration; if someone else moves it,
- * that's OK. No task can be added to this CPU, so iteration is
- * fine.
+ * Fudge the rq selection such that the below task selection loop
+ * doesn't get stuck on the currently eligible stop task.
+ *
+ * We're currently inside stop_machine() and the rq is either stuck
+ * in the stop_machine_cpu_stop() loop, or we're executing this code,
+ * either way we should never end up calling schedule() until we're
+ * done here.
*/
- raw_spin_unlock_irq(&rq->lock);
- move_task_off_dead_cpu(dead_cpu, p);
- raw_spin_lock_irq(&rq->lock);
-
- put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
- struct rq *rq = cpu_rq(dead_cpu);
- struct task_struct *next;
+ rq->stop = NULL;
for ( ; ; ) {
- if (!rq->nr_running)
+ /*
+ * There's this thread running, bail when that's the only
+ * remaining thread.
+ */
+ if (rq->nr_running == 1)
break;
+
next = pick_next_task(rq);
- if (!next)
- break;
+ BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
- migrate_dead(dead_cpu, next);
+ /* Find suitable destination for @next, with force if needed. */
+ dest_cpu = select_fallback_rq(dead_cpu, next);
+ raw_spin_unlock(&rq->lock);
+
+ __migrate_task(next, dead_cpu, dest_cpu);
+
+ raw_spin_lock(&rq->lock);
}
-}
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
- atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- rq->calc_load_active = 0;
+ rq->stop = stop;
}
+
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
unsigned long flags;
struct rq *rq = cpu_rq(cpu);
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
rq->calc_load_update = calc_load_update;
break;
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
@@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- migrate_live_tasks(cpu);
- /* Idle task back to normal (off runqueue, low prio) */
- raw_spin_lock_irq(&rq->lock);
- deactivate_task(rq, rq->idle, 0);
- __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
- rq->idle->sched_class = &idle_sched_class;
- migrate_dead_tasks(cpu);
- raw_spin_unlock_irq(&rq->lock);
- migrate_nr_uninterruptible(rq);
- BUG_ON(rq->nr_running != 0);
- calc_global_load_remove(rq);
- break;
-
case CPU_DYING:
- case CPU_DYING_FROZEN:
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
+ migrate_tasks(cpu);
+ BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ migrate_nr_uninterruptible(rq);
+ calc_global_load_remove(rq);
break;
#endif
}
@@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
- struct sched_entity *se, int cpu, int add,
+ struct sched_entity *se, int cpu,
struct sched_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
tg->cfs_rq[cpu] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = tg;
- if (add)
- list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
tg->se[cpu] = se;
/* se could be NULL for init_task_group */
@@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
- se->load.weight = tg->shares;
- se->load.inv_weight = 0;
+ update_load_set(&se->load, 0);
se->parent = parent;
}
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
- struct sched_rt_entity *rt_se, int cpu, int add,
+ struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent)
{
struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
init_rt_rq(rt_rq, rq);
rt_rq->tg = tg;
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
- if (add)
- list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
tg->rt_se[cpu] = rt_se;
if (!rt_se)
@@ -8164,13 +7946,9 @@ void __init sched_init(void)
#ifdef CONFIG_CGROUP_SCHED
list_add(&init_task_group.list, &task_groups);
INIT_LIST_HEAD(&init_task_group.children);
-
+ autogroup_init(&init_task);
#endif /* CONFIG_CGROUP_SCHED */
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
- update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
- __alignof__(unsigned long));
-#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -8184,7 +7962,6 @@ void __init sched_init(void)
#ifdef CONFIG_FAIR_GROUP_SCHED
init_task_group.shares = init_task_group_load;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
/*
* How much cpu bandwidth does init_task_group get?
*
@@ -8204,16 +7981,13 @@ void __init sched_init(void)
* We achieve this by letting init_task_group's tasks sit
* directly in rq->cfs (i.e init_task_group->se[] = NULL).
*/
- init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+ init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
- init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+ init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8488,7 +8262,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
if (!se)
goto err_free_rq;
- init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
}
return 1;
@@ -8499,15 +8273,21 @@ err:
return 0;
}
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
- list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
- &cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
- list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
#else /* !CONFG_FAIR_GROUP_SCHED */
static inline void free_fa