path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/delayacct.c       |    2
-rw-r--r--  kernel/exit.c            |    6
-rw-r--r--  kernel/fork.c            |    3
-rw-r--r--  kernel/ksysfs.c          |    8
-rw-r--r--  kernel/sched.c           | 1444
-rw-r--r--  kernel/sched_debug.c     |  282
-rw-r--r--  kernel/sched_fair.c      |  811
-rw-r--r--  kernel/sched_idletask.c  |    8
-rw-r--r--  kernel/sched_rt.c        |   19
-rw-r--r--  kernel/sched_stats.h     |   28
-rw-r--r--  kernel/sysctl.c          |   37
-rw-r--r--  kernel/user.c            |  249
12 files changed, 1654 insertions, 1243 deletions
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 81e697829633..09e9574eeb26 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -119,7 +119,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
* No locking available for sched_info (and too expensive to add one)
* Mitigate by taking snapshot of values
*/
- t1 = tsk->sched_info.pcnt;
+ t1 = tsk->sched_info.pcount;
t2 = tsk->sched_info.run_delay;
t3 = tsk->sched_info.cpu_time;
diff --git a/kernel/exit.c b/kernel/exit.c
index 993369ee94d1..7f7959de4a87 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -111,6 +111,7 @@ static void __exit_signal(struct task_struct *tsk)
*/
sig->utime = cputime_add(sig->utime, tsk->utime);
sig->stime = cputime_add(sig->stime, tsk->stime);
+ sig->gtime = cputime_add(sig->gtime, tsk->gtime);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -1242,6 +1243,11 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
cputime_add(p->stime,
cputime_add(sig->stime,
sig->cstime)));
+ psig->cgtime =
+ cputime_add(psig->cgtime,
+ cputime_add(p->gtime,
+ cputime_add(sig->gtime,
+ sig->cgtime)));
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index 5e67f90a1694..3fc3c1383912 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -877,6 +877,8 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
sig->tty_old_pgrp = NULL;
sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+ sig->gtime = cputime_zero;
+ sig->cgtime = cputime_zero;
sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
@@ -1045,6 +1047,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = cputime_zero;
p->stime = cputime_zero;
+ p->gtime = cputime_zero;
#ifdef CONFIG_TASK_XACCT
p->rchar = 0; /* I/O counter: bytes read */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d0e5c48e18c7..6046939d0804 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kexec.h>
+#include <linux/sched.h>
#define KERNEL_ATTR_RO(_name) \
static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
&notes_attr);
}
+ /*
+ * Create "/sys/kernel/uids" directory and corresponding root user's
+ * directory under it.
+ */
+ if (!error)
+ error = uids_kobject_init();
+
return error;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 6c10fa796ca0..bba57adb9504 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -96,7 +96,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
/*
* Some helpers for converting nanosecond timing to jiffy resolution
*/
-#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
+#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
#define NICE_0_LOAD SCHED_LOAD_SCALE
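(For orientation, a worked conversion with the helper above, assuming HZ = 1000 purely for illustration: NS_TO_JIFFIES(5000000) = 5000000 / (1000000000 / 1000) = 5 jiffies. The added cast only narrows the dividend to unsigned long before the division; the arithmetic is otherwise unchanged.)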
@@ -105,11 +105,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
/*
* These are the 'tuning knobs' of the scheduler:
*
- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
* Timeslices get refilled after they expire.
*/
-#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE (100 * HZ / 1000)
#ifdef CONFIG_SMP
@@ -133,24 +131,6 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
}
#endif
-#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-
-/*
- * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- */
-static unsigned int static_prio_timeslice(int static_prio)
-{
- if (static_prio == NICE_TO_PRIO(19))
- return 1;
-
- if (static_prio < NICE_TO_PRIO(0))
- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
- else
- return SCALE_PRIO(DEF_TIMESLICE, static_prio);
-}
-
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
@@ -171,31 +151,91 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO];
};
-struct load_stat {
- struct load_weight load;
- u64 load_update_start, load_update_last;
- unsigned long delta_fair, delta_exec, delta_stat;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_group {
+ /* schedulable entities of this group on each cpu */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each cpu */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
+ /* spinlock to serialize modification to shares */
+ spinlock_t lock;
+};
+
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
+
+/* Default task group.
+ * Every task in system belong to this group at bootup.
+ */
+struct task_group init_task_group = {
+ .se = init_sched_entity_p,
+ .cfs_rq = init_cfs_rq_p,
};
+#ifdef CONFIG_FAIR_USER_SCHED
+# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD
+#else
+# define INIT_TASK_GRP_LOAD NICE_0_LOAD
+#endif
+
+static int init_task_group_load = INIT_TASK_GRP_LOAD;
+
+/* return group to which a task belongs */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ struct task_group *tg;
+
+#ifdef CONFIG_FAIR_USER_SCHED
+ tg = p->user->tg;
+#else
+ tg = &init_task_group;
+#endif
+
+ return tg;
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+ p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)];
+ p->se.parent = task_group(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
- s64 fair_clock;
u64 exec_clock;
- s64 wait_runtime;
- u64 sleeper_bonus;
- unsigned long wait_runtime_overruns, wait_runtime_underruns;
+ u64 min_vruntime;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct rb_node *rb_load_balance_curr;
-#ifdef CONFIG_FAIR_GROUP_SCHED
/* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
+
+ unsigned long nr_spread_over;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
@@ -206,6 +246,8 @@ struct cfs_rq {
* list is used during load balance.
*/
struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+ struct task_group *tg; /* group that "owns" this runqueue */
+ struct rcu_head rcu;
#endif
};
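As an aside, the shares field in the task_group structure introduced above is the group's relative CPU weight: runnable groups are serviced roughly in proportion to their shares (and INIT_TASK_GRP_LOAD gives the default group 2*NICE_0_LOAD under CONFIG_FAIR_USER_SCHED). A minimal standalone sketch of that proportionality, with made-up share values, not part of the patch:

#include <stdio.h>

/* Illustrative only: runnable groups share the CPU roughly in
 * proportion to their ->shares values (numbers are hypothetical). */
int main(void)
{
        unsigned long shares[] = { 1024, 2048, 1024 };
        unsigned long total = 0;
        int i;

        for (i = 0; i < 3; i++)
                total += shares[i];
        for (i = 0; i < 3; i++)
                printf("group %d gets ~%lu%% of the CPU\n",
                       i, 100UL * shares[i] / total);
        return 0;
}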
@@ -237,7 +279,7 @@ struct rq {
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
- struct load_stat ls; /* capture load from *all* tasks on this cpu */
+ struct load_weight load; /* capture load from *all* tasks on this cpu */
unsigned long nr_load_updates;
u64 nr_switches;
@@ -289,16 +331,19 @@ struct rq {
unsigned long yld_exp_empty;
unsigned long yld_act_empty;
unsigned long yld_both_empty;
- unsigned long yld_cnt;
+ unsigned long yld_count;
/* schedule() stats */
unsigned long sched_switch;
- unsigned long sched_cnt;
+ unsigned long sched_count;
unsigned long sched_goidle;
/* try_to_wake_up() stats */
- unsigned long ttwu_cnt;
+ unsigned long ttwu_count;
unsigned long ttwu_local;
+
+ /* BKL stats */
+ unsigned long bkl_count;
#endif
struct lock_class_key rq_lock_key;
};
@@ -383,6 +428,37 @@ static void update_rq_clock(struct rq *rq)
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# define const_debug __read_mostly
+#else
+# define const_debug static const
+#endif
+
+/*
+ * Debugging: various feature bits
+ */
+enum {
+ SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
+ SCHED_FEAT_START_DEBIT = 2,
+ SCHED_FEAT_TREE_AVG = 4,
+ SCHED_FEAT_APPROX_AVG = 8,
+ SCHED_FEAT_WAKEUP_PREEMPT = 16,
+ SCHED_FEAT_PREEMPT_RESTRICT = 32,
+};
+
+const_debug unsigned int sysctl_sched_features =
+ SCHED_FEAT_NEW_FAIR_SLEEPERS *1 |
+ SCHED_FEAT_START_DEBIT *1 |
+ SCHED_FEAT_TREE_AVG *0 |
+ SCHED_FEAT_APPROX_AVG *0 |
+ SCHED_FEAT_WAKEUP_PREEMPT *1 |
+ SCHED_FEAT_PREEMPT_RESTRICT *1;
+
+#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
+
+/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
* clock constructed from sched_clock():
*/
@@ -400,18 +476,7 @@ unsigned long long cpu_clock(int cpu)
return now;
}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Change a task's ->cfs_rq if it moves across CPUs */
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
- p->se.cfs_rq = &task_rq(p)->cfs;
-}
-#else
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-}
-#endif
+EXPORT_SYMBOL_GPL(cpu_clock);
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
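The sysctl_sched_features value built in the hunk above is simply a bitmask of the SCHED_FEAT_* flags whose multiplier is 1, and sched_feat(x) tests one bit of it. A standalone sketch that mirrors the same encoding (illustrative only, not part of the patch):

#include <stdio.h>

/* Mirror of the feature-bit encoding above, for illustration only. */
enum {
        SCHED_FEAT_NEW_FAIR_SLEEPERS    = 1,
        SCHED_FEAT_START_DEBIT          = 2,
        SCHED_FEAT_TREE_AVG             = 4,
        SCHED_FEAT_APPROX_AVG           = 8,
        SCHED_FEAT_WAKEUP_PREEMPT       = 16,
        SCHED_FEAT_PREEMPT_RESTRICT     = 32,
};

static const unsigned int sysctl_sched_features =
        SCHED_FEAT_NEW_FAIR_SLEEPERS *1 |
        SCHED_FEAT_START_DEBIT *1 |
        SCHED_FEAT_TREE_AVG *0 |
        SCHED_FEAT_APPROX_AVG *0 |
        SCHED_FEAT_WAKEUP_PREEMPT *1 |
        SCHED_FEAT_PREEMPT_RESTRICT *1;

#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)

int main(void)
{
        /* default mask: 1 + 2 + 16 + 32 = 0x33 */
        printf("features = %#x, TREE_AVG enabled: %d\n",
               sysctl_sched_features, !!sched_feat(TREE_AVG));
        return 0;
}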
@@ -497,16 +562,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
- struct rq *rq;
-
-repeat_lock_task:
- rq = task_rq(p);
- spin_lock(&rq->lock);
- if (unlikely(rq != task_rq(p))) {
+ for (;;) {
+ struct rq *rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ return rq;
spin_unlock(&rq->lock);
- goto repeat_lock_task;
}
- return rq;
}
/*
@@ -519,18 +581,17 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
{
struct rq *rq;
-repeat_lock_task:
- local_irq_save(*flags);
- rq = task_rq(p);
- spin_lock(&rq->lock);
- if (unlikely(rq != task_rq(p))) {
+ for (;;) {
+ local_irq_save(*flags);
+ rq = task_rq(p);
+ spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ return rq;
spin_unlock_irqrestore(&rq->lock, *flags);
- goto repeat_lock_task;
}
- return rq;
}
-static inline void __task_rq_unlock(struct rq *rq)
+static void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
spin_unlock(&rq->lock);
@@ -545,7 +606,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
-static inline struct rq *this_rq_lock(void)
+static struct rq *this_rq_lock(void)
__acquires(rq->lock)
{
struct rq *rq;
@@ -645,19 +706,6 @@ static inline void resched_task(struct task_struct *p)
}
#endif
-static u64 div64_likely32(u64 divident, unsigned long divisor)
-{
-#if BITS_PER_LONG == 32
- if (likely(divident <= 0xffffffffULL))
- return (u32)divident / divisor;
- do_div(divident, divisor);
-
- return divident;
-#else
- return divident / divisor;
-#endif
-}
-
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
@@ -699,16 +747,14 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}
-static void update_load_add(struct load_weight *lw, unsigned long inc)
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
- lw->inv_weight = 0;
}
-static void update_load_sub(struct load_weight *lw, unsigned long dec)
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
- lw->inv_weight = 0;
}
/*
@@ -784,29 +830,20 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
int *this_best_prio, struct rq_iterator *iterator);
#include "sched_stats.h"
-#include "sched_rt.c"
-#include "sched_fair.c"
#include "sched_idletask.c"
+#include "sched_fair.c"
+#include "sched_rt.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
#define sched_class_highest (&rt_sched_class)
-static void __update_curr_load(struct rq *rq, struct load_stat *ls)
-{
- if (rq->curr != rq->idle && ls->load.weight) {
- ls->delta_exec += ls->delta_stat;
- ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
- ls->delta_stat = 0;
- }
-}
-
/*
* Update delta_exec, delta_fair fields for rq.
*
* delta_fair clock advances at a rate inversely proportional to
- * total load (rq->ls.load.weight) on the runqueue, while
+ * total load (rq->load.weight) on the runqueue, while
* delta_exec advances at the same rate as wall-clock (provided
* cpu is not idle).
*
@@ -814,35 +851,17 @@ static void __update_curr_load(struct rq *rq, struct load_stat *ls)
* runqueue over any given interval. This (smoothened) load is used
* during load balance.
*
- * This function is called /before/ updating rq->ls.load
+ * This function is called /before/ updating rq->load
* and when switching tasks.
*/
-static void update_curr_load(struct rq *rq)
-{
- struct load_stat *ls = &rq->ls;
- u64 start;
-
- start = ls->load_update_start;
- ls->load_update_start = rq->clock;
- ls->delta_stat += rq->clock - start;
- /*
- * Stagger updates to ls->delta_fair. Very frequent updates
- * can be expensive.
- */
- if (ls->delta_stat >= sysctl_sched_stat_granularity)
- __update_curr_load(rq, ls);
-}
-
static inline void inc_load(struct rq *rq, const struct task_struct *p)
{
- update_curr_load(rq);
- update_load_add(&rq->ls.load, p->se.load.weight);
+ update_load_add(&rq->load, p->se.load.weight);
}
static inline void dec_load(struct rq *rq, const struct task_struct *p)
{
- update_curr_load(rq);
- update_load_sub(&rq->ls.load, p->se.load.weight);
+ update_load_sub(&rq->load, p->se.load.weight);
}
static void inc_nr_running(struct task_struct *p, struct rq *rq)
@@ -859,8 +878,6 @@ static void dec_nr_running(struct task_struct *p, struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
- p->se.wait_runtime = 0;
-
if (task_has_rt_policy(p)) {
p->se.load.weight = prio_to_weight[0] * 2;
p->se.load.inv_weight = prio_to_wmult[0] >> 1;
@@ -952,20 +969,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
}
/*
- * activate_idle_task - move idle task to the _front_ of runqueue.
- */
-static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
-{
- update_rq_clock(rq);
-
- if (p->state == TASK_UNINTERRUPTIBLE)
- rq->nr_uninterruptible--;
-
- enqueue_task(rq, p, 0);
- inc_nr_running(p, rq);
-}
-
-/*
* deactivate_task - remove a task from the runqueue.
*/
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
@@ -989,32 +992,50 @@ inline int task_curr(const struct task_struct *p)
/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->ls.load.weight;
+ return cpu_rq(cpu)->load.weight;
}
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_SMP
task_thread_info(p)->cpu = cpu;
- set_task_cfs_rq(p);
#endif
+ set_task_cfs_rq(p);
}
#ifdef CONFIG_SMP
+/*
+ * Is this task likely cache-hot:
+ */
+static inline int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+{
+ s64 delta;
+
+ if (p->sched_class != &fair_sched_class)
+ return 0;
+
+ if (sysctl_sched_migration_cost == -1)
+ return 1;
+ if (sysctl_sched_migration_cost == 0)
+ return 0;
+
+ delta = now - p->se.exec_start;
+
+ return delta < (s64)sysctl_sched_migration_cost;
+}
+
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
int old_cpu = task_cpu(p);
struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
- u64 clock_offset, fair_clock_offset;
+ struct cfs_rq *old_cfsrq = task_cfs_rq(p),
+ *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
+ u64 clock_offset;
clock_offset = old_rq->clock - new_rq->clock;
- fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
-
- if (p->se.wait_start_fair)
- p->se.wait_start_fair -= fair_clock_offset;
- if (p->se.sleep_start_fair)
- p->se.sleep_start_fair -= fair_clock_offset;
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
@@ -1023,7 +1044,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
+ if (old_cpu != new_cpu) {
+ schedstat_inc(p, se.nr_migrations);
+ if (task_hot(p, old_rq->clock, NULL))
+ schedstat_inc(p, se.nr_forced2_migrations);
+ }
#endif
+ p->se.vruntime -= old_cfsrq->min_vruntime -
+ new_cfsrq->min_vruntime;
__set_task_cpu(p, new_cpu);
}
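The final adjustment in this hunk rebases the task's vruntime from the old runqueue's min_vruntime to the new one's, so a migrated task keeps the same relative lag on the destination CPU. A small worked sketch with arbitrary numbers (not part of the patch):

#include <stdio.h>

typedef unsigned long long u64;

/* Illustrative only: rebase vruntime across runqueues as in
 * set_task_cpu() above; all numbers are made up. */
int main(void)
{
        u64 old_min = 1000000, new_min = 2000000;
        u64 vruntime = 1000500;         /* 500 ns past the old queue's min_vruntime */

        vruntime -= old_min - new_min;  /* same expression as in the patch */
        printf("vruntime on the new cpu: %llu (min_vruntime + %llu)\n",
               vruntime, vruntime - new_min);
        return 0;
}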
@@ -1078,69 +1106,71 @@ void wait_task_inactive(struct task_struct *p)
int running, on_rq;
struct rq *rq;
-repeat:
- /*
- * We do the initial early heuristics without holding
- * any task-queue locks at all. We'll only try to get
- * the runqueue lock when things look like they will
- * work out!
- */
- rq = task_rq(p);
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
- /*
- * If the task is actively running on another CPU
- * still, just relax and busy-wait without holding
- * any locks.
- *
- * NOTE! Since we don't hold any locks, it's not
- * even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_running()" will
- * return false if the runqueue has changed and p
- * is actually now running somewhere else!
- */
- while (task_running(rq, p))
- cpu_relax();
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_running()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_running(rq, p))
+ cpu_relax();
- /*
- * Ok, time to look more closely! We need the rq
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- rq = task_rq_lock(p, &flags);
- running = task_running(rq, p);
- on_rq = p->se.on_rq;
- task_rq_unlock(rq, &flags);
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &flags);
+ running = task_running(rq, p);
+ on_rq = p->se.on_rq;
+ task_rq_unlock(rq, &flags);
- /*
- * Was it really running after all now that we
- * checked with the proper locks actually held?
- *
- * Oops. Go back and try again..
- */
- if (unlikely(running)) {
- cpu_relax();
- goto repeat;
- }
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
- /*
- * It's not enough that it's not actively running,
- * it must be off the runqueue _entirely_, and not
- * preempted!
- *
- * So if it wa still runnable (but just not actively
- * running right now), it's preempted, and we should
- * yield - it could be a while.
- */
- if (unlikely(on_rq)) {
- yield();
- goto repeat;
- }
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it wa still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(on_rq)) {
+ schedule_timeout_uninterruptible(1);
+ continue;
+ }
- /*
- * Ahh, all good. It wasn't running, and it wasn't
- * runnable, which means that it will never become
- * running in the future either. We're all done!
- */
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
}
/***
@@ -1174,7 +1204,7 @@ void kick_process(struct task_struct *p)
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
-static inline unsigned long source_load(int cpu, int type)
+static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
@@ -1189,7 +1219,7 @@ static inline unsigned long source_load(int cpu, int type)
* Return a high guess at the load of a migration-target cpu weighted
* according to the scheduling class and "nice" value.
*/
-static inline unsigned long target_load(int cpu, int type)
+static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
@@ -1231,7 +1261,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
/* Skip over this group if it has no CPUs allowed */
if (!cpus_intersects(group->cpumask, p->cpus_allowed))
- goto nextgroup;
+ continue;
local_group = cpu_isset(this_cpu, group->cpumask);
@@ -1259,9 +1289,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
min_load = avg_load;
idlest = group;
}
-nextgroup:
- group = group->next;
- } while (group != sd->groups);
+ } while (group = group->next, group != sd->groups);
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
@@ -1393,8 +1421,13 @@ static int wake_idle(int cpu, struct task_struct *p)
if (sd->flags & SD_WAKE_IDLE) {
cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
- if (idle_cpu(i))
+ if (idle_cpu(i)) {
+ if (i != task_cpu(p)) {
+ schedstat_inc(p,
+ se.nr_wakeups_idle);
+ }
return i;
+ }
}
} else {
break;
@@ -1425,7 +1458,7 @@ static inline int wake_idle(int cpu, struct task_struct *p)
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
- int cpu, this_cpu, success = 0;
+ int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
struct rq *rq;
@@ -1444,6 +1477,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
goto out_running;
cpu = task_cpu(p);
+ orig_cpu = cpu;
this_cpu = smp_processor_id();
#ifdef CONFIG_SMP
@@ -1452,7 +1486,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
new_cpu = cpu;
- schedstat_inc(rq, ttwu_cnt);
+ schedstat_inc(rq, ttwu_count);
if (cpu == this_cpu) {
schedstat_inc(rq, ttwu_local);
goto out_set_cpu;
@@ -1487,6 +1521,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
unsigned long tl = this_load;
unsigned long tl_per_task;
+ /*
+ * Attract cache-cold tasks on sync wakeups:
+ */
+ if (sync && !task_hot(p, rq->clock, this_sd))
+ goto out_set_cpu;
+
+ schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
/*
@@ -1506,6 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
* there is no bad imbalance.
*/
schedstat_inc(this_sd, ttwu_move_affine);
+ schedstat_inc(p, se.nr_wakeups_affine);
goto out_set_cpu;
}
}
@@ -1517,6 +1559,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
if (this_sd->flags & SD_WAKE_BALANCE) {
if (imbalance*this_load <= 100*load) {
schedstat_inc(this_sd, ttwu_move_balance);
+ schedstat_inc(p, se.nr_wakeups_passive);
goto out_set_cpu;
}
}
@@ -1542,18 +1585,18 @@ out_set_cpu:
out_activate:
#endif /* CONFIG_SMP */
+ schedstat_inc(p, se.nr_wakeups);
+ if (sync)
+ schedstat_inc(p, se.nr_wakeups_sync);
+ if (orig_cpu != cpu)
+ schedstat_inc(p, se.nr_wakeups_migrate);
+ if (cpu == this_cpu)
+ schedstat_inc(p, se.nr_wakeups_local);
+ else
+ schedstat_inc(p, se.nr_wakeups_remote);
update_rq_clock(rq);
activate_task(rq, p, 1);
- /*
- * Sync wakeups (i.e. those types of wakeups where the waker
- * has indicated that it will leave the CPU in short order)
- * don't trigger a preemption, if the woken up task will run on
- * this cpu. (in this case the 'I will reschedule' promise of
- * the waker guarantees that the freshly woken up task is going
- * to be considered on this CPU.)
- */
- if (!sync || cpu != this_cpu)
- check_preempt_curr(rq, p);
+ check_preempt_curr(rq, p);
success = 1;
out_running:
@@ -1584,28 +1627,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
*/
static void __sched_fork(struct task_struct *p)
{
- p->se.wait_start_fair = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
- p->se.delta_exec = 0;
- p->se.delta_fair_run = 0;
- p->se.delta_fair_sleep = 0;
- p->se.wait_runtime = 0;
- p->se.sleep_start_fair = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
- p->se.sum_wait_runtime = 0;
p->se.sum_sleep_runtime = 0;
p->se.sleep_start = 0;
p->se.block_start = 0;
p->se.sleep_max = 0;
p->se.block_max = 0;
p->se.exec_max = 0;
+ p->se.slice_max = 0;
p->se.wait_max = 0;
- p->se.wait_runtime_overruns = 0;
- p->se.wait_runtime_underruns = 0;
#endif
INIT_LIST_HEAD(&p->run_list);
@@ -1636,12 +1671,14 @@ void sched_fork(struct task_struct *p, int clone_flags)
#ifdef CONFIG_SMP
cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
- __set_task_cpu(p, cpu);
+ set_task_cpu(p, cpu);
/*
* Make sure we do not leak PI boosting priority to the child:
*/
p->prio = current->normal_prio;
+ if (!rt_prio(p->prio))
+ p->sched_class = &fair_sched_class;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
@@ -1658,12 +1695,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
}
/*
- * After fork, child runs first. (default) If set to 0 then
- * parent will (try to) run first.
- */
-unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
-
-/*
* wake_up_new_task - wake up a newly created task for the first time.
*
* This function will do some initial scheduler statistics housekeeping
@@ -1674,24 +1705,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
unsigned long flags;
struct rq *rq;
- int this_cpu;
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
- this_cpu = smp_processor_id(); /* parent's CPU */
update_rq_clock(rq);
p->prio = effective_prio(p);
- if (rt_prio(p->prio))
- p->sched_class = &rt_sched_class;
- else
- p->sched_class = &fair_sched_class;
-
- if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
- (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
- !current->se.on_rq) {
-
+ if (!p->sched_class->task_new || !current->se.on_rq || !rq->cfs.curr) {
activate_task(rq, p, 0);
} else {
/*
@@ -1800,7 +1821,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* with the lock held can cause deadlocks; see schedule() for
* details.)
*/
-static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static void finish_task_switch(struct rq *rq, struct task_struct *prev)
__releases(rq->lock)
{
struct mm_struct *mm = rq->prev_mm;
@@ -1982,42 +2003,10 @@ unsigned long nr_active(void)
*/
static void update_cpu_load(struct rq *this_rq)
{
- u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
- unsigned long total_load = this_rq->ls.load.weight;
- unsigned long this_load = total_load;
- struct load_stat *ls = &this_rq->ls;
+ unsigned long this_load = this_rq->load.weight;
int i, scale;
this_rq->nr_load_updates++;
- if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
- goto do_avg;
-
- /* Update delta_fair/delta_exec fields first */
- update_curr_load(this_rq);
-
- fair_delta64 = ls->delta_fair + 1;
- ls->delta_fair = 0;
-
- exec_delta64 = ls->delta_exec + 1;
- ls->delta_exec = 0;
-
- sample_interval64 = this_rq->clock - ls->load_update_last;
- ls->load_update_last = this_rq->clock;
-
- if ((s64)sample_interval64 < (s64)TICK_NSEC)
- sample_interval64 = TICK_NSEC;
-
- if (exec_delta64 > sample_interval64)
- exec_delta64 = sample_interval64;
-
- idle_delta64 = sample_interval64 - exec_delta64;
-
- tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
- tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
-
- this_load = (unsigned long)tmp64;
-
-do_avg:
/* Update our load: */
for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2027,7 +2016,13 @@ do_avg:
old_load = this_rq->cpu_load[i];
new_load = this_load;
-
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale-1;
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
}
}
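The round-up added above matters because the averaging is done with integer shifts: for the longer-term load indices, a steady load of 10 otherwise averages to 9 forever. A standalone sketch of the comment's 9-vs-10 case (illustrative only, not part of the patch):

#include <stdio.h>

/* Illustrative only: one cpu_load averaging step for i = 1
 * (scale = 2), old_load = 9, steady new_load = 10. */
int main(void)
{
        unsigned long old_load = 9, new_load = 10, scale = 2;
        int i = 1;
        unsigned long without, with;

        without = (old_load * (scale - 1) + new_load) >> i;
        with = (old_load * (scale - 1) +
                (new_load > old_load ? new_load + scale - 1 : new_load)) >> i;
        printf("without round-up: %lu, with round-up: %lu\n", without, with);
        return 0;
}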
@@ -2179,13 +2174,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
- if (!cpu_isset(this_cpu, p->cpus_allowed))
+ if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+ schedstat_inc(p, se.nr_failed_migrations_affine);
return 0;
+ }
*all_pinned = 0;
- if (task_running(rq, p))
+ if (task_running(rq, p)) {
+ schedstat_inc(p, se.nr_failed_migrations_running);
return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) task is cache cold, or
+ * 2) too many balance attempts have failed.
+ */
+
+ if (!task_hot(p, rq->clock, sd) ||
+ sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (task_hot(p, rq->clock, sd)) {
+ schedsta