author    Linus Torvalds <torvalds@linux-foundation.org>  2013-04-30 07:43:28 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-04-30 07:43:28 -0700
commit    16fa94b532b1958f508e07eca1a9256351241fbc
tree      90012a7b7fe2b8cf96f6f5ec12490e0c5e152291 /kernel/sched
parent    e0972916e8fe943f342b0dd1c9d43dbf5bc261c2
parent    25f55d9d01ad7a7ad248fd5af1d22675ffd202c5
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The main changes in this development cycle were:

   - full dynticks preparatory work by Frederic Weisbecker

   - factor out the cpu time accounting code better, by Li Zefan

   - multi-CPU load balancer cleanups and improvements by Joonsoo Kim

   - various smaller fixes and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
  sched: Fix init NOHZ_IDLE flag
  sched: Prevent to re-select dst-cpu in load_balance()
  sched: Rename load_balance_tmpmask to load_balance_mask
  sched: Move up affinity check to mitigate useless redoing overhead
  sched: Don't consider other cpus in our group in case of NEWLY_IDLE
  sched: Explicitly cpu_idle_type checking in rebalance_domains()
  sched: Change position of resched_cpu() in load_balance()
  sched: Fix wrong rq's runnable_avg update with rt tasks
  sched: Document task_struct::personality field
  sched/cpuacct/UML: Fix header file dependency bug on the UML build
  cgroup: Kill subsys.active flag
  sched/cpuacct: No need to check subsys active state
  sched/cpuacct: Initialize cpuacct subsystem earlier
  sched/cpuacct: Initialize root cpuacct earlier
  sched/cpuacct: Allocate per_cpu cpuusage for root cpuacct statically
  sched/cpuacct: Clean up cpuacct.h
  sched/cpuacct: Remove redundant NULL checks in cpuacct_acount_field()
  sched/cpuacct: Remove redundant NULL checks in cpuacct_charge()
  sched/cpuacct: Add cpuacct_acount_field()
  sched/cpuacct: Add cpuacct_init()
  ...
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile    |   1
-rw-r--r--  kernel/sched/core.c      | 254
-rw-r--r--  kernel/sched/cpuacct.c   | 296
-rw-r--r--  kernel/sched/cpuacct.h   |  17
-rw-r--r--  kernel/sched/cputime.c   | 214
-rw-r--r--  kernel/sched/fair.c      | 148
-rw-r--r--  kernel/sched/idle_task.c |  16
-rw-r--r--  kernel/sched/sched.h     | 219
8 files changed, 718 insertions(+), 447 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8285eb0cde6..ebdb19541218 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1288,8 +1288,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
+ trace_sched_wakeup(p, true);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
@@ -3039,11 +3039,13 @@ EXPORT_SYMBOL(preempt_schedule);
asmlinkage void __sched preempt_schedule_irq(void)
{
struct thread_info *ti = current_thread_info();
+ enum ctx_state prev_state;
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
- user_exit();
+ prev_state = exception_enter();
+
do {
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
@@ -3057,6 +3059,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
*/
barrier();
} while (need_resched());
+
+ exception_exit(prev_state);
}
#endif /* CONFIG_PREEMPT */
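The hunk above swaps the one-way user_exit() call for a balanced pair: exception_enter() saves the previous context-tracking state and exception_exit() restores it after the preemption loop, so preempt_schedule_irq() no longer assumes it was entered from user context. A minimal sketch of that save/restore idiom, using illustrative names rather than the kernel's context-tracking API:

enum ctx_state { CTX_USER, CTX_KERNEL };

static enum ctx_state cur_state = CTX_KERNEL;

/* Save the current state, then switch to kernel context. */
static enum ctx_state example_enter(void)
{
	enum ctx_state prev = cur_state;

	cur_state = CTX_KERNEL;
	return prev;
}

/* Restore whatever state the caller saw on entry, not a fixed one. */
static void example_exit(enum ctx_state prev)
{
	cur_state = prev;
}

void example_preempt_path(void)
{
	enum ctx_state prev = example_enter();

	/* ...the reschedule loop would run here... */

	example_exit(prev);
}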
@@ -6204,7 +6208,7 @@ static void sched_init_numa(void)
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
*
- * The sched_domains_nume_distance[] array includes the actual distance
+ * The sched_domains_numa_distance[] array includes the actual distance
* numbers.
*/
@@ -6817,11 +6821,15 @@ int in_sched_functions(unsigned long addr)
}
#ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in the system belongs to this group at bootup.
+ */
struct task_group root_task_group;
LIST_HEAD(task_groups);
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
void __init sched_init(void)
{
@@ -6858,7 +6866,7 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
for_each_possible_cpu(i) {
- per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+ per_cpu(load_balance_mask, i) = (void *)ptr;
ptr += cpumask_size();
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6884,12 +6892,6 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
- root_cpuacct.cpustat = &kernel_cpustat;
- root_cpuacct.cpuusage = alloc_percpu(u64);
- /* Too early, not expected to fail */
- BUG_ON(!root_cpuacct.cpuusage);
-#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -7411,7 +7413,7 @@ unlock:
return err;
}
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
u64 rt_runtime, rt_period;
@@ -7423,7 +7425,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_runtime(struct task_group *tg)
+static long sched_group_rt_runtime(struct task_group *tg)
{
u64 rt_runtime_us;
@@ -7435,7 +7437,7 @@ long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
{
u64 rt_runtime, rt_period;
@@ -7448,7 +7450,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_period(struct task_group *tg)
+static long sched_group_rt_period(struct task_group *tg)
{
u64 rt_period_us;
@@ -7483,7 +7485,7 @@ static int sched_rt_global_constraints(void)
return ret;
}
-int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept realtime tasks when there is no way for them to run */
if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -7991,226 +7993,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-/*
- * CPU accounting code for task groups.
- *
- * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
- * (balbir@in.ibm.com).
- */
-
-struct cpuacct root_cpuacct;
-
-/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
-{
- struct cpuacct *ca;
-
- if (!cgrp->parent)
- return &root_cpuacct.css;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca)
- goto out;
-
- ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage)
- goto out_free_ca;
-
- ca->cpustat = alloc_percpu(struct kernel_cpustat);
- if (!ca->cpustat)
- goto out_free_cpuusage;
-
- return &ca->css;
-
-out_free_cpuusage:
- free_percpu(ca->cpuusage);
-out_free_ca:
- kfree(ca);
-out:
- return ERR_PTR(-ENOMEM);
-}
-
-/* destroy an existing cpu accounting group */
-static void cpuacct_css_free(struct cgroup *cgrp)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
-
- free_percpu(ca->cpustat);
- free_percpu(ca->cpuusage);
- kfree(ca);
-}
-
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- u64 data;
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- data = *cpuusage;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- data = *cpuusage;
-#endif
-
- return data;
-}
-
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit write safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- *cpuusage = val;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- *cpuusage = val;
-#endif
-}
-
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- u64 totalcpuusage = 0;
- int i;
-
- for_each_present_cpu(i)
- totalcpuusage += cpuacct_cpuusage_read(ca, i);
-
- return totalcpuusage;
-}
-
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
- u64 reset)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int err = 0;
- int i;
-
- if (reset) {
- err = -EINVAL;
- goto out;
- }
-
- for_each_present_cpu(i)
- cpuacct_cpuusage_write(ca, i, 0);
-
-out:
- return err;
-}
-
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
-{
- struct cpuacct *ca = cgroup_ca(cgroup);
- u64 percpu;
- int i;
-
- for_each_present_cpu(i) {
- percpu = cpuacct_cpuusage_read(ca, i);
- seq_printf(m, "%llu ", (unsigned long long) percpu);
- }
- seq_printf(m, "\n");
- return 0;
-}
-
-static const char *cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int cpu;
- s64 val = 0;
-
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
-
- val = 0;
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
- }
-
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
-
- return 0;
-}
-
-static struct cftype files[] = {
- {
- .name = "usage",
- .read_u64 = cpuusage_read,
- .write_u64 = cpuusage_write,
- },
- {
- .name = "usage_percpu",
- .read_seq_string = cpuacct_percpu_seq_read,
- },
- {
- .name = "stat",
- .read_map = cpuacct_stats_show,
- },
- { } /* terminate */
-};
-
-/*
- * charge this task's execution time to its accounting group.
- *
- * called with rq->lock held.
- */
-void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
- struct cpuacct *ca;
- int cpu;
-
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- cpu = task_cpu(tsk);
-
- rcu_read_lock();
-
- ca = task_ca(tsk);
-
- for (; ca; ca = parent_ca(ca)) {
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- *cpuusage += cputime;
- }
-
- rcu_read_unlock();
-}
-
-struct cgroup_subsys cpuacct_subsys = {
- .name = "cpuacct",
- .css_alloc = cpuacct_css_alloc,
- .css_free = cpuacct_css_free,
- .subsys_id = cpuacct_subsys_id,
- .base_cftypes = files,
-};
-#endif /* CONFIG_CGROUP_CPUACCT */
-
void dump_cpu_task(int cpu)
{
pr_info("Task dump for CPU %d:\n", cpu);
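One detail worth noting before this code reappears in cpuacct.c below: cpuacct_cpuusage_read()/write() take rq->lock on !CONFIG_64BIT because a 32-bit CPU accesses a u64 in two instructions, so an unlocked reader can observe half of a concurrent update. A user-space sketch of the same guard, with a pthread mutex standing in for rq->lock:

#include <pthread.h>
#include <stdint.h>

static uint64_t counter;
static pthread_mutex_t counter_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * On a 32-bit target a bare load of `counter` compiles to two 32-bit
 * loads and can tear against a concurrent writer; serializing against
 * the writer makes the 64-bit read effectively atomic.
 */
uint64_t counter_read(void)
{
	uint64_t val;

	pthread_mutex_lock(&counter_lock);
	val = counter;
	pthread_mutex_unlock(&counter_lock);
	return val;
}

void counter_add(uint64_t delta)
{
	pthread_mutex_lock(&counter_lock);
	counter += delta;
	pthread_mutex_unlock(&counter_lock);
}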
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/cpumask.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel_stat.h>
+#include <linux/err.h>
+
+#include "sched.h"
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every cpu */
+ u64 __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
+{
+ return cgroup_ca(ca->css.cgroup->parent);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ if (!ca->css.cgroup->parent)
+ return NULL;
+ return cgroup_ca(ca->css.cgroup->parent);
+}
+
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static struct cpuacct root_cpuacct = {
+ .cpustat = &kernel_cpustat,
+ .cpuusage = &root_cpuacct_cpuusage,
+};
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
+{
+ struct cpuacct *ca;
+
+ if (!cgrp->parent)
+ return &root_cpuacct.css;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ goto out;
+
+ ca->cpuusage = alloc_percpu(u64);
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;
+
+ return &ca->css;
+
+out_free_cpuusage:
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+/* destroy an existing cpu accounting group */
+static void cpuacct_css_free(struct cgroup *cgrp)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+
+ free_percpu(ca->cpustat);
+ free_percpu(ca->cpuusage);
+ kfree(ca);
+}
+
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ data = *cpuusage;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ data = *cpuusage;
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ *cpuusage = val;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ *cpuusage = val;
+#endif
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ u64 totalcpuusage = 0;
+ int i;
+
+ for_each_present_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i);
+
+ return totalcpuusage;
+}
+
+static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
+ u64 reset)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int err = 0;
+ int i;
+
+ if (reset) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ for_each_present_cpu(i)
+ cpuacct_cpuusage_write(ca, i, 0);
+
+out:
+ return err;
+}
+
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct cpuacct *ca = cgroup_ca(cgroup);
+ u64 percpu;
+ int i;
+
+ for_each_present_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int cpu;
+ s64 val = 0;
+
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_USER];
+ val += kcpustat->cpustat[CPUTIME_NICE];
+ }
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+ val = 0;
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ val += kcpustat->cpustat[CPUTIME_IRQ];
+ val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ }
+
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "usage",
+ .read_u64 = cpuusage_read,
+ .write_u64 = cpuusage_write,
+ },
+ {
+ .name = "usage_percpu",
+ .read_seq_string = cpuacct_percpu_seq_read,
+ },
+ {
+ .name = "stat",
+ .read_map = cpuacct_stats_show,
+ },
+ { } /* terminate */
+};
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+ struct cpuacct *ca;
+ int cpu;
+
+ cpu = task_cpu(tsk);
+
+ rcu_read_lock();
+
+ ca = task_ca(tsk);
+
+ while (true) {
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ *cpuusage += cputime;
+
+ ca = parent_ca(ca);
+ if (!ca)
+ break;
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Add user/system time to cpuacct.
+ *
+ * Note: it's the caller that updates the account of the root cgroup.
+ */
+void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca != &root_cpuacct) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += val;
+ ca = __parent_ca(ca);
+ }
+ rcu_read_unlock();
+}
+
+struct cgroup_subsys cpuacct_subsys = {
+ .name = "cpuacct",
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
+ .subsys_id = cpuacct_subsys_id,
+ .base_cftypes = files,
+ .early_init = 1,
+};
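cpuacct_charge() and cpuacct_account_field() above both walk from the task's group toward the root, adding the sample at every level so that an ancestor's usage always includes its children's. The upward walk, reduced to a self-contained sketch without RCU or per-cpu storage:

#include <stdio.h>

struct group {
	const char *name;
	struct group *parent;		/* NULL for the root */
	unsigned long long usage;
};

/* Charge `delta` to `g` and every ancestor, as cpuacct_charge() does. */
static void charge(struct group *g, unsigned long long delta)
{
	for (; g; g = g->parent)
		g->usage += delta;
}

int main(void)
{
	struct group root = { "root", NULL,  0 };
	struct group mid  = { "mid",  &root, 0 };
	struct group leaf = { "leaf", &mid,  0 };

	charge(&leaf, 1000);		/* all three levels see the charge */
	printf("%llu %llu %llu\n", leaf.usage, mid.usage, root.usage);
	return 0;
}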
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
+#ifdef CONFIG_CGROUP_CPUACCT
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+
+#else
+
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+}
+
+static inline void
+cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+}
+
+#endif
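The header follows the usual kernel pattern for optional features: with CONFIG_CGROUP_CPUACCT disabled, the hooks collapse to empty static inlines, so callers such as task_group_account_field() need no #ifdef of their own and the compiler drops the calls entirely. The idiom in miniature (illustrative names, not a real kernel header):

#ifdef CONFIG_FEATURE
extern void feature_hook(int arg);
#else
static inline void feature_hook(int arg) { }	/* compiles to nothing */
#endif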
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e93cca92f38b..ea32f02bf2c3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
static inline void task_group_account_field(struct task_struct *p, int index,
u64 tmp)
{
-#ifdef CONFIG_CGROUP_CPUACCT
- struct kernel_cpustat *kcpustat;
- struct cpuacct *ca;
-#endif
/*
* Since all updates are sure to touch the root cgroup, we
* get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-#ifdef CONFIG_CGROUP_CPUACCT
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- rcu_read_lock();
- ca = task_ca(p);
- while (ca && (ca != &root_cpuacct)) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += tmp;
- ca = parent_ca(ca);
- }
- rcu_read_unlock();
-#endif
+ cpuacct_account_field(p, index, tmp);
}
/*
@@ -388,82 +372,10 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
struct rq *rq) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
- * @user_tick: indicates if the tick is a user or a system tick
- */
-void account_process_tick(struct task_struct *p, int user_tick)
-{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- struct rq *rq = this_rq();
-
- if (vtime_accounting_enabled())
- return;
-
- if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
- return;
- }
-
- if (steal_account_process_tick())
- return;
-
- if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
- else
- account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-
- if (sched_clock_irqtime) {
- irqtime_account_idle_ticks(ticks);
- return;
- }
-
- account_idle_time(jiffies_to_cputime(ticks));
-}
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
/*
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_task_switch(struct task_struct *prev)
@@ -518,21 +430,111 @@ void vtime_account_irq_enter(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
-static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
{
- u64 temp = (__force u64) rtime;
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ struct rq *rq = this_rq();
- temp *= (__force u64) stime;
+ if (vtime_accounting_enabled())
+ return;
+
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq);
+ return;
+ }
+
+ if (steal_account_process_tick())
+ return;
- if (sizeof(cputime_t) == 4)
- temp = div_u64(temp, (__force u32) total);
+ if (user_tick)
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
+ one_jiffy_scaled);
else
- temp = div64_u64(temp, (__force u64) total);
+ account_idle_time(cputime_one_jiffy);
+}
- return (__force cputime_t) temp;
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+ account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
+ account_idle_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Perform (stime * rtime) / total with reduced chances
+ * of multiplication overflows by using smaller factors
+ * like quotient and remainders of divisions between
+ * rtime and total.
+ */
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
+{
+ u64 rem, res, scaled;
+
+ if (rtime >= total) {
+ /*
+ * Scale up to rtime / total then add
+ * the remainder scaled to stime / total.
+ */
+ res = div64_u64_rem(rtime, total, &rem);
+ scaled = stime * res;
+ scaled += div64_u64(stime * rem, total);
+ } else {
+ /*
+ * Same in reverse: scale down to total / rtime
+ * then subtract that result scaled
+ * to the remaining part.
+ */
+ res = div64_u64_rem(total, rtime, &rem);
+ scaled = div64_u64(stime, res);
+ scaled -= div64_u64(scaled * rem, total);
+ }
+
+ return (__force cputime_t) scaled;
}
/*
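scale_stime() computes stime * rtime / total while staying within 64 bits: the larger of rtime and total is split into quotient and remainder so no full 64x64 multiply is needed (the second branch trades some precision for that, as its comment notes). A user-space sketch of the same computation, with plain / and % standing in for div64_u64_rem() and div64_u64(); callers must guarantee rtime and total are non-zero:

#include <stdio.h>
#include <stdint.h>

static uint64_t scale(uint64_t stime, uint64_t rtime, uint64_t total)
{
	uint64_t q, r, scaled;

	if (rtime >= total) {
		/* scale up by rtime / total, then add the remainder part */
		q = rtime / total;
		r = rtime % total;
		scaled = stime * q + stime * r / total;
	} else {
		/* scale down by total / rtime, then correct by the remainder */
		q = total / rtime;
		r = total % rtime;
		scaled = stime / q;
		scaled -= scaled * r / total;
	}
	return scaled;
}

int main(void)
{
	/* exact result is 3000 * 7000 / 10000 = 2100 */
	printf("%llu\n", (unsigned long long)scale(3000, 7000, 10000));
	return 0;
}

The non-zero precondition is exactly what cputime_adjust() below establishes: it only calls scale_stime() when both rtime and total are non-zero, yielding stime = 0 when rtime == 0 and stime = rtime when total == 0.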
@@ -545,6 +547,12 @@ static void cputime_adjust(struct task_cputime *curr,
{
cputime_t rtime, stime, total;
+ if (vtime_accounting_enabled()) {
+ *ut = curr->utime;
+ *st = curr->stime;
+ return;
+ }
+
stime = curr->stime;
total = stime + curr->utime;
@@ -560,10 +568,14 @@ static void cputime_adjust(struct task_cputime *curr,
*/
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
- if (total)
- stime = scale_stime(stime, rtime, total);
- else
+ if (!rtime) {
+ stime = 0;
+ } else if (!total) {
stime = rtime;
+ } else {
+ stime = scale_stime((__force u64)stime,
+ (__force u64)rtime, (__force u64)total);
+ }
/*
* If the tick based count grows faster than the scheduler one,
@@ -597,7 +609,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
thread_group_cputime(p, &cputime);
cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static unsigned long long vtime_delta(struct task_struct *tsk)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc5..8bf7081b1ec5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
+static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
+ s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
- min_vruntime = vruntime;
+ max_vruntime = vruntime;
- return min_vruntime;
+ return max_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
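The rename above (min_vruntime to max_vruntime inside max_vruntime()) is cosmetic, but the signed-delta comparison it clarifies is not: casting the u64 difference to s64 keeps the maximum correct even after vruntime wraps around, as long as the two values stay within 2^63 of each other. A compact demonstration:

#include <stdio.h>
#include <stdint.h>

/* Wrap-safe max: b is "after" a iff (int64_t)(b - a) > 0. */
static uint64_t max_vruntime(uint64_t max_vr, uint64_t vr)
{
	int64_t delta = (int64_t)(vr - max_vr);

	if (delta > 0)
		max_vr = vr;
	return max_vr;
}

int main(void)
{
	uint64_t before_wrap = UINT64_MAX - 5;
	uint64_t after_wrap = 10;	/* numerically smaller, logically later */

	/* prints 10: the wrapped value wins, where a plain > would not */
	printf("%llu\n",
	       (unsigned long long)max_vruntime(before_wrap, after_wrap));
	return 0;
}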
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
vruntime = min_vruntime(vruntime, se->vruntime);
}
+ /* ensure we never gain time by being placed backwards. */
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
/*
- * We calculate the vruntime slice of a to be inserted task
+ * We calculate the vruntime slice of a to-be-inserted task.
*
* vs = s/w
*/
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. If the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
+
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) running (obviously), or
+ * 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
- * 3) are cache-hot on their current CPU.
+ * 3) running (obviously), or
+ * 4) are cache-hot on their current CPU.
*/
+ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
- int new_dst_cpu;
+ int cpu;
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
return 0;
- new_dst_cpu = cpumask_first_and(env->dst_grpmask,
- tsk_cpus_allowed(p));
- if (new_dst_cpu < nr_cpu_ids) {
- env->flags |= LBF_SOME_PINNED;
- env->new_dst_cpu = new_dst_cpu;
+ /* Prevent re-selecting dst_cpu via env's cpus */
+ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ env->flags |= LBF_SOME_PINNED;
+ env->new_dst_cpu = cpu;
+ break;
+ }
}
+
return 0;
}
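The fix above narrows the fallback dst_cpu search to CPUs still present in env->cpus, so a CPU that load_balance() has already tried and cleared cannot be selected again (the "Prevent to re-select dst-cpu" commit in the list above). The selection logic, sketched with plain bitmasks in place of struct cpumask:

#include <stdio.h>

static int pick_new_dst(unsigned long dst_grpmask, unsigned long env_cpus,
			unsigned long task_allowed)
{
	unsigned long candidates = dst_grpmask & env_cpus & task_allowed;
	unsigned int cpu;

	for (cpu = 0; cpu < 8 * sizeof(candidates); cpu++)
		if (candidates & (1UL << cpu))
			return (int)cpu;	/* first eligible CPU */
	return -1;				/* task stays where it is */
}

int main(void)
{
	/* group {0,1,2}; cpu 1 already tried (cleared); task allowed on {1,2} */
	printf("%d\n", pick_new_dst(0x7UL, 0x5UL, 0x6UL));	/* prints 2 */
	return 0;
}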
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-#ifdef CONFIG_SCHEDSTATS
+
if (tsk_cache_hot) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
-#endif
+
return 1;
}
- if (tsk_cache_hot) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
- return 0;
- }
- return 1;
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ return 0;
}
/*
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
struct task_struct *p, *n;
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
- if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
- continue;
-
if (!can_migrate_task(p, env))
continue;
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
break;
}
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ if (!can_migrate_task(p, env))
goto next;