summaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-09-04 09:10:24 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-04 09:10:24 -0700
commitf213a6c84c1b4b396a0713ee33cff0e02ba8235f (patch)
treed5c0893d171dce6b1f1cf259a0e59bf433da0839 /kernel/sched
parent621bee34f6ed12d6d4f8896028333fc2200b4ced (diff)
parentbbdacdfed2f5fa50a2cc9f500a36e05990a0837d (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The main changes in this cycle were: - fix affine wakeups (Peter Zijlstra) - improve CPU onlining (and general bootup) scalability on systems with ridiculous number (thousands) of CPUs (Peter Zijlstra) - sched/numa updates (Rik van Riel) - sched/deadline updates (Byungchul Park) - sched/cpufreq enhancements and related cleanups (Viresh Kumar) - sched/debug enhancements (Xie XiuQi) - various fixes" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (27 commits) sched/debug: Optimize sched_domain sysctl generation sched/topology: Avoid pointless rebuild sched/topology, cpuset: Avoid spurious/wrong domain rebuilds sched/topology: Improve comments sched/topology: Fix memory leak in __sdt_alloc() sched/completion: Document that reinit_completion() must be called after complete_all() sched/autogroup: Fix error reporting printk text in autogroup_create() sched/fair: Fix wake_affine() for !NUMA_BALANCING sched/debug: Intruduce task_state_to_char() helper function sched/debug: Show task state in /proc/sched_debug sched/debug: Use task_pid_nr_ns in /proc/$pid/sched sched/core: Remove unnecessary initialization init_idle_bootup_task() sched/deadline: Change return value of cpudl_find() sched/deadline: Make find_later_rq() choose a closer CPU in topology sched/numa: Scale scan period with tasks in group and shared/private sched/numa: Slow down scan rate if shared faults dominate sched/pelt: Fix false running accounting sched: Mark pick_next_task_dl() and build_sched_domain() as static sched/cpupri: Don't re-initialize 'struct cpupri' sched/deadline: Don't re-initialize 'struct cpudl' ...
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/autogroup.c3
-rw-r--r--kernel/sched/completion.c8
-rw-r--r--kernel/sched/core.c22
-rw-r--r--kernel/sched/cpudeadline.c27
-rw-r--r--kernel/sched/cpupri.c2
-rw-r--r--kernel/sched/deadline.c33
-rw-r--r--kernel/sched/debug.c83
-rw-r--r--kernel/sched/fair.c459
-rw-r--r--kernel/sched/sched.h4
-rw-r--r--kernel/sched/topology.c39
10 files changed, 433 insertions, 247 deletions
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index da39489d2d80..de6d7f4dfcb5 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void)
goto out_fail;
tg = sched_create_group(&root_task_group);
-
if (IS_ERR(tg))
goto out_free;
@@ -101,7 +100,7 @@ out_free:
out_fail:
if (printk_ratelimit()) {
printk(KERN_WARNING "autogroup_create: %s failure.\n",
- ag ? "sched_create_group()" : "kmalloc()");
+ ag ? "sched_create_group()" : "kzalloc()");
}
return autogroup_kref_get(&autogroup_default);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index c9524d2d9316..5d9131aa846f 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -47,6 +47,13 @@ EXPORT_SYMBOL(complete);
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
+ *
+ * Since complete_all() sets the completion of @x permanently to done
+ * to allow multiple waiters to finish, a call to reinit_completion()
+ * must be used on @x if @x is to be used again. The code must make
+ * sure that all waiters have woken and finished before reinitializing
+ * @x. Also note that the function completion_done() can not be used
+ * to know if there are still waiters after complete_all() has been called.
*/
void complete_all(struct completion *x)
{
@@ -297,6 +304,7 @@ EXPORT_SYMBOL(try_wait_for_completion);
* Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
+ * Note, this will always return true if complete_all() was called on @X.
*/
bool completion_done(struct completion *x)
{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e053c31d96da..c1fcd96cf432 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5133,24 +5133,17 @@ out_unlock:
return retval;
}
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned long state = p->state;
-
- /* Make sure the string lines up properly with the number of task states: */
- BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
if (!try_get_task_stack(p))
return;
- if (state)
- state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
- if (state == TASK_RUNNING)
+
+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+ if (p->state == TASK_RUNNING)
printk(KERN_CONT " running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
@@ -5207,11 +5200,6 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void init_idle_bootup_task(struct task_struct *idle)
-{
- idle->sched_class = &idle_sched_class;
-}
-
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
@@ -5468,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
*/
next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
- next->sched_class->put_prev_task(rq, next);
+ put_prev_task(rq, next);
/*
* Rules for changing task_struct::cpus_allowed are holding
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index fba235c7d026..8d9562d890d3 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp)
* @p: the task
* @later_mask: a mask to fill in with the selected CPUs (or NULL)
*
- * Returns: int - best CPU (heap maximum if suitable)
+ * Returns: int - CPUs were found
*/
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask)
{
- int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
- best_cpu = cpumask_any(later_mask);
- goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
- dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
- best_cpu = cpudl_maximum(cp);
- if (later_mask)
- cpumask_set_cpu(best_cpu, later_mask);
- }
+ return 1;
+ } else {
+ int best_cpu = cpudl_maximum(cp);
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
-out:
- WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+ if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
- return best_cpu;
+ return 1;
+ }
+ }
+ return 0;
}
/*
@@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..2511aba36b89 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
-
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 755bd3f1a1a9..d05bd9457a40 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1594,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
return;
/*
@@ -1602,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* see if it is pushed or pulled somewhere else.
*/
if (p->nr_cpus_allowed != 1 &&
- cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+ cpudl_find(&rq->rd->cpudl, p, NULL))
return;
resched_curr(rq);
@@ -1655,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-struct task_struct *
+static struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
@@ -1798,7 +1798,7 @@ static int find_later_rq(struct task_struct *task)
struct sched_domain *sd;
struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
- int best_cpu, cpu = task_cpu(task);
+ int cpu = task_cpu(task);
/* Make sure the mask is initialized first */
if (unlikely(!later_mask))
@@ -1811,17 +1811,14 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
*/
- best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
- task, later_mask);
- if (best_cpu == -1)
+ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
return -1;
/*
- * If we are here, some target has been found,
- * the most suitable of which is cached in best_cpu.
- * This is, among the runqueues where the current tasks
- * have later deadlines than the task's one, the rq
- * with the latest possible one.
+ * If we are here, some targets have been found, including
+ * the most suitable which is, among the runqueues where the
+ * current tasks have later deadlines than the task's one, the
+ * rq with the latest possible one.
*
* Now we check how well this matches with task's
* affinity and system topology.
@@ -1841,6 +1838,7 @@ static int find_later_rq(struct task_struct *task)
rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
/*
* If possible, preempting this_cpu is
@@ -1852,12 +1850,15 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
+ best_cpu = cpumask_first_and(later_mask,
+ sched_domain_span(sd));
/*
- * Last chance: if best_cpu is valid and is
- * in the mask, that becomes our choice.
+ * Last chance: if a cpu being in both later_mask
+ * and current sd span is valid, that becomes our
+ * choice. Of course, the latest possible cpu is
+ * already under consideration through later_mask.
*/
- if (best_cpu < nr_cpu_ids &&
- cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fa66de52bd6..4a23bbc3111b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
return table;
}
+static cpumask_var_t sd_sysctl_cpus;
static struct ctl_table_header *sd_sysctl_header;
+
void register_sched_domain_sysctl(void)
{
- int i, cpu_num = num_possible_cpus();
- struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ static struct ctl_table *cpu_entries;
+ static struct ctl_table **cpu_idx;
char buf[32];
+ int i;
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = entry;
+ if (!cpu_entries) {
+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
+ if (!cpu_entries)
+ return;
- if (entry == NULL)
- return;
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = cpu_entries;
+ }
- for_each_possible_cpu(i) {
- snprintf(buf, 32, "cpu%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_cpu_table(i);
- entry++;
+ if (!cpu_idx) {
+ struct ctl_table *e = cpu_entries;
+
+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
+ if (!cpu_idx)
+ return;
+
+ /* deal with sparse possible map */
+ for_each_possible_cpu(i) {
+ cpu_idx[i] = e;
+ e++;
+ }
+ }
+
+ if (!cpumask_available(sd_sysctl_cpus)) {
+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+ return;
+
+ /* init to possible to not have holes in @cpu_entries */
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ }
+
+ for_each_cpu(i, sd_sysctl_cpus) {
+ struct ctl_table *e = cpu_idx[i];
+
+ if (e->child)
+ sd_free_ctl_entry(&e->child);
+
+ if (!e->procname) {
+ snprintf(buf, 32, "cpu%d", i);
+ e->procname = kstrdup(buf, GFP_KERNEL);
+ }
+ e->mode = 0555;
+ e->child = sd_alloc_ctl_cpu_table(i);
+
+ __cpumask_clear_cpu(i, sd_sysctl_cpus);
}
WARN_ON(sd_sysctl_header);
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
+void dirty_sched_domain_sysctl(int cpu)
+{
+ if (cpumask_available(sd_sysctl_cpus))
+ __cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
/* may be called multiple times per register */
void unregister_sched_domain_sysctl(void)
{
unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
@@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg)
}
#endif
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
if (rq->curr == p)
- SEQ_printf(m, "R");
+ SEQ_printf(m, ">R");
else
- SEQ_printf(m, " ");
+ SEQ_printf(m, " %c", task_state_to_char(p));
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
@@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
- " task PID tree-key switches prio"
+ " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n"
- "------------------------------------------------------"
+ "-------------------------------------------------------"
"----------------------------------------------------\n");
rcu_read_lock();
@@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
#endif
}
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+ struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
get_nr_threads(p));
SEQ_printf(m,
"---------------------------------------------------------"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c95880e216f6..8d5868771cb3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
/*
* For !fair tasks do:
*
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ int active_nodes;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long max_faults_cpu;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan);
}
+static unsigned int task_scan_start(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
+
+ return max(smin, period);
+}
+
static unsigned int task_scan_max(struct task_struct *p)
{
- unsigned int smin = task_scan_min(p);
- unsigned int smax;
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+
+ smax = max(smax, period);
+ }
+
return max(smin, smax);
}
@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
-struct numa_group {
- atomic_t refcount;
-
- spinlock_t lock; /* nr_tasks, tasks */
- int nr_tasks;
- pid_t gid;
- int active_nodes;
-
- struct rcu_head rcu;
- unsigned long total_faults;
- unsigned long max_faults_cpu;
- /*
- * Faults_cpu is used to decide whether memory should move
- * towards the CPU. As a consequence, these stats are weighted
- * more by CPU use than by memory faults.
- */
- unsigned long *faults_cpu;
- unsigned long faults[0];
-};
-
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+
+ return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+ }
+
+ return faults;
+}
+
/*
* A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations
@@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
@@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
struct rq *rq = cpu_rq(cpu);
ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
+ ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
cpus++;
@@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
* Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed.
*/
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
@@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
{
unsigned int period_slot;
- int ratio;
+ int lr_ratio, ps_ratio;
int diff;
unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p,
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
- ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
- if (ratio >= NUMA_PERIOD_THRESHOLD) {
- int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast NUMA scanning, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast NUMA scanning,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
- diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
/*
- * Scale scan rate increases based on sharing. There is an
- * inverse relationship between the degree of sharing and
- * the adjustment made to the scanning period. Broadly
- * speaking the intent is that there is little point
- * scanning faster if shared accesses dominate as it may
- * simply bounce migrations uselessly
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * NUMA scanning to get the memory moved over.
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
- diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ int ratio = max(lr_ratio, ps_ratio);
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
p->numa_scan_period = clamp(p->numa_scan_period + diff,
@@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work)
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
@@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_min(curr);
+ curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
@@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- struct numa_stats prev_load, this_load;
- s64 this_eff_load, prev_eff_load;
-
- update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
- update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- if (this_load.load > current_load)
- this_load.load -= current_load;
- else
- this_load.load = 0;
- }
-
- /*
- * In low-load situations, where this_cpu's node is idle due to the
- * sync cause above having dropped this_load.load to 0, move the task.
- * Moving to an idle socket will not create a bad imbalance.
- *
- * Otherwise check if the nodes are near enough in load to allow this
- * task to be woken on this_cpu's node.
- */
- if (this_load.load > 0) {
- unsigned long task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_load.compute_capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_load.compute_capacity;
-
- this_eff_load *= this_load.load + task_load;
- prev_eff_load *= prev_load.load - task_load;
-
- return this_eff_load <= prev_eff_load;
- }
-
- return true;
-}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- return true;
-}
-#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -2790,6 +2801,29 @@ static inline void update_cfs_shares(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ if (&this_rq()->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_of(cfs_rq), 0);
+ }
+}
+
#ifdef CONFIG_SMP
/*
* Approximate:
@@ -2968,6 +3002,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
sa->last_update_time += delta << 10;
/*
+ * running is a subset of runnable (weight) so running can't be set if
+ * runnable is clear. But there are some corner cases where the current
+ * se has been already dequeued but cfs_rq->curr still points to it.
+ * This means that weight will be 0 but not running for a sched_entity
+ * but also for a cfs_rq if the latter becomes idle. As an example,
+ * this happens during idle_balance() which calls
+ * update_blocked_averages()
+ */
+ if (!weight)
+ running = 0;
+
+ /*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
@@ -3276,29 +3322,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
- if (&this_rq()->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_of(cfs_rq), 0);
- }
-}
-
/*
* Unsigned subtract and clamp on underflow.
*
@@ -3320,7 +3343,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
* @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3334,7 +3356,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* call update_tg_load_avg() when this function returns true.
*/
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed_load = 0, removed_util = 0;
@@ -3362,7 +3384,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (update_freq && (decayed || removed_util))
+ if (decayed || removed_util)
cfs_rq_util_change(cfs_rq);
return decayed || removed_load;
@@ -3390,7 +3412,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cpu, cfs_rq, se);
- decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (decayed && (flags & UPDATE_TG))
@@ -3534,7 +3556,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
@@ -3544,7 +3566,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
static inline void update_load_avg(struct sched_entity *se, int not_used1)
{
- cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+ cfs_rq_util_change(cfs_rq_of(se));
}
static inline void
@@ -5125,9 +5147,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
}
/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+ return cfs_rq_runnable_load_avg(&rq->cfs);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -5172,7 +5194,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
/*
* bail if there's load or we're actually up-to-date.
*/
- if (weighted_cpuload(cpu_of(this_rq)))
+ if (weighted_cpuload(this_rq))
return;
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5193,7 +5215,7 @@ void cpu_load_update_nohz_start(void)
* concurrently we'll exit nohz. And cpu_load write can race with
* cpu_load_update_idle() but both updater would be writing the same.
*/
- this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+ this_rq->cpu_load[0] = weighted_cpuload(this_rq);
}
/*
@@ -5209,7 +5231,7 @@ void cpu_load_update_nohz_stop(void)
if (curr_jiffies == this_rq->last_load_update_tick)
return;
- load = weighted_cpuload(cpu_of(this_rq));
+ load = weighted_cpuload(this_rq);
rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5235,7 +5257,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
*/
void cpu_load_update_active(struct rq *this_rq)
{
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long load = weighted_cpuload(this_rq);
if (tick_nohz_tick_stopped())
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5253,7 +5275,7 @@ void cpu_load_update_active(struct rq *this_rq)
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5268,7 +5290,7 @@ static unsigned long source_load(int cpu, int type)
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5290,7 +5312,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(cpu);
+ unsigned long load_avg = weighted_cpuload(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5345,20 +5367,115 @@ static int wake_wide(struct task_struct *p)
return 1;
}
+struct llc_stats {
+ unsigned long nr_running;
+ unsigned long load;
+ unsigned long capacity;
+ int has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+ struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+ if (!sds)
+ return false;
+
+ stats->nr_running = READ_ONCE(sds->nr_running);