From 9942f79baaaf111d63ebf0862a819278d84fccc4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 17 Oct 2014 03:29:49 -0400 Subject: sched/numa: Export info needed for NUMA balancing on complex topologies Export some information that is necessary to do placement of tasks on systems with multi-level NUMA topologies. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413530994-9732-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- kernel/sched/sched.h | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 240157c13ddc..4007595f87e4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6129,6 +6129,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; static int *sched_domains_numa_distance; +int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; static int sched_domains_curr_level; #endif @@ -6300,7 +6301,7 @@ static void sched_numa_warn(const char *str) printk(KERN_WARNING "\n"); } -static bool find_numa_distance(int distance) +bool find_numa_distance(int distance) { int i; @@ -6447,6 +6448,7 @@ static void sched_init_numa(void) sched_domain_topology = tl; sched_domains_numa_levels = level; + sched_max_numa_distance = sched_domains_numa_distance[level - 1]; } static void sched_domains_numa_masks_set(int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 24156c8434d1..443d6e152a03 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -678,6 +678,11 @@ static inline u64 rq_clock_task(struct rq *rq) return rq->clock_task; } +#ifdef CONFIG_NUMA +extern int sched_max_numa_distance; +extern bool find_numa_distance(int distance); +#endif + #ifdef CONFIG_NUMA_BALANCING extern void 
sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); -- cgit v1.2.3 From e3fe70b1f72e3f83a00d9c332ec09ab347a981e2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 17 Oct 2014 03:29:50 -0400 Subject: sched/numa: Classify the NUMA topology of a system Smaller NUMA systems tend to have all NUMA nodes directly connected to each other. This includes the degenerate case of a system with just one node, ie. a non-NUMA system. Larger systems can have two kinds of NUMA topology, which affects how tasks and memory should be placed on the system. On glueless mesh systems, nodes that are not directly connected to each other will bounce traffic through intermediary nodes. Task groups can be run closer to each other by moving tasks from a node to an intermediary node between it and the task's preferred node. On NUMA systems with backplane controllers, the intermediary hops are incapable of running programs. This creates "islands" of nodes that are at an equal distance to anywhere else in the system. Each kind of topology requires a slightly different placement algorithm; this patch provides the mechanism to detect the kind of NUMA topology of a system. 
Signed-off-by: Rik van Riel Tested-by: Chegu Vinod [ Changed to use kernel/sched/sched.h ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/1413530994-9732-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 6 ++++++ 2 files changed, 59 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4007595f87e4..cde848149dd6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6128,6 +6128,7 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA static int sched_domains_numa_levels; +enum numa_topology_type sched_numa_topology_type; static int *sched_domains_numa_distance; int sched_max_numa_distance; static struct cpumask ***sched_domains_numa_masks; @@ -6316,6 +6317,56 @@ bool find_numa_distance(int distance) return false; } +/* + * A system can have three types of NUMA topology: + * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system + * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes + * NUMA_BACKPLANE: nodes can reach other nodes through a backplane + * + * The difference between a glueless mesh topology and a backplane + * topology lies in whether communication between not directly + * connected nodes goes through intermediary nodes (where programs + * could run), or through backplane controllers. This affects + * placement of programs. + * + * The type of topology can be discerned with the following tests: + * - If the maximum distance between any nodes is 1 hop, the system + * is directly connected. + * - If for two nodes A and B, located N > 1 hops away from each other, + * there is an intermediary node C, which is < N hops away from both + * nodes A and B, the system is a glueless mesh. 
+ */ +static void init_numa_topology_type(void) +{ + int a, b, c, n; + + n = sched_max_numa_distance; + + if (n <= 1) + sched_numa_topology_type = NUMA_DIRECT; + + for_each_online_node(a) { + for_each_online_node(b) { + /* Find two nodes furthest removed from each other. */ + if (node_distance(a, b) < n) + continue; + + /* Is there an intermediary node between a and b? */ + for_each_online_node(c) { + if (node_distance(a, c) < n && + node_distance(b, c) < n) { + sched_numa_topology_type = + NUMA_GLUELESS_MESH; + return; + } + } + + sched_numa_topology_type = NUMA_BACKPLANE; + return; + } + } +} + static void sched_init_numa(void) { int next_distance, curr_distance = node_distance(0, 0); @@ -6449,6 +6500,8 @@ static void sched_init_numa(void) sched_domains_numa_levels = level; sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + + init_numa_topology_type(); } static void sched_domains_numa_masks_set(int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 443d6e152a03..57aacea1cbdf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -679,6 +679,12 @@ static inline u64 rq_clock_task(struct rq *rq) } #ifdef CONFIG_NUMA +enum numa_topology_type { + NUMA_DIRECT, + NUMA_GLUELESS_MESH, + NUMA_BACKPLANE, +}; +extern enum numa_topology_type sched_numa_topology_type; extern int sched_max_numa_distance; extern bool find_numa_distance(int distance); #endif -- cgit v1.2.3 From 7bd953206b0b5e0a3aded871982367410b42e1b1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 17 Oct 2014 03:29:51 -0400 Subject: sched/numa: Prepare for complex topology placement Preparatory patch for adding NUMA placement on systems with complex NUMA topology. 
Also fix a potential divide by zero in group_weight() Signed-off-by: Rik van Riel Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra (Intel) Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413530994-9732-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 57 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 34baa60f8a7b..0af3bed3521d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -931,9 +931,10 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes. */ -static inline unsigned long task_weight(struct task_struct *p, int nid) +static inline unsigned long task_weight(struct task_struct *p, int nid, + int dist) { - unsigned long total_faults; + unsigned long faults, total_faults; if (!p->numa_faults_memory) return 0; @@ -943,15 +944,25 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) if (!total_faults) return 0; - return 1000 * task_faults(p, nid) / total_faults; + faults = task_faults(p, nid); + return 1000 * faults / total_faults; } -static inline unsigned long group_weight(struct task_struct *p, int nid) +static inline unsigned long group_weight(struct task_struct *p, int nid, + int dist) { - if (!p->numa_group || !p->numa_group->total_faults) + unsigned long faults, total_faults; + + if (!p->numa_group) + return 0; + + total_faults = p->numa_group->total_faults; + + if (!total_faults) return 0; - return 1000 * group_faults(p, nid) / p->numa_group->total_faults; + faults = group_faults(p, nid); + return 1000 * faults / total_faults; } bool should_numa_migrate_memory(struct task_struct *p, struct page * page, @@ -1084,6 +1095,7 @@ struct task_numa_env { struct numa_stats 
src_stats, dst_stats; int imbalance_pct; + int dist; struct task_struct *best_task; long best_imp; @@ -1163,6 +1175,7 @@ static void task_numa_compare(struct task_numa_env *env, long load; long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; + int dist = env->dist; rcu_read_lock(); @@ -1196,8 +1209,8 @@ static void task_numa_compare(struct task_numa_env *env, * in any group then look only at task weights. */ if (cur->numa_group == env->p->numa_group) { - imp = taskimp + task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp = taskimp + task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); /* * Add some hysteresis to prevent swapping the * tasks within a group over tiny differences. @@ -1211,11 +1224,11 @@ static void task_numa_compare(struct task_numa_env *env, * instead. */ if (cur->numa_group) - imp += group_weight(cur, env->src_nid) - - group_weight(cur, env->dst_nid); + imp += group_weight(cur, env->src_nid, dist) - + group_weight(cur, env->dst_nid, dist); else - imp += task_weight(cur, env->src_nid) - - task_weight(cur, env->dst_nid); + imp += task_weight(cur, env->src_nid, dist) - + task_weight(cur, env->dst_nid, dist); } } @@ -1314,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p) }; struct sched_domain *sd; unsigned long taskweight, groupweight; - int nid, ret; + int nid, ret, dist; long taskimp, groupimp; /* @@ -1342,12 +1355,13 @@ static int task_numa_migrate(struct task_struct *p) return -EINVAL; } - taskweight = task_weight(p, env.src_nid); - groupweight = group_weight(p, env.src_nid); - update_numa_stats(&env.src_stats, env.src_nid); env.dst_nid = p->numa_preferred_nid; - taskimp = task_weight(p, env.dst_nid) - taskweight; - groupimp = group_weight(p, env.dst_nid) - groupweight; + dist = env.dist = node_distance(env.src_nid, env.dst_nid); + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + update_numa_stats(&env.src_stats, 
env.src_nid); + taskimp = task_weight(p, env.dst_nid, dist) - taskweight; + groupimp = group_weight(p, env.dst_nid, dist) - groupweight; update_numa_stats(&env.dst_stats, env.dst_nid); /* Try to find a spot on the preferred nid. */ @@ -1359,12 +1373,15 @@ static int task_numa_migrate(struct task_struct *p) if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; + dist = node_distance(env.src_nid, env.dst_nid); + /* Only consider nodes where both task and groups benefit */ - taskimp = task_weight(p, nid) - taskweight; - groupimp = group_weight(p, nid) - groupweight; + taskimp = task_weight(p, nid, dist) - taskweight; + groupimp = group_weight(p, nid, dist) - groupweight; if (taskimp < 0 && groupimp < 0) continue; + env.dist = dist; env.dst_nid = nid; update_numa_stats(&env.dst_stats, env.dst_nid); task_numa_find_cpu(&env, taskimp, groupimp); -- cgit v1.2.3 From 6c6b1193e71fed1a58dc3fab9d967d245177f87b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 17 Oct 2014 03:29:52 -0400 Subject: sched/numa: Calculate node scores in complex NUMA topologies In order to do task placement on systems with complex NUMA topologies, it is necessary to count the faults on nodes nearby the node that is being examined for a potential move. In case of a system with a backplane interconnect, we are dealing with groups of NUMA nodes; each of the nodes within a group is the same number of hops away from nodes in other groups in the system. Optimal placement on this topology is achieved by counting all nearby nodes equally. When comparing nodes A and B at distance N, nearby nodes are those at distances smaller than N from nodes A or B. Placement strategy on a system with a glueless mesh NUMA topology needs to be different, because there are no natural groups of nodes determined by the hardware. Instead, when dealing with two nodes A and B at distance N, N >= 2, there will be intermediate nodes at distance < N from both nodes A and B. 
Good placement can be achieved by right shifting the faults on nearby nodes by the number of hops from the node being scored. In this context, a nearby node is any node less than the maximum distance in the system away from the node. Those nodes are skipped for efficiency reasons, there is no real policy reason to do so. Placement policy on directly connected NUMA systems is not affected. Signed-off-by: Rik van Riel Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/1413530994-9732-5-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0af3bed3521d..7e5712a0e61b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -925,6 +925,71 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(nid, 1)]; } +/* Handle placement on systems where not all nodes are directly connected. */ +static unsigned long score_nearby_nodes(struct task_struct *p, int nid, + int maxdist, bool task) +{ + unsigned long score = 0; + int node; + + /* + * All nodes are directly connected, and the same distance + * from each other. No need for fancy placement algorithms. + */ + if (sched_numa_topology_type == NUMA_DIRECT) + return 0; + + /* + * This code is called for each node, introducing N^2 complexity, + * which should be ok given the number of nodes rarely exceeds 8. + */ + for_each_online_node(node) { + unsigned long faults; + int dist = node_distance(nid, node); + + /* + * The furthest away nodes in the system are not interesting + * for placement; nid was already counted. 
+ */ + if (dist == sched_max_numa_distance || node == nid) + continue; + + /* + * On systems with a backplane NUMA topology, compare groups + * of nodes, and move tasks towards the group with the most + * memory accesses. When comparing two nodes at distance + * "hoplimit", only nodes closer by than "hoplimit" are part + * of each group. Skip other nodes. + */ + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist > maxdist) + continue; + + /* Add up the faults from nearby nodes. */ + if (task) + faults = task_faults(p, node); + else + faults = group_faults(p, node); + + /* + * On systems with a glueless mesh NUMA topology, there are + * no fixed "groups of nodes". Instead, nodes that are not + * directly connected bounce traffic through intermediate + * nodes; a numa_group can occupy any set of nodes. + * The further away a node is, the less the faults count. + * This seems to result in good task placement. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + faults *= (sched_max_numa_distance - dist); + faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + } + + score += faults; + } + + return score; +} + /* * These return the fraction of accesses done by a particular task, or * task group, on a particular numa node. 
The group weight is given a @@ -945,6 +1010,8 @@ static inline unsigned long task_weight(struct task_struct *p, int nid, return 0; faults = task_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, true); + return 1000 * faults / total_faults; } @@ -962,6 +1029,8 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, return 0; faults = group_faults(p, nid); + faults += score_nearby_nodes(p, nid, dist, false); + return 1000 * faults / total_faults; } @@ -1374,6 +1443,11 @@ static int task_numa_migrate(struct task_struct *p) continue; dist = node_distance(env.src_nid, env.dst_nid); + if (sched_numa_topology_type == NUMA_BACKPLANE && + dist != env.dist) { + taskweight = task_weight(p, env.src_nid, dist); + groupweight = group_weight(p, env.src_nid, dist); + } /* Only consider nodes where both task and groups benefit */ taskimp = task_weight(p, nid, dist) - taskweight; -- cgit v1.2.3 From 54009416ac3b5f219c0df68559ce534287ae97b1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 17 Oct 2014 03:29:53 -0400 Subject: sched/numa: Find the preferred nid with complex NUMA topology On systems with complex NUMA topologies, the node scoring is adjusted to allow workloads to converge on nodes that are near each other. The way a task group's preferred nid is determined needs to be adjusted, in order for the preferred_nid to be consistent with group_weight scoring. This ensures that we actually try to converge workloads on adjacent nodes. 
Signed-off-by: Rik van Riel Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra (Intel) Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413530994-9732-6-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e5712a0e61b..7760c2ad3162 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1659,6 +1659,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) return delta; } +/* + * Determine the preferred nid for a task in a numa_group. This needs to + * be done in a way that produces consistent results with group_weight, + * otherwise workloads might not converge. + */ +static int preferred_group_nid(struct task_struct *p, int nid) +{ + nodemask_t nodes; + int dist; + + /* Direct connections between all NUMA nodes. */ + if (sched_numa_topology_type == NUMA_DIRECT) + return nid; + + /* + * On a system with glueless mesh NUMA topology, group_weight + * scores nodes according to the number of NUMA hinting faults on + * both the node itself, and on nearby nodes. + */ + if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { + unsigned long score, max_score = 0; + int node, max_node = nid; + + dist = sched_max_numa_distance; + + for_each_online_node(node) { + score = group_weight(p, node, dist); + if (score > max_score) { + max_score = score; + max_node = node; + } + } + return max_node; + } + + /* + * Finding the preferred nid in a system with NUMA backplane + * interconnect topology is more involved. The goal is to locate + * tasks from numa_groups near each other in the system, and + * untangle workloads from different sides of the system. This requires + * searching down the hierarchy of node groups, recursively searching + * inside the highest scoring group of nodes. 
The nodemask tricks + * keep the complexity of the search down. + */ + nodes = node_online_map; + for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { + unsigned long max_faults = 0; + nodemask_t max_group; + int a, b; + + /* Are there nodes at this distance from each other? */ + if (!find_numa_distance(dist)) + continue; + + for_each_node_mask(a, nodes) { + unsigned long faults = 0; + nodemask_t this_group; + nodes_clear(this_group); + + /* Sum group's NUMA faults; includes a==b case. */ + for_each_node_mask(b, nodes) { + if (node_distance(a, b) < dist) { + faults += group_faults(p, b); + node_set(b, this_group); + node_clear(b, nodes); + } + } + + /* Remember the top group. */ + if (faults > max_faults) { + max_faults = faults; + max_group = this_group; + /* + * subtle: at the smallest distance there is + * just one node left in each "group", the + * winner is the preferred nid. + */ + nid = a; + } + } + /* Next round, evaluate the nodes within max_group. */ + nodes = max_group; + } + return nid; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; @@ -1741,7 +1827,7 @@ static void task_numa_placement(struct task_struct *p) if (p->numa_group) { update_numa_active_node_mask(p->numa_group); spin_unlock_irq(group_lock); - max_nid = max_group_nid; + max_nid = preferred_group_nid(p, max_group_nid); } if (max_faults) { -- cgit v1.2.3 From 9de05d48711cd5314920ed05f873d84eaf66ccf1 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 9 Oct 2014 17:27:47 -0400 Subject: sched/numa: Check all nodes when placing a pseudo-interleaved group In pseudo-interleaved numa_groups, all tasks try to relocate to the group's preferred_nid. When a group is spread across multiple NUMA nodes, this can lead to tasks swapping their location with other tasks inside the same group, instead of swapping location with tasks from other NUMA groups. This can keep NUMA groups from converging. 
Examining all nodes, when dealing with a task in a pseudo-interleaved NUMA group, avoids this problem. Note that only CPUs in nodes that improve the task or group score are examined, so the loop isn't too bad. Tested-by: Vinod Chegu Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: "Vinod Chegu" Cc: mgorman@suse.de Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20141009172747.0d97c38c@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7760c2ad3162..ec32c26d7fb6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1436,8 +1436,15 @@ static int task_numa_migrate(struct task_struct *p) /* Try to find a spot on the preferred nid. */ task_numa_find_cpu(&env, taskimp, groupimp); - /* No space available on the preferred nid. Look elsewhere. */ - if (env.best_cpu == -1) { + /* + * Look at other nodes in these cases: + * - there is no space available on the preferred_nid + * - the task is part of a numa_group that is interleaved across + * multiple NUMA nodes; in order to better consolidate the group, + * we need to check other locations. + */ + if (env.best_cpu == -1 || (p->numa_group && + nodes_weight(p->numa_group->active_nodes) > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; -- cgit v1.2.3 From 1a43a14a5bd9c32dbd7af35e35a5afa703944bcb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Oct 2014 21:36:44 +0200 Subject: sched: Fix schedule_tail() to disable preemption finish_task_switch() enables preemption, so post_schedule(rq) can be called on the wrong (and even dead) CPU. Afaics, nothing really bad can happen, but in this case we can wrongly clear rq->post_schedule on that CPU. And this simply looks wrong in any case. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Steven Rostedt Cc: Kirill Tkhai Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20141008193644.GA32055@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cde848149dd6..b4935600cd85 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2309,15 +2309,14 @@ static inline void post_schedule(struct rq *rq) asmlinkage __visible void schedule_tail(struct task_struct *prev) __releases(rq->lock) { - struct rq *rq = this_rq(); + struct rq *rq; + /* finish_task_switch() drops rq->lock and enables preemtion */ + preempt_disable(); + rq = this_rq(); finish_task_switch(rq, prev); - - /* - * FIXME: do we need to worry about rq being invalidated by the - * task_switch? - */ post_schedule(rq); + preempt_enable(); if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); -- cgit v1.2.3 From dfa50b605c2a933b7bb1c1d575a0da4e897e3c7d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 9 Oct 2014 21:32:32 +0200 Subject: sched: Make finish_task_switch() return 'struct rq *' Both callers of finish_task_switch() need to recalculate this_rq() and pass it as an argument, plus __schedule() does this again after context_switch(). It would be simpler to call this_rq() once in finish_task_switch() and return this rq to the callers. Note: probably "int cpu" in __schedule() should die; it is not used and both rcu_note_context_switch() and wq_worker_sleeping() do not really need this argument. 
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Kirill Tkhai Cc: Steven Rostedt Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20141009193232.GB5408@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b4935600cd85..1b69603c1d3e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2220,7 +2220,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, /** * finish_task_switch - clean up after a task-switch - * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -2232,10 +2231,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) + * + * The context switch have flipped the stack from under us and restored the + * local variables which were saved when this task called schedule() in the + * past. prev == current is still correct but we need to recalculate this_rq + * because prev may have moved to another CPU. 
*/ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static struct rq *finish_task_switch(struct task_struct *prev) __releases(rq->lock) { + struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; long prev_state; @@ -2275,6 +2280,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) } tick_nohz_task_switch(current); + return rq; } #ifdef CONFIG_SMP @@ -2313,8 +2319,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* finish_task_switch() drops rq->lock and enables preemtion */ preempt_disable(); - rq = this_rq(); - finish_task_switch(rq, prev); + rq = finish_task_switch(prev); post_schedule(rq); preempt_enable(); @@ -2323,10 +2328,9 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) } /* - * context_switch - switch to the new MM and the new - * thread's register state. + * context_switch - switch to the new MM and the new thread's register state. */ -static inline void +static inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2365,14 +2369,9 @@ context_switch(struct rq *rq, struct task_struct *prev, context_tracking_task_switch(prev, next); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); - barrier(); - /* - * this_rq must be evaluated again because prev may have moved - * CPUs since it called schedule(), thus the 'rq' on its stack - * frame will be invalid. - */ - finish_task_switch(this_rq(), prev); + + return finish_task_switch(prev); } /* @@ -2854,15 +2853,8 @@ need_resched: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ - /* - * The context switch have flipped the stack from under us - * and restored the local variables which were saved when - * this task called schedule() in the past. prev == current - * is still correct, but it can be moved to another cpu/rq. 
- */ - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + rq = context_switch(rq, prev, next); /* unlocks the rq */ + cpu = cpu_of(rq); } else raw_spin_unlock_irq(&rq->lock); -- cgit v1.2.3 From e2336f6e51edda875a49770b616ed5b02a74665b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Oct 2014 20:33:48 +0200 Subject: sched: Kill task_preempt_count() task_preempt_count() is pointless if preemption counter is per-cpu, currently this is x86 only. It is only valid if the task is not running, and even in this case the only info it can provide is the state of PREEMPT_ACTIVE bit. Change its single caller to check p->on_rq instead, this should be the same if p->state != TASK_RUNNING, and kill this helper. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Steven Rostedt Cc: Kirill Tkhai Cc: Alexander Graf Cc: Andrew Morton Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/20141008183348.GC17495@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1b69603c1d3e..5c067fd66db9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1054,7 +1054,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_preempt_count(p) & PREEMPT_ACTIVE)); + !p->on_rq); #ifdef CONFIG_LOCKDEP /* -- cgit v1.2.3 From d9aade7ae1d283097a3f626790e7c325a5c69007 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 22 Oct 2014 08:36:43 +0800 Subject: sched/deadline: Do not try to push tasks if pinned task switches to dl As Kirill mentioned (https://lkml.org/lkml/2013/1/29/118): | If rq has already had 2 or more pushable tasks and we try to add a | pinned task then call of push_rt_task will just waste a time. 
Just switched pinned task is not able to be pushed. If the rq has had several dl tasks before they have already been considered as candidates to be pushed (or pulled). This patch implements the same behavior as rt class which was introduced by commit 10447917551e ("sched/rt: Do not try to push tasks if pinned task switches to RT"). Suggested-by: Kirill V Tkhai Acked-by: Juri Lelli Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Steven Rostedt Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413938203-224610-1-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5285332392d5..9d1e76a21297 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1622,7 +1622,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) + if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && + push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ check_resched = 0; #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 7f51412a415d87ea8598d14722fb31e4f5701257 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Sep 2014 10:22:40 +0100 Subject: sched/deadline: Fix bandwidth check/update when migrating tasks between exclusive cpusets Exclusive cpusets are the only way users can restrict SCHED_DEADLINE tasks affinity (performing what is commonly called clustered scheduling). Unfortunately, such a thing is currently broken for two reasons: - No check is performed when the user tries to attach a task to an exclusive cpuset (recall that exclusive cpusets have an associated maximum allowed bandwidth). - Bandwidths of source and destination cpusets are not correctly updated after a task is migrated between them. 
This patch fixes both things at once, as they are opposite faces of the same coin. The check is performed in cpuset_can_attach(), as there aren't any points of failure after that function. The update is split in two halves. We first reserve bandwidth in the destination cpuset, after we pass the check in cpuset_can_attach(). And we then release bandwidth from the source cpuset when the task's affinity is actually changed. Even if there can be time windows when sched_setattr() may erroneously fail in the source cpuset, we are fine with it, as we can't perform an atomic update of both cpusets at once. Reported-by: Daniel Wagner Reported-by: Vincent Legout Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Dario Faggioli Cc: Michael Trimarchi Cc: Fabio Checconi Cc: michael@amarulasolutions.com Cc: luca.abeni@unitn.it Cc: Li Zefan Cc: Linus Torvalds Cc: cgroups@vger.kernel.org Link: http://lkml.kernel.org/r/1411118561-26323-3-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 70 +++++++++++++++++++++++++++++++++++-------------- kernel/sched/deadline.c | 25 ++++++++++++++++-- kernel/sched/sched.h | 19 ++++++++++++++ 3 files changed, 93 insertions(+), 21 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5c067fd66db9..9993feeb8b10 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2034,25 +2034,6 @@ static inline int dl_bw_cpus(int i) } #endif -static inline -void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw -= tsk_bw; -} - -static inline -void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) -{ - dl_b->total_bw += tsk_bw; -} - -static inline -bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) -{ - return dl_b->bw != -1 && - dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; -} - /* * We must be sure that accepting a new task (or allowing changing the * parameters of an existing one) is consistent with the bandwidth @@ 
-4669,6 +4650,57 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed) +{ + int ret = 0; + + /* + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. + */ + if (p->flags & PF_NO_SETAFFINITY) { + ret = -EINVAL; + goto out; + } + +#ifdef CONFIG_SMP + if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, + cs_cpus_allowed)) { + unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, + cs_cpus_allowed); + struct dl_bw *dl_b = dl_bw_of(dest_cpu); + bool overflow; + int cpus; + unsigned long flags; + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(dest_cpu); + overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); + if (overflow) + ret = -EBUSY; + else { + /* + * We reserve space for this task in the destination + * root_domain, as we can't fail after this point. + * We will free resources in the source root_domain + * later on (see set_cpus_allowed_dl()). + */ + __dl_add(dl_b, p->dl.dl_bw); + } + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + } +#endif +out: + return ret; +} + #ifdef CONFIG_SMP /* * move_queued_task - move a queued task to new rq. 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9d1e76a21297..8aaa971ffecd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1517,10 +1517,33 @@ static void set_cpus_allowed_dl(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq; + struct root_domain *src_rd; int weight; BUG_ON(!dl_task(p)); + rq = task_rq(p); + src_rd = rq->rd; + /* + * Migrating a SCHED_DEADLINE task between exclusive + * cpusets (different root_domains) entails a bandwidth + * update. We already made space for us in the destination + * domain (see cpuset_can_attach()). + */ + if (!cpumask_intersects(src_rd->span, new_mask)) { + struct dl_bw *src_dl_b; + + src_dl_b = dl_bw_of(cpu_of(rq)); + /* + * We now free resources of the root_domain we are migrating + * off. In the worst case, sched_setattr() may temporary fail + * until we complete the update. + */ + raw_spin_lock(&src_dl_b->lock); + __dl_clear(src_dl_b, p->dl.dl_bw); + raw_spin_unlock(&src_dl_b->lock); + } + /* * Update only if the task is actually running (i.e., * it is on the rq AND it is not throttled). 
@@ -1537,8 +1560,6 @@ static void set_cpus_allowed_dl(struct task_struct *p, if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; - rq = task_rq(p); - /* * The process used to be able to migrate OR it can now migrate */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 57aacea1cbdf..ec3917c5f898 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -176,6 +176,25 @@ struct dl_bw { u64 bw, total_bw; }; +static inline +void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw -= tsk_bw; +} + +static inline +void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) +{ + dl_b->total_bw += tsk_bw; +} + +static inline +bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) +{ + return dl_b->bw != -1 && + dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; +} + extern struct mutex sched_domains_mutex; #ifdef CONFIG_CGROUP_SCHED -- cgit v1.2.3 From f82f80426f7afcf55953924e71555984a4bd6ce6 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 7 Oct 2014 09:52:11 +0100 Subject: sched/deadline: Ensure that updates to exclusive cpusets don't break AC How we deal with updates to exclusive cpusets is currently broken. As an example, suppose we have an exclusive cpuset composed of two cpus: A[cpu0,cpu1]. We can assign SCHED_DEADLINE task to it up to the allowed bandwidth. If we want now to modify cpusetA's cpumask, we have to check that removing a cpu's amount of bandwidth doesn't break AC guarantees. This thing isn't checked in the current code. This patch fixes the problem above, denying an update if the new cpumask won't have enough bandwidth for SCHED_DEADLINE tasks that are currently active. 
Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Li Zefan Cc: cgroups@vger.kernel.org Link: http://lkml.kernel.org/r/5433E6AF.5080105@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9993feeb8b10..0456a55fc27f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4650,6 +4650,25 @@ void init_idle(struct task_struct *idle, int cpu) #endif } +int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial) +{ + int ret = 1, trial_cpus; + struct dl_bw *cur_dl_b; + unsigned long flags; + + cur_dl_b = dl_bw_of(cpumask_any(cur)); + trial_cpus = cpumask_weight(trial); + + raw_spin_lock_irqsave(&cur_dl_b->lock, flags); + if (cur_dl_b->bw != -1 && + cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + ret = 0; + raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); + + return ret; +} + int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) { -- cgit v1.2.3 From 1d7e974cbf2fce2683f34ff33c173fd7ef5478c7 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 14 Oct 2014 10:22:39 +0800 Subject: sched/deadline: Don't check SD_BALANCE_FORK There is no need to do balance during fork since SCHED_DEADLINE tasks can't fork. This patch avoids the SD_BALANCE_FORK check. 
Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1413253360-5318-1-git-send-email-wanpeng.li@linux.intel.com Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8aaa971ffecd..fab3bf81bb7c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -933,7 +933,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (sd_flag != SD_BALANCE_WAKE) goto out; rq = cpu_rq(cpu); -- cgit v1.2.3 From f4e9d94a5bf60193d45f92b136e3d166be3ec8d5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 14 Oct 2014 10:22:40 +0800 Subject: sched/deadline: Don't balance during wakeup if wakee is pinned Use nr_cpus_allowed to bail from select_task_rq() when only one cpu can be used, and saves some cycles for pinned tasks. 
Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1413253360-5318-2-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fab3bf81bb7c..2e31a30e623c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -933,6 +933,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) struct task_struct *curr; struct rq *rq; + if (p->nr_cpus_allowed == 1) + goto out; + if (sd_flag != SD_BALANCE_WAKE) goto out; -- cgit v1.2.3 From 61ada528dea028331e99e8ceaed87c683ad25de2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:47 +0200 Subject: sched/wait: Provide infrastructure to deal with nested blocking There are a few places that call blocking primitives from wait loops, provide infrastructure to support this without the typical task_struct::state collision. We record the wakeup in wait_queue_t::flags which leaves task_struct::state free to be used by others. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: tglx@linutronix.de Cc: ilya.dryomov@inktank.com Cc: umgwanakikbuti@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140924082242.051202318@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5a62915f47a8..4dae1885db6f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -297,6 +297,67 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * } EXPORT_SYMBOL(autoremove_wake_function); + +/* + * DEFINE_WAIT_FUNC(wait, woken_wake_func); + * + * add_wait_queue(&wq, &wait); + * for (;;) { + * if (condition) + * break; + * + * p->state = mode; condition = true; + * smp_mb(); // A smp_wmb(); // C + * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN; + * schedule() try_to_wake_up(); + * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~ + * wait->flags &= ~WQ_FLAG_WOKEN; condition = true; + * smp_mb() // B smp_wmb(); // C + * wait->flags |= WQ_FLAG_WOKEN; + * } + * remove_wait_queue(&wq, &wait); + * + */ +long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) +{ + set_current_state(mode); /* A */ + /* + * The above implies an smp_mb(), which matches with the smp_wmb() from + * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must + * also observe all state before the wakeup. + */ + if (!(wait->flags & WQ_FLAG_WOKEN)) + timeout = schedule_timeout(timeout); + __set_current_state(TASK_RUNNING); + + /* + * The below implies an smp_mb(), it too pairs with the smp_wmb() from + * woken_wake_function() such that we must either observe the wait + * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss + * an event. 
+ */ + set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */ + + return timeout; +} +EXPORT_SYMBOL(wait_woken); + +int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expects write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in wait_woken(). + */ + smp_wmb(); /* C */ + wait->flags |= WQ_FLAG_WOKEN; + + return default_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(woken_wake_function); + int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { struct wait_bit_key *key = arg; -- cgit v1.2.3 From 8eb23b9f35aae413140d3fda766a98092c21e9b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:55 +0200 Subject: sched: Debug nested sleeps Validate we call might_sleep() with TASK_RUNNING, which catches places where we nest blocking primitives, eg. mutex usage in a wait loop. Since all blocking is arranged through task_struct::state, nesting this will cause the inner primitive to set TASK_RUNNING and the outer will thus not block. Another observed problem is calling a blocking function from schedule()->sched_submit_work()->blk_schedule_flush_plug() which will then destroy the task state for the actual __schedule() call that comes after it. 
Signed-off-by: Peter Zijlstra (Intel) Cc: tglx@linutronix.de Cc: ilya.dryomov@inktank.com Cc: umgwanakikbuti@gmail.com Cc: oleg@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140924082242.591637616@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0456a55fc27f..5b4b96b27cd7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7298,6 +7298,19 @@ void __might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + /* + * Blocking primitives will set (and therefore destroy) current->state, + * since we will exit with TASK_RUNNING make sure we enter with it, + * otherwise we will destroy state. + */ + if (WARN(current->state != TASK_RUNNING, + "do not call blocking ops when !TASK_RUNNING; " + "state=%lx set at [<%p>] %pS\n", + current->state, + (void *)current->task_state_change, + (void *)current->task_state_change)) + __set_current_state(TASK_RUNNING); + rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || -- cgit v1.2.3 From 3427445afd26bd2395f29241319283a93f362cd0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Sep 2014 10:18:56 +0200 Subject: sched: Exclude cond_resched() from nested sleep test cond_resched() is a preemption point, not strictly a blocking primitive, so exclude it from the ->state test. In particular, preemption preserves task_struct::state. 
Signed-off-by: Peter Zijlstra (Intel) Cc: tglx@linutronix.de Cc: ilya.dryomov@inktank.com Cc: umgwanakikbuti@gmail.com Cc: oleg@redhat.com Cc: Alex Elder Cc: Andrew Morton Cc: Axel Lin Cc: Daniel Borkmann Cc: Dave Jones Cc: Jason Baron Cc: Linus Torvalds Cc: Rusty Russell Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140924082242.656559952@infradead.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b4b96b27cd7..b9f78f12ac22 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7296,8 +7296,6 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { - static unsigned long prev_jiffy; /* ratelimiting */ - /* * Blocking primitives will set (and therefore destroy) current->state, * since we will exit with TASK_RUNNING make sure we enter with it, @@ -7311,6 +7309,14 @@ void __might_sleep(const char *file, int line, int preempt_offset) (void *)current->task_state_change)) __set_current_state(TASK_RUNNING); + ___might_sleep(file, line, preempt_offset); +} +EXPORT_SYMBOL(__might_sleep); + +void ___might_sleep(const char *file, int line, int preempt_offset) +{ + static unsigned long prev_jiffy; /* ratelimiting */ + rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && !is_idle_task(current)) || @@ -7340,7 +7346,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) #endif dump_stack(); } -EXPORT_SYMBOL(__might_sleep); +EXPORT_SYMBOL(___might_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ -- cgit v1.2.3 From cb6538e740d7543cd989128625cf8cac4b471e0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 31 Oct 2014 11:57:30 +0100 Subject: sched/wait: Fix a kthread race with wait_woken() There is a race between kthread_stop() and the new wait_woken() that can result in a lack of progress. CPU 0 | CPU 1 | rfcomm_run() | kthread_stop() ... | if (!test_bit(KTHREAD_SHOULD_STOP)) | | set_bit(KTHREAD_SHOULD_STOP) | wake_up_process() wait_woken() | wait_for_completion() set_current_state(INTERRUPTIBLE) | if (!WQ_FLAG_WOKEN) | schedule_timeout() | | After which both tasks will wait.. forever. Fix this by having wait_woken() check for kthread_should_stop() but only for kthreads (obviously). 
Signed-off-by: Peter Zijlstra (Intel) Cc: Peter Hurley Cc: Oleg Nesterov Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/wait.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4dae1885db6f..852143a79f36 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -9,6 +9,7 @@ #include #include #include +#include void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { @@ -297,6 +298,10 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void * } EXPORT_SYMBOL(autoremove_wake_function); +static inline bool is_kthread_should_stop(void) +{ + return (current->flags & PF_KTHREAD) && kthread_should_stop(); +} /* * DEFINE_WAIT_FUNC(wait, woken_wake_func); @@ -326,7 +331,7 @@ long wait_woken(wait_queue_t *wait, unsigned mode, long timeout) * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must * also observe all state before the wakeup. */ - if (!(wait->flags & WQ_FLAG_WOKEN)) + if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop()) timeout = schedule_timeout(timeout); __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From e7097e8bd0074b465f9c78dcff25cd3f82382581 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Oct 2014 17:08:45 +0100 Subject: sched: Use WARN_ONCE for the might_sleep() TASK_RUNNING test In some cases this can trigger a true flood of output. 
Requested-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b9f78f12ac22..0cd34e68680c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7301,7 +7301,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. */ - if (WARN(current->state != TASK_RUNNING, + if (WARN_ONCE(current->state != TASK_RUNNING, "do not call blocking ops when !TASK_RUNNING; " "state=%lx set at [<%p>] %pS\n", current->state, -- cgit v1.2.3 From 67dfa1b756f250972bde31d65e3f8fde6aeddc5b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 27 Oct 2014 17:40:52 +0300 Subject: sched/deadline: Implement cancel_dl_timer() to use in switched_from_dl() The currently used hrtimer_try_to_cancel() is racy: raw_spin_lock(&rq->lock) ... dl_task_timer raw_spin_lock(&rq->lock) ... raw_spin_lock(&rq->lock) ... switched_from_dl() ... ... hrtimer_try_to_cancel() ... ... switched_to_fair() ... ... ... ... ... ... ... ... raw_spin_unlock(&rq->lock) ... (acquired) ... ... ... ... ... ... do_exit() ... ... schedule() ... ... raw_spin_lock(&rq->lock) ... raw_spin_unlock(&rq->lock) ... ... ... raw_spin_unlock(&rq->lock) ... raw_spin_lock(&rq->lock) ... ... (acquired) put_task_struct() ... ... free_task_struct() ... ... ... ... raw_spin_unlock(&rq->lock) ... (acquired) ... ... ... ... ... (use after free) ... So, let's implement a 100% guaranteed way to cancel the timer and let's be sure we are safe even in very unlikely situations. rq unlocking does not limit the area of switched_from_dl() use, because this has already been possible in pull_dl_task() below. Let's consider the safety of this unlocking. The new code in the patch runs when hrtimer_try_to_cancel() fails. 
This means the callback is running. In this case hrtimer_cancel() is just waiting till the callback is finished. Two cases are possible: 1) Since we are in switched_from_dl(), the new class is not dl_sched_class and the new prio is not less than MAX_DL_PRIO. So, the callback returns early; it's right after the !dl_task() check. After that hrtimer_cancel() returns too. The above is: raw_spin_lock(rq->lock); ... ... dl_task_timer() ... raw_spin_lock(rq->lock); switched_from_dl() ... hrtimer_try_to_cancel() ... raw_spin_unlock(rq->lock); ... hrtimer_cancel() ... ... raw_spin_unlock(rq->lock); ... return HRTIMER_NORESTART; ... ... raw_spin_lock(rq->lock); ... 2) But the below is also possible: dl_task_timer() raw_spin_lock(rq->lock); ... raw_spin_unlock(rq->lock); raw_spin_lock(rq->lock); ... switched_from_dl() ... hrtimer_try_to_cancel() ... ... return HRTIMER_NORESTART; raw_spin_unlock(rq->lock); ... hrtimer_cancel(); ... raw_spin_lock(rq->lock); ... In this case hrtimer_cancel() returns immediately. This is a very unlikely case, mentioned only for completeness. Nobody can manipulate the task, because check_class_changed() is always called with pi_lock locked. Nobody can force the task to participate in (concurrent) priority inheritance schemes (for the same reason). All concurrent task operations require pi_lock, which is held by us. No deadlocks with dl_task_timer() are possible, because it returns right after the !dl_task() check (it does nothing). If we receive a new dl_task during the time of unlocked rq, we just don't have to do pull_dl_task() in switched_from_dl() further. 
Signed-off-by: Kirill Tkhai [ Added comments] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414420852.19914.186.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++++ kernel/sched/deadline.c | 34 +++++++++++++++++++++++++++------- kernel/sched/sched.h | 5 +++++ 3 files changed, 36 insertions(+), 7 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0cd34e68680c..379cb87da69d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } +/* + * Can drop rq->lock because from sched_class::switched_from() methods drop it. + */ static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio) @@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, if (prev_class != p->sched_class) { if (prev_class->switched_from) prev_class->switched_from(rq, p); + /* Possble rq->lock 'hole'. */ p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2e31a30e623c..9d483e862e58 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -563,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) { struct hrtimer *timer = &dl_se->dl_timer; - if (hrtimer_active(timer)) { - hrtimer_try_to_cancel(timer); - return; - } - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); timer->function = dl_task_timer; } @@ -1610,10 +1605,35 @@ void init_sched_dl_class(void) #endif /* CONFIG_SMP */ +/* + * Ensure p's dl_timer is cancelled. May drop rq->lock for a while. 
+ */ +static void cancel_dl_timer(struct rq *rq, struct task_struct *p) +{ + struct hrtimer *dl_timer = &p->dl.dl_timer; + + /* Nobody will change task's class if pi_lock is held */ + lockdep_assert_held(&p->pi_lock); + + if (hrtimer_active(dl_timer)) { + int ret = hrtimer_try_to_cancel(dl_timer); + + if (unlikely(ret == -1)) { + /* + * Note, p may migrate OR new deadline tasks + * may appear in rq when we are unlocking it. + * A caller of us must be fine with that. + */ + raw_spin_unlock(&rq->lock); + hrtimer_cancel(dl_timer); + raw_spin_lock(&rq->lock); + } + } +} + static void switched_from_dl(struct rq *rq, struct task_struct *p) { - if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) - hrtimer_try_to_cancel(&p->dl.dl_timer); + cancel_dl_timer(rq, p); __dl_clear_params(p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ec3917c5f898..49b941fe2cc2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1157,6 +1157,11 @@ struct sched_class { void (*task_fork) (struct task_struct *p); void (*task_dead) (struct task_struct *p); + /* + * The switched_from() call is allowed to drop rq->lock, therefore we + * cannot assume the switched_from/switched_to pair is serliazed by + * rq->lock. They are however serialized by p->pi_lock. + */ void (*switched_from) (struct rq *this_rq, struct task_struct *task); void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, -- cgit v1.2.3 From 9f96742a13135e6c609cc99a3a458402af3c8f31 Mon Sep 17 00:00:00 2001 From: Yao Dongdong Date: Tue, 28 Oct 2014 04:08:06 +0000 Subject: sched: Check if we got a shallowest_idle_cpu before searching for least_loaded_cpu Idle cpu is idler than non-idle cpu, so we needn't search for least_loaded_cpu after we have found an idle cpu. 
Signed-off-by: Yao Dongdong Reviewed-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1414469286-6023-1-git-send-email-yaodongdong@huawei.com Signed-off-by: Ingo Molnar