summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-13 13:37:52 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-13 13:37:52 -0800
commit3e2014637c50e5d6a77cd63d5db6c209fe29d1b1 (patch)
treea672ed603262aeddda4490056b27b09791d0cbbb
parentf2be8bd52e7410c70145f73511a2e80f4797e1a5 (diff)
parent765cc3a4b224e22bf524fabe40284a524f37cdd0 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The main updates in this cycle were: - Group balancing enhancements and cleanups (Brendan Jackman) - Move CPU isolation related functionality into its separate kernel/sched/isolation.c file, with related 'housekeeping_*()' namespace and nomenclature et al. (Frederic Weisbecker) - Improve the interactive/cpu-intense fairness calculation (Josef Bacik) - Improve the PELT code and related cleanups (Peter Zijlstra) - Improve the logic of pick_next_task_fair() (Uladzislau Rezki) - Improve the RT IPI based balancing logic (Steven Rostedt) - Various micro-optimizations: - better !CONFIG_SCHED_DEBUG optimizations (Patrick Bellasi) - better idle loop (Cheng Jian) - ... plus misc fixes, cleanups and updates" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits) sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds sched/sysctl: Fix attributes of some extern declarations sched/isolation: Document isolcpus= boot parameter flags, mark it deprecated sched/isolation: Add basic isolcpus flags sched/isolation: Move isolcpus= handling to the housekeeping code sched/isolation: Handle the nohz_full= parameter sched/isolation: Introduce housekeeping flags sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu() sched/isolation: Use its own static key sched/isolation: Make the housekeeping cpumask private sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu() sched/isolation, watchdog: Use housekeeping_cpumask() instead of ad-hoc version sched/isolation: Move housekeeping related code to its own file sched/idle: Micro-optimize the idle loop sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK x86/tsc: Append the 'tsc=' description for the 'tsc=unstable' boot parameter sched/rt: Simplify the IPI based RT balancing logic block/ioprio: Use a helper to check for RT prio sched/rt: Add a helper to test for a RT task ...
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt40
-rw-r--r--drivers/base/cpu.c11
-rw-r--r--drivers/net/ethernet/tile/tilegx.c6
-rw-r--r--fs/proc/array.c2
-rw-r--r--include/linux/cpumask.h16
-rw-r--r--include/linux/ioprio.h3
-rw-r--r--include/linux/sched.h19
-rw-r--r--include/linux/sched/isolation.h51
-rw-r--r--include/linux/sched/rt.h11
-rw-r--r--include/linux/sched/sysctl.h6
-rw-r--r--include/linux/tick.h39
-rw-r--r--include/trace/events/sched.h2
-rw-r--r--init/Kconfig7
-rw-r--r--init/main.c2
-rw-r--r--kernel/cgroup/cpuset.c15
-rw-r--r--kernel/rcu/tree_plugin.h3
-rw-r--r--kernel/rcu/update.c3
-rw-r--r--kernel/sched/Makefile1
-rw-r--r--kernel/sched/core.c56
-rw-r--r--kernel/sched/deadline.c21
-rw-r--r--kernel/sched/debug.c18
-rw-r--r--kernel/sched/fair.c1049
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/sched/isolation.c155
-rw-r--r--kernel/sched/rt.c316
-rw-r--r--kernel/sched/sched.h73
-rw-r--r--kernel/sched/topology.c49
-rw-r--r--kernel/time/tick-sched.c33
-rw-r--r--kernel/trace/trace_output.c12
-rw-r--r--kernel/trace/trace_sched_wakeup.c8
-rw-r--r--kernel/watchdog.c13
31 files changed, 1270 insertions, 774 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 116e798b61e6..38ed8787261b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1730,20 +1730,33 @@
isapnp= [ISAPNP]
Format: <RDP>,<reset>,<pci_scan>,<verbosity>
- isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler.
- The argument is a cpu list, as described above.
+ isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance.
+ [Deprecated - use cpusets instead]
+ Format: [flag-list,]<cpu-list>
+
+ Specify one or more CPUs to isolate from disturbances
+ specified in the flag list (default: domain):
+
+ nohz
+ Disable the tick when a single task runs.
+ domain
+ Isolate from the general SMP balancing and scheduling
+ algorithms. Note that performing domain isolation this way
+ is irreversible: it's not possible to bring back a CPU to
+ the domains once isolated through isolcpus. It's strongly
+ advised to use cpusets instead to disable scheduler load
+ balancing through the "cpuset.sched_load_balance" file.
+ It offers a much more flexible interface where CPUs can
+ move in and out of an isolated set anytime.
+
+ You can move a process onto or off an "isolated" CPU via
+ the CPU affinity syscalls or cpuset.
+ <cpu number> begins at 0 and the maximum value is
+ "number of CPUs in system - 1".
+
+ The format of <cpu-list> is described above.
- This option can be used to specify one or more CPUs
- to isolate from the general SMP balancing and scheduling
- algorithms. You can move a process onto or off an
- "isolated" CPU via the CPU affinity syscalls or cpuset.
- <cpu number> begins at 0 and the maximum value is
- "number of CPUs in system - 1".
- This option is the preferred way to isolate CPUs. The
- alternative -- manually setting the CPU mask of all
- tasks in the system -- can cause problems and
- suboptimal load balancer performance.
iucv= [HW,NET]
@@ -4209,6 +4222,9 @@
Used to run time disable IRQ_TIME_ACCOUNTING on any
platforms where RDTSC is slow and this accounting
can add overhead.
+ [x86] unstable: mark the TSC clocksource as unstable, this
+ marks the TSC unconditionally unstable at bootup and
+ avoids any further wobbles once the TSC watchdog notices.
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 321cd7b4d817..a73ab95558f5 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -18,6 +18,7 @@
#include <linux/cpufeature.h>
#include <linux/tick.h>
#include <linux/pm_qos.h>
+#include <linux/sched/isolation.h>
#include "base.h"
@@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev,
struct device_attribute *attr, char *buf)
{
int n = 0, len = PAGE_SIZE-2;
+ cpumask_var_t isolated;
- n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map));
+ if (!alloc_cpumask_var(&isolated, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_andnot(isolated, cpu_possible_mask,
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
+ n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated));
+
+ free_cpumask_var(isolated);
return n;
}
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index c00102b8145a..b3e5816a4678 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -40,7 +40,7 @@
#include <linux/tcp.h>
#include <linux/net_tstamp.h>
#include <linux/ptp_clock_kernel.h>
-#include <linux/tick.h>
+#include <linux/sched/isolation.h>
#include <asm/checksum.h>
#include <asm/homecache.h>
@@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void)
tile_net_dev_init(name, mac);
if (!network_cpus_init())
- cpumask_and(&network_cpus_map, housekeeping_cpumask(),
- cpu_online_mask);
+ cpumask_and(&network_cpus_map,
+ housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask);
return 0;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d82549e80402..6f6fc1672ad1 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -138,7 +138,7 @@ static const char * const task_state_array[] = {
static inline const char *get_task_state(struct task_struct *tsk)
{
BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
- return task_state_array[__get_task_state(tsk)];
+ return task_state_array[task_state_index(tsk)];
}
static inline int get_task_umask(struct task_struct *tsk)
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 8d3125c493b2..75b565194437 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -131,6 +131,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return 0;
}
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+ return 0;
+}
+
/* Valid inputs for n are -1 and 0. */
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
@@ -179,6 +184,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
+/**
+ * cpumask_last - get the last CPU in a cpumask
+ * @srcp: - the cpumask pointer
+ *
+ * Returns >= nr_cpumask_bits if no CPUs set.
+ */
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+ return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
unsigned int cpumask_next(int n, const struct cpumask *srcp);
/**
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 2cdd74809899..627efac73e6d 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -3,6 +3,7 @@
#define IOPRIO_H
#include <linux/sched.h>
+#include <linux/sched/rt.h>
#include <linux/iocontext.h>
/*
@@ -63,7 +64,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
{
if (task->policy == SCHED_IDLE)
return IOPRIO_CLASS_IDLE;
- else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
+ else if (task_is_realtime(task))
return IOPRIO_CLASS_RT;
else
return IOPRIO_CLASS_BE;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdf74f27acf1..a5dc7c98b0a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -166,8 +166,6 @@ struct task_group;
/* Task command name length: */
#define TASK_COMM_LEN 16
-extern cpumask_var_t cpu_isolated_map;
-
extern void scheduler_tick(void);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
@@ -332,9 +330,11 @@ struct load_weight {
struct sched_avg {
u64 last_update_time;
u64 load_sum;
+ u64 runnable_load_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
+ unsigned long runnable_load_avg;
unsigned long util_avg;
};
@@ -377,6 +377,7 @@ struct sched_statistics {
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
+ unsigned long runnable_weight;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
@@ -472,10 +473,10 @@ struct sched_dl_entity {
* conditions between the inactive timer handler and the wakeup
* code.
*/
- int dl_throttled;
- int dl_boosted;
- int dl_yielded;
- int dl_non_contending;
+ int dl_throttled : 1;
+ int dl_boosted : 1;
+ int dl_yielded : 1;
+ int dl_non_contending : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
@@ -1246,7 +1247,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
-static inline unsigned int __get_task_state(struct task_struct *tsk)
+static inline unsigned int task_state_index(struct task_struct *tsk)
{
unsigned int tsk_state = READ_ONCE(tsk->state);
unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
@@ -1259,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk)
return fls(state);
}
-static inline char __task_state_to_char(unsigned int state)
+static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
@@ -1270,7 +1271,7 @@ static inline char __task_state_to_char(unsigned int state)
static inline char task_state_to_char(struct task_struct *tsk)
{
- return __task_state_to_char(__get_task_state(tsk));
+ return task_index_to_char(task_state_index(tsk));
}
/**
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
new file mode 100644
index 000000000000..d849431c8060
--- /dev/null
+++ b/include/linux/sched/isolation.h
@@ -0,0 +1,51 @@
+#ifndef _LINUX_SCHED_ISOLATION_H
+#define _LINUX_SCHED_ISOLATION_H
+
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/tick.h>
+
+enum hk_flags {
+ HK_FLAG_TIMER = 1,
+ HK_FLAG_RCU = (1 << 1),
+ HK_FLAG_MISC = (1 << 2),
+ HK_FLAG_SCHED = (1 << 3),
+ HK_FLAG_TICK = (1 << 4),
+ HK_FLAG_DOMAIN = (1 << 5),
+};
+
+#ifdef CONFIG_CPU_ISOLATION
+DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
+extern int housekeeping_any_cpu(enum hk_flags flags);
+extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
+extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
+extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
+extern void __init housekeeping_init(void);
+
+#else
+
+static inline int housekeeping_any_cpu(enum hk_flags flags)
+{
+ return smp_processor_id();
+}
+
+static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+{
+ return cpu_possible_mask;
+}
+
+static inline void housekeeping_affine(struct task_struct *t,
+ enum hk_flags flags) { }
+static inline void housekeeping_init(void) { }
+#endif /* CONFIG_CPU_ISOLATION */
+
+static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
+{
+#ifdef CONFIG_CPU_ISOLATION
+ if (static_branch_unlikely(&housekeeping_overriden))
+ return housekeeping_test_cpu(cpu, flags);
+#endif
+ return true;
+}
+
+#endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index db865ed25ef3..e5af028c08b4 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -18,6 +18,17 @@ static inline int rt_task(struct task_struct *p)
return rt_prio(p->prio);
}
+static inline bool task_is_realtime(struct task_struct *tsk)
+{
+ int policy = tsk->policy;
+
+ if (policy == SCHED_FIFO || policy == SCHED_RR)
+ return true;
+ if (policy == SCHED_DEADLINE)
+ return true;
+ return false;
+}
+
#ifdef CONFIG_RT_MUTEXES
/*
* Must hold either p->pi_lock or task_rq(p)->lock.
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d6a18a3839cc..1c1a1512ec55 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -38,9 +38,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
#ifdef CONFIG_SCHED_DEBUG
-extern unsigned int sysctl_sched_migration_cost;
-extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_time_avg;
+extern __read_mostly unsigned int sysctl_sched_migration_cost;
+extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+extern __read_mostly unsigned int sysctl_sched_time_avg;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
diff --git a/include/linux/tick.h b/include/linux/tick.h
index cf413b344ddb..f442d1a42025 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -138,7 +138,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
#ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running;
extern cpumask_var_t tick_nohz_full_mask;
-extern cpumask_var_t housekeeping_mask;
static inline bool tick_nohz_full_enabled(void)
{
@@ -162,11 +161,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
cpumask_or(mask, mask, tick_nohz_full_mask);
}
-static inline int housekeeping_any_cpu(void)
-{
- return cpumask_any_and(housekeeping_mask, cpu_online_mask);
-}
-
extern void tick_nohz_dep_set(enum tick_dep_bits bit);
extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
@@ -235,11 +229,8 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
extern void tick_nohz_full_kick_cpu(int cpu);
extern void __tick_nohz_task_switch(void);
+extern void __init tick_nohz_full_setup(cpumask_var_t cpumask);
#else
-static inline int housekeeping_any_cpu(void)
-{
- return smp_processor_id();
-}
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
@@ -259,35 +250,9 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
static inline void tick_nohz_full_kick_cpu(int cpu) { }
static inline void __tick_nohz_task_switch(void) { }
+static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
#endif
-static inline const struct cpumask *housekeeping_cpumask(void)
-{
-#ifdef CONFIG_NO_HZ_FULL
- if (tick_nohz_full_enabled())
- return housekeeping_mask;
-#endif
- return cpu_possible_mask;
-}
-
-static inline bool is_housekeeping_cpu(int cpu)
-{
-#ifdef CONFIG_NO_HZ_FULL
- if (tick_nohz_full_enabled())
- return cpumask_test_cpu(cpu, housekeeping_mask);
-#endif
- return true;
-}
-
-static inline void housekeeping_affine(struct task_struct *t)
-{
-#ifdef CONFIG_NO_HZ_FULL
- if (tick_nohz_full_enabled())
- set_cpus_allowed_ptr(t, housekeeping_mask);
-
-#endif
-}
-
static inline void tick_nohz_task_switch(void)
{
if (tick_nohz_full_enabled())
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index da10aa21bebc..306b31de5194 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -118,7 +118,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
if (preempt)
return TASK_STATE_MAX;
- return __get_task_state(p);
+ return task_state_index(p);
}
#endif /* CREATE_TRACE_POINTS */
diff --git a/init/Kconfig b/init/Kconfig
index 3c1faaa2af4a..c1fd2863d4ba 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -472,6 +472,13 @@ config TASK_IO_ACCOUNTING
endmenu # "CPU/Task time and stats accounting"
+config CPU_ISOLATION
+ bool "CPU isolation"
+ help
+ Make sure that CPUs running critical tasks are not disturbed by
+ any source of "noise" such as unbound workqueues, timers, kthreads...
+ Unbound jobs get offloaded to housekeeping CPUs.
+
source "kernel/rcu/Kconfig"
config BUILD_BIN2C
diff --git a/init/main.c b/init/main.c
index 0ee9c6866ada..4610c99ae306 100644
--- a/init/main.c
+++ b/init/main.c
@@ -46,6 +46,7 @@
#include <linux/cgroup.h>
#include <linux/efi.h>
#include <linux/tick.h>
+#include <linux/sched/isolation.h>
#include <linux/interrupt.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
@@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
early_irq_init();
init_IRQ();
tick_init();
+ housekeeping_init();
rcu_init_nohz();
init_timers();
hrtimers_init();
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e2924ecb..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -57,7 +57,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
-
+#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
- cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
- if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
- goto done;
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- non_isolated_cpus);
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (is_sched_load_balance(cp))
@@ -789,7 +785,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, non_isolated_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -802,7 +798,6 @@ restart:
BUG_ON(nslot != ndoms);
done:
- free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index dd4d0d390e5b..910405dc6e5c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,6 +29,7 @@
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
+#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
@@ -2587,7 +2588,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current);
+ housekeeping_affine(current, HK_FLAG_RCU);
}
/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 27694561f769..fbd56d6e575b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -51,6 +51,7 @@
#include <linux/kthread.h>
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
+#include <linux/sched/isolation.h>
#define CREATE_TRACE_POINTS
@@ -714,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
LIST_HEAD(rcu_tasks_holdouts);
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current);
+ housekeeping_affine(current, HK_FLAG_RCU);
/*
* Each pass through the following loop makes one check for
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index a9ee16bbc693..e2f9d4feff40 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
+obj-$(CONFIG_CPU_ISOLATION) += isolation.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9446b2e5eac5..5b82a0073532 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/sched/isolation.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -42,18 +43,21 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* Debugging: various feature bits
+ *
+ * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ * sysctl_sched_features, defined in sched.h, to allow constants propagation
+ * at compile time and compiler optimization based on features default.
*/
-
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
-
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
-
#undef SCHED_FEAT
+#endif
/*
* Number of tasks to iterate in a single balance run.
@@ -83,9 +87,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/* CPUs with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -525,7 +526,7 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
rcu_read_lock();
@@ -534,15 +535,15 @@ int get_nohz_timer_target(void)
if (cpu == i)
continue;
- if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
cpu = i;
goto unlock;
}
}
}
- if (!is_housekeeping_cpu(cpu))
- cpu = housekeeping_any_cpu();
+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
return cpu;
@@ -732,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
@@ -746,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
return;
}
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
+ /*
+ * SCHED_OTHER tasks have to update their load when changing their
+ * weight
+ */
+ if (update_load && p->sched_class == &fair_sched_class) {
+ reweight_task(p, prio);
+ } else {
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
+ }
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2357,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, false);
/*
* We don't need the reset flag anymore after the fork. It has
@@ -3804,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;
@@ -3961,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, true);
}
/* Actually do priority change: must hold pi & rq lock. */
@@ -5727,10 +5736,6 @@ static inline void sched_init_smt(void) { }
void __init sched_init_smp(void)
{
- cpumask_var_t non_isolated_cpus;
-
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-
sched_init_numa();
/*
@@ -5740,16 +5745,12 @@ void __init sched_init_smp(void)
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- if (cpumask_empty(non_isolated_cpus))
- cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
sched_init_granularity();
- free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
init_sched_dl_class();
@@ -5934,7 +5935,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);
/*
* The boot idle thread does lazy MMU switching as well:
@@ -5953,9 +5954,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- /* May be allocated at isolcpus cmdline parse time */
- if (cpu_isolated_map == NULL)
- zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 4ae5c1ea90e2..f349f7e98dec 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p)
if (p->state == TA