summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-06-12 19:42:15 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2014-06-12 19:42:15 -0700
commitb2e09f633a3994ee97fa6bc734b533d9c8e6ea0f (patch)
tree8f398d3f7ac19a4f4d64862086597f335d977203
parent3737a12761636ebde0f09ef49daebb8eed18cc8a (diff)
parent535560d841b2d54f31280e05e9c6ffd19da0c4e7 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull more scheduler updates from Ingo Molnar: "Second round of scheduler changes: - try-to-wakeup and IPI reduction speedups, from Andy Lutomirski - continued power scheduling cleanups and refactorings, from Nicolas Pitre - misc fixes and enhancements" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/deadline: Delete extraneous extern for to_ratio() sched/idle: Optimize try-to-wake-up IPI sched/idle: Simplify wake_up_idle_cpu() sched/idle: Clear polling before descheduling the idle thread sched, trace: Add a tracepoint for IPI-less remote wakeups cpuidle: Set polling in poll_idle sched: Remove redundant assignment to "rt_rq" in update_curr_rt(...) sched: Rename capacity related flags sched: Final power vs. capacity cleanups sched: Remove remaining dubious usage of "power" sched: Let 'struct sched_group_power' care about CPU capacity sched/fair: Disambiguate existing/remaining "capacity" usage sched/fair: Change "has_capacity" to "has_free_capacity" sched/fair: Remove "power" from 'struct numa_stats' sched: Fix signedness bug in yield_to() sched/fair: Use time_after() in record_wakee() sched/balancing: Reduce the rate of needless idle load balancing sched/fair: Fix unlocked reads of some cfs_b->quota/period
-rw-r--r--arch/arm/kernel/topology.c54
-rw-r--r--arch/powerpc/kernel/smp.c2
-rw-r--r--drivers/cpuidle/driver.c7
-rw-r--r--include/linux/kvm_host.h2
-rw-r--r--include/linux/sched.h14
-rw-r--r--include/trace/events/sched.h20
-rw-r--r--kernel/sched/core.c182
-rw-r--r--kernel/sched/deadline.c2
-rw-r--r--kernel/sched/fair.c390
-rw-r--r--kernel/sched/features.h8
-rw-r--r--kernel/sched/idle.c30
-rw-r--r--kernel/sched/rt.c3
-rw-r--r--kernel/sched/sched.h24
-rw-r--r--virt/kvm/kvm_main.c4
14 files changed, 416 insertions, 326 deletions
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 3997c411c140..9d853189028b 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -26,30 +26,30 @@
#include <asm/topology.h>
/*
- * cpu power scale management
+ * cpu capacity scale management
*/
/*
- * cpu power table
+ * cpu capacity table
* This per cpu data structure describes the relative capacity of each core.
* On a heteregenous system, cores don't have the same computation capacity
- * and we reflect that difference in the cpu_power field so the scheduler can
- * take this difference into account during load balance. A per cpu structure
- * is preferred because each CPU updates its own cpu_power field during the
- * load balance except for idle cores. One idle core is selected to run the
- * rebalance_domains for all idle cores and the cpu_power can be updated
- * during this sequence.
+ * and we reflect that difference in the cpu_capacity field so the scheduler
+ * can take this difference into account during load balance. A per cpu
+ * structure is preferred because each CPU updates its own cpu_capacity field
+ * during the load balance except for idle cores. One idle core is selected
+ * to run the rebalance_domains for all idle cores and the cpu_capacity can be
+ * updated during this sequence.
*/
static DEFINE_PER_CPU(unsigned long, cpu_scale);
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
+unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
return per_cpu(cpu_scale, cpu);
}
-static void set_power_scale(unsigned int cpu, unsigned long power)
+static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
{
- per_cpu(cpu_scale, cpu) = power;
+ per_cpu(cpu_scale, cpu) = capacity;
}
#ifdef CONFIG_OF
@@ -62,11 +62,11 @@ struct cpu_efficiency {
* Table of relative efficiency of each processors
* The efficiency value must fit in 20bit and the final
* cpu_scale value must be in the range
- * 0 < cpu_scale < 3*SCHED_POWER_SCALE/2
+ * 0 < cpu_scale < 3*SCHED_CAPACITY_SCALE/2
* in order to return at most 1 when DIV_ROUND_CLOSEST
* is used to compute the capacity of a CPU.
* Processors that are not defined in the table,
- * use the default SCHED_POWER_SCALE value for cpu_scale.
+ * use the default SCHED_CAPACITY_SCALE value for cpu_scale.
*/
static const struct cpu_efficiency table_efficiency[] = {
{"arm,cortex-a15", 3891},
@@ -83,9 +83,9 @@ static unsigned long middle_capacity = 1;
* Iterate all CPUs' descriptor in DT and compute the efficiency
* (as per table_efficiency). Also calculate a middle efficiency
* as close as possible to (max{eff_i} - min{eff_i}) / 2
- * This is later used to scale the cpu_power field such that an
- * 'average' CPU is of middle power. Also see the comments near
- * table_efficiency[] and update_cpu_power().
+ * This is later used to scale the cpu_capacity field such that an
+ * 'average' CPU is of middle capacity. Also see the comments near
+ * table_efficiency[] and update_cpu_capacity().
*/
static void __init parse_dt_topology(void)
{
@@ -141,15 +141,15 @@ static void __init parse_dt_topology(void)
* cpu_scale because all CPUs have the same capacity. Otherwise, we
* compute a middle_capacity factor that will ensure that the capacity
* of an 'average' CPU of the system will be as close as possible to
- * SCHED_POWER_SCALE, which is the default value, but with the
+ * SCHED_CAPACITY_SCALE, which is the default value, but with the
* constraint explained near table_efficiency[].
*/
if (4*max_capacity < (3*(max_capacity + min_capacity)))
middle_capacity = (min_capacity + max_capacity)
- >> (SCHED_POWER_SHIFT+1);
+ >> (SCHED_CAPACITY_SHIFT+1);
else
middle_capacity = ((max_capacity / 3)
- >> (SCHED_POWER_SHIFT-1)) + 1;
+ >> (SCHED_CAPACITY_SHIFT-1)) + 1;
}
@@ -158,20 +158,20 @@ static void __init parse_dt_topology(void)
* boot. The update of all CPUs is in O(n^2) for heteregeneous system but the
* function returns directly for SMP system.
*/
-static void update_cpu_power(unsigned int cpu)
+static void update_cpu_capacity(unsigned int cpu)
{
if (!cpu_capacity(cpu))
return;
- set_power_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+ set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
- printk(KERN_INFO "CPU%u: update cpu_power %lu\n",
- cpu, arch_scale_freq_power(NULL, cpu));
+ printk(KERN_INFO "CPU%u: update cpu_capacity %lu\n",
+ cpu, arch_scale_freq_capacity(NULL, cpu));
}
#else
static inline void parse_dt_topology(void) {}
-static inline void update_cpu_power(unsigned int cpuid) {}
+static inline void update_cpu_capacity(unsigned int cpuid) {}
#endif
/*
@@ -267,7 +267,7 @@ void store_cpu_topology(unsigned int cpuid)
update_siblings_masks(cpuid);
- update_cpu_power(cpuid);
+ update_cpu_capacity(cpuid);
printk(KERN_INFO "CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n",
cpuid, cpu_topology[cpuid].thread_id,
@@ -297,7 +297,7 @@ void __init init_cpu_topology(void)
{
unsigned int cpu;
- /* init core mask and power*/
+ /* init core mask and capacity */
for_each_possible_cpu(cpu) {
struct cputopo_arm *cpu_topo = &(cpu_topology[cpu]);
@@ -307,7 +307,7 @@ void __init init_cpu_topology(void)
cpumask_clear(&cpu_topo->core_sibling);
cpumask_clear(&cpu_topo->thread_sibling);
- set_power_scale(cpu, SCHED_POWER_SCALE);
+ set_capacity_scale(cpu, SCHED_CAPACITY_SCALE);
}
smp_wmb();
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 7753af2d2613..51a3ff78838a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -749,7 +749,7 @@ int setup_profiling_timer(unsigned int multiplier)
/* cpumask of CPUs with asymetric SMT dependancy */
static const int powerpc_smt_flags(void)
{
- int flags = SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+ int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index 136d6a283e0a..9634f20e3926 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -187,8 +187,11 @@ static int poll_idle(struct cpuidle_device *dev,
t1 = ktime_get();
local_irq_enable();
- while (!need_resched())
- cpu_relax();
+ if (!current_set_polling_and_test()) {
+ while (!need_resched())
+ cpu_relax();
+ }
+ current_clr_polling();
t2 = ktime_get();
diff = ktime_to_us(ktime_sub(t2, t1));
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 970c68197c69..ec4e3bd83d47 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -586,7 +586,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
void kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-bool kvm_vcpu_yield_to(struct kvm_vcpu *target);
+int kvm_vcpu_yield_to(struct kvm_vcpu *target);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b8a98427f964..306f4f0c987a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -847,10 +847,10 @@ enum cpu_idle_type {
};
/*
- * Increase resolution of cpu_power calculations
+ * Increase resolution of cpu_capacity calculations
*/
-#define SCHED_POWER_SHIFT 10
-#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT)
+#define SCHED_CAPACITY_SHIFT 10
+#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
/*
* sched-domains (multiprocessor balancing) declarations:
@@ -862,7 +862,7 @@ enum cpu_idle_type {
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
-#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
+#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
@@ -874,7 +874,7 @@ enum cpu_idle_type {
#ifdef CONFIG_SCHED_SMT
static inline const int cpu_smt_flags(void)
{
- return SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES;
+ return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}
#endif
@@ -1006,7 +1006,7 @@ typedef const int (*sched_domain_flags_f)(void);
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
- struct sched_group_power **__percpu sgp;
+ struct sched_group_capacity **__percpu sgc;
};
struct sched_domain_topology_level {
@@ -2173,7 +2173,7 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
#endif
-extern bool yield_to(struct task_struct *p, bool preempt);
+extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
/**
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 67e1bbf83695..0a68d5ae584e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -530,6 +530,26 @@ TRACE_EVENT(sched_swap_numa,
__entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid,
__entry->dst_cpu, __entry->dst_nid)
);
+
+/*
+ * Tracepoint for waking a polling cpu without an IPI.
+ */
+TRACE_EVENT(sched_wake_idle_without_ipi,
+
+ TP_PROTO(int cpu),
+
+ TP_ARGS(cpu),
+
+ TP_STRUCT__entry(
+ __field( int, cpu )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu = cpu;
+ ),
+
+ TP_printk("cpu=%d", __entry->cpu)
+);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4f611561ba4c..3bdf01b494fe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -535,7 +535,7 @@ static inline void init_hrtick(void)
__old; \
})
-#ifdef TIF_POLLING_NRFLAG
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
@@ -546,12 +546,44 @@ static bool set_nr_and_not_polling(struct task_struct *p)
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
#endif
/*
@@ -580,6 +612,8 @@ void resched_task(struct task_struct *p)
if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
@@ -642,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
- /*
- * This is safe, as this function is called with the timer
- * wheel base lock of (cpu) held. When the CPU is on the way
- * to idle and has not yet set rq->curr to idle then it will
- * be serialized on the timer wheel base lock and take the new
- * timer into account automatically.
- */
- if (rq->curr != rq->idle)
- return;
-
- /*
- * We can set TIF_RESCHED on the idle task of the other CPU
- * lockless. The worst case is that the other CPU runs the
- * idle task through an additional NOOP schedule()
- */
- set_tsk_need_resched(rq->idle);
-
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(rq->idle))
+ if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
static bool wake_up_full_nohz_cpu(int cpu)
@@ -888,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
@@ -1521,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
+ unsigned long flags;
- raw_spin_lock(&rq->lock);
+ if (!llist)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1535,7 +1556,7 @@ static void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
void scheduler_ipi(void)
@@ -1581,8 +1602,14 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
- smp_send_reschedule(cpu);
+ struct rq *rq = cpu_rq(cpu);
+
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+ }
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -4219,7 +4246,7 @@ EXPORT_SYMBOL(yield);
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
*/
-bool __sched yield_to(struct task_struct *p, bool preempt)
+int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
@@ -5245,14 +5272,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
}
/*
- * Even though we initialize ->power to something semi-sane,
- * we leave power_orig unset. This allows us to detect if
+ * Even though we initialize ->capacity to something semi-sane,
+ * we leave capacity_orig unset. This allows us to detect if
* domain iteration is still funny without causing /0 traps.
*/
- if (!group->sgp->power_orig) {
+ if (!group->sgc->capacity_orig) {
printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_power not "
- "set\n");
+ printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
break;
}
@@ -5274,9 +5300,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->sgp->power != SCHED_POWER_SCALE) {
- printk(KERN_CONT " (cpu_power = %d)",
- group->sgp->power);
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+ printk(KERN_CONT " (cpu_capacity = %d)",
+ group->sgc->capacity);
}
group = group->next;
@@ -5334,7 +5360,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
+ SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
@@ -5365,7 +5391,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
+ SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN);
@@ -5490,7 +5516,7 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
struct sched_group *tmp, *first;
@@ -5501,8 +5527,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
do {
tmp = sg->next;
- if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
- kfree(sg->sgp);
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
kfree(sg);
sg = tmp;
@@ -5520,7 +5546,7 @@ static void free_sched_domain(struct rcu_head *rcu)
if (sd->flags & SD_OVERLAP) {
free_sched_groups(sd->groups, 1);
} else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgp);
+ kfree(sd->groups->sgc);
kfree(sd->groups);
}
kfree(sd);
@@ -5731,17 +5757,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, i);
- if (atomic_inc_return(&sg->sgp->ref) == 1)
+ sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
build_group_mask(sd, sg);
/*
- * Initialize sgp->power such that even if we mess up the
+ * Initialize sgc->capacity such that even if we mess up the
* domains and no possible iteration will get us here, we won't
* die on a /0 trap.
*/
- sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
- sg->sgp->power_orig = sg->sgp->power;
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -5779,8 +5805,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
- atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
}
return cpu;
@@ -5789,7 +5815,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
+ * and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
@@ -5843,16 +5869,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
}
/*
- * Initialize sched groups cpu_power.
+ * Initialize sched groups cpu_capacity.
*
- * cpu_power indicates the capacity of sched group, which is used while
+ * cpu_capacity indicates the capacity of sched group, which is used while
* distributing the load between different sched groups in a sched domain.
- * Typically cpu_power for all the groups in a sched domain will be same unless
- * there are asymmetries in the topology. If there are asymmetries, group
- * having more cpu_power will pickup more load compared to the group having
- * less cpu_power.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
*/
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
@@ -5866,8 +5892,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (cpu != group_balance_cpu(sg))
return;
- update_group_power(sd, cpu);
- atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+ update_group_capacity(sd, cpu);
+ atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
/*
@@ -5958,8 +5984,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
- *per_cpu_ptr(sdd->sgp, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
#ifdef CONFIG_NUMA
@@ -5972,7 +5998,7 @@ static int sched_domains_curr_level;
/*
* SD_flags allowed in topology descriptions.
*
- * SD_SHARE_CPUPOWER - describes SMT topologies
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
@@ -5981,7 +6007,7 @@ static int sched_domains_curr_level;
* SD_ASYM_PACKING - describes SMT quirks
*/
#define TOPOLOGY_SD_FLAGS \
- (SD_SHARE_CPUPOWER | \
+ (SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
@@ -6027,7 +6053,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 1*SD_WAKE_AFFINE
- | 0*SD_SHARE_CPUPOWER
+ | 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
@@ -6049,7 +6075,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
* Convert topological properties into behaviour.
*/
- if (sd->flags & SD_SHARE_CPUPOWER) {
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -6361,14 +6387,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
- sdd->sgp = alloc_percpu(struct sched_group_power *);
- if (!sdd->sgp)
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
return -ENOMEM;
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -6386,12 +6412,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
- if (!sgp)
+ if (!sgc)
return -ENOMEM;
- *per_cpu_ptr(sdd->sgp, j) = sgp;
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -6418,15 +6444,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j));
- if (sdd->sgp)
- kfree(*per_cpu_ptr(sdd->sgp, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
}
free_percpu(sdd->sd);
sdd->sd = NULL;
free_percpu(sdd->sg);
sdd->sg = NULL;
- free_percpu(sdd->sgp);
- sdd->sgp = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
}
}
@@ -6496,14 +6522,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
- /* Calculate CPU power for physical packages and nodes */
+ /* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
- init_sched_groups_power(i, sd);
+ init_sched_groups_capacity(i, sd);
}
}
@@ -6946,7 +6972,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_POWER_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2b8cbf09d1a4..fc4f98b1258f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
dl_b->dl_runtime = runtime;
}
-extern unsigned long to_ratio(u64 period, u64 runtime);
-
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9855e87d671a..fea7d3335e1f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
-static unsigned long power_of(int cpu);
+static unsigned long capacity_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
/* Cached statistics for all CPUs within a node */
@@ -1026,11 +1026,11 @@ struct numa_stats {
unsigned long load;
/* Total compute capacity of CPUs on a node */
- unsigned long power;
+ unsigned long compute_capacity;
/* Approximate capacity in terms of runnable tasks on a node */
- unsigned long capacity;
- int has_capacity;
+ unsigned long task_capacity;
+ int has_free_capacity;
};
/*
@@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(cpu);
- ns->power += power_of(cpu);
+ ns->compute_capacity += capacity_of(cpu);
cpus++;
}
@@ -1056,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
* the @ns structure is NULL'ed and task_numa_compare() will
* not find this node attractive.
*
- * We'll either bail at !has_capacity, or we'll detect a huge imbalance
- * and bail there.
+ * We'll either bail at !has_free_capacity, or we'll detect a huge
+ * imbalance and bail there.
*/
if (!cpus)
return;
- ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
- ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
- ns->has_capacity = (ns->nr_running < ns->capacity);
+ ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
+ ns->task_capacity =
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
struct task_numa_env {
@@ -1195,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env,
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.has_capacity &&
- !env->dst_stats.has_capacity)
+ if (env->src_stats.has_free_capacity &&
+ !env->dst_stats.has_free_capacity)
goto unlock;
goto balance;
@@ -1213,7 +1214,7 @@ balance:
orig_dst_load = env->dst_stats.load;
orig_src_load = env->src_stats.load;
- /* XXX missing power terms */
+ /* XXX missing capacity terms */
load = task_h_load(env->p);
dst_load = orig_dst_load + load;
src_load = orig_src_load - load;
@@ -1301,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p)
groupimp = group_weight(p, env.dst_nid) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
- /* If the preferred nid has capacity, try to use it. */
- if (env.dst_stats.has_capacity)
+ /* If the preferred nid has free capacity, try to use it. */
+ if (env.dst_stats.has_free_capacity)
task_numa_find_cpu(&env, taskimp, groupimp);
/* No space available on the preferred nid. Look elsewhere. */
@@ -3225,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced.
+ * whether the global deadline has advanced. It is valid to compare
+ * cfs_b->runtime_expires without any locks since we only care about
+ * exact equality, so a partial write will still work.
*/
- if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+ if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@@ -3457,21 +3460,21 @@ next:
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
- int idle = 1, throttled;
+ int throttled;
- raw_spin_lock(&cfs_b->lock);
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
- goto out_unlock;
+ goto out_deactivate;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
- /* idle depends on !throttled (for the case of a large deficit) */
- idle = cfs_b->idle && !throttled;
cfs_b->nr_periods += overrun;
- /* if we're going inactive then everything else can be deferred */
- if (idle)
- goto out_unlock;
+ /*
+ * idle depends on !throttled (for the case of a large deficit), and if
+ * we're going inactive then everything else can be deferred
+ */
+ if (cfs_b->idle && !throttled)
+ goto out_deactivate;
/*
* if we have relooped after returning idle once, we need to update our
@@ -3485,7 +3488,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
- goto out_unlock;
+ return 0;
}
/* account preceding periods in which throttling occurred */
@@ -3525,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
-out_unlock:
- if (idle)
- cfs_b->timer_active = 0;
- raw_spin_unlock(&cfs_b->lock);
- return idle;
+ return 0;
+
+out_deactivate:
+ cfs_b->timer_active = 0;
+ return 1;
}
/* a cfs_rq won't donate quota below this amount */
@@ -3707,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
int overrun;
int idle = 0;
+ raw_spin_lock(&cfs_b->lock);
for (;;) {
now = hrtimer_cb_get_time(timer);
overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -3716,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
+ raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -3775,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
struct cfs_rq *cfs_rq;