summaryrefslogtreecommitdiffstats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c1150
1 files changed, 793 insertions, 357 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f5591071ae98..0951d1c58d2f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -772,7 +772,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
* For !fair tasks do:
*
update_cfs_rq_load_avg(now, cfs_rq);
- attach_entity_load_avg(cfs_rq, se);
+ attach_entity_load_avg(cfs_rq, se, 0);
switched_from_fair(rq, p);
*
* such that the next switched_to_fair() has the
@@ -3009,11 +3009,11 @@ static inline void update_cfs_group(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
struct rq *rq = rq_of(cfs_rq);
- if (&rq->cfs == cfs_rq) {
+ if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
/*
* There are a few boundary cases this might miss but it should
* get called often enough that that should (hopefully) not be
@@ -3028,7 +3028,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
*
* See cpu_util().
*/
- cpufreq_update_util(rq, 0);
+ cpufreq_update_util(rq, flags);
}
}
@@ -3243,6 +3243,32 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
}
/*
+ * When a task is dequeued, its estimated utilization should not be update if
+ * its util_avg has not been updated at least once.
+ * This flag is used to synchronize util_avg updates with util_est updates.
+ * We map this information into the LSB bit of the utilization saved at
+ * dequeue time (i.e. util_est.dequeued).
+ */
+#define UTIL_AVG_UNCHANGED 0x1
+
+static inline void cfs_se_util_change(struct sched_avg *avg)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Avoid store if the flag has been already set */
+ enqueued = avg->util_est.enqueued;
+ if (!(enqueued & UTIL_AVG_UNCHANGED))
+ return;
+
+ /* Reset flag to report util_avg has been updated */
+ enqueued &= ~UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(avg->util_est.enqueued, enqueued);
+}
+
+/*
* sched_entity:
*
* task:
@@ -3293,6 +3319,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit
cfs_rq->curr == se)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ cfs_se_util_change(&se->avg);
return 1;
}
@@ -3686,7 +3713,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
#endif
if (decayed)
- cfs_rq_util_change(cfs_rq);
+ cfs_rq_util_change(cfs_rq, 0);
return decayed;
}
@@ -3699,7 +3726,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
*/
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
@@ -3735,7 +3762,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
- cfs_rq_util_change(cfs_rq);
+ cfs_rq_util_change(cfs_rq, flags);
}
/**
@@ -3754,7 +3781,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
- cfs_rq_util_change(cfs_rq);
+ cfs_rq_util_change(cfs_rq, 0);
}
/*
@@ -3784,7 +3811,14 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
- attach_entity_load_avg(cfs_rq, se);
+ /*
+ * DO_ATTACH means we're here from enqueue_entity().
+ * !last_update_time means we've passed through
+ * migrate_task_rq_fair() indicating we migrated.
+ *
+ * IOW we're enqueueing a task on a new CPU.
+ */
+ attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
} else if (decayed && (flags & UPDATE_TG))
@@ -3866,6 +3900,120 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
+static inline unsigned long task_util(struct task_struct *p)
+{
+ return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+ struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+ return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+ return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + maring < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+ return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+ long last_ewma_diff;
+ struct util_est ue;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /*
+ * Update root cfs_rq's estimated utilization
+ *
+ * If *p is the last task then the root cfs_rq's estimated utilization
+ * of a CPU is 0 by definition.
+ */
+ ue.enqueued = 0;
+ if (cfs_rq->nr_running) {
+ ue.enqueued = cfs_rq->avg.util_est.enqueued;
+ ue.enqueued -= min_t(unsigned int, ue.enqueued,
+ (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ }
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+ /*
+ * Skip update of task's estimated utilization when the task has not
+ * yet completed an activation, e.g. being migrated.
+ */
+ if (!task_sleep)
+ return;
+
+ /*
+ * If the PELT values haven't changed since enqueue time,
+ * skip the util_est update.
+ */
+ ue = p->se.avg.util_est;
+ if (ue.enqueued & UTIL_AVG_UNCHANGED)
+ return;
+
+ /*
+ * Skip update of task's estimated utilization when its EWMA is
+ * already ~1% close to its last activation value.
+ */
+ ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ last_ewma_diff = ue.enqueued - ue.ewma;
+ if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+ return;
+
+ /*
+ * Update Task's estimated utilization
+ *
+ * When *p completes an activation we can consolidate another sample
+ * of the task size. This is done by storing the current PELT value
+ * as ue.enqueued and by using this value to update the Exponential
+ * Weighted Moving Average (EWMA):
+ *
+ * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
+ * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
+ * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+ * = w * ( last_ewma_diff ) + ewma(t-1)
+ * = w * (last_ewma_diff + ewma(t-1) / w)
+ *
+ * Where 'w' is the weight of new samples, which is configured to be
+ * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+ */
+ ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ue.ewma += last_ewma_diff;
+ ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+ WRITE_ONCE(p->se.avg.util_est, ue);
+}
+
#else /* CONFIG_SMP */
static inline int
@@ -3880,13 +4028,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
{
- cfs_rq_util_change(cfs_rq);
+ cfs_rq_util_change(cfs_rq, 0);
}
static inline void remove_entity_load_avg(struct sched_entity *se) {}
static inline void
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
@@ -3895,6 +4043,13 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
return 0;
}
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
+
#endif /* CONFIG_SMP */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -5242,6 +5397,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
add_nr_running(rq, 1);
+ util_est_enqueue(&rq->cfs, p);
hrtick_update(rq);
}
@@ -5301,6 +5457,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (!se)
sub_nr_running(rq, 1);
+ util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
@@ -5376,6 +5533,15 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
}
return load;
}
+
+static struct {
+ cpumask_var_t idle_cpus_mask;
+ atomic_t nr_cpus;
+ int has_blocked; /* Idle CPUS has blocked load */
+ unsigned long next_balance; /* in jiffy units */
+ unsigned long next_blocked; /* Next update of blocked load in jiffies */
+} nohz ____cacheline_aligned;
+
#endif /* CONFIG_NO_HZ_COMMON */
/**
@@ -5819,7 +5985,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
return target;
}
-static inline unsigned long task_util(struct task_struct *p);
static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
@@ -6301,11 +6466,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
+/**
+ * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
+ * @cpu: the CPU to get the utilization of
+ *
+ * The unit of the return value must be the one of capacity so we can compare
+ * the utilization with the capacity of the CPU that is available for CFS task
+ * (ie cpu_capacity).
*
* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
* recent utilization of currently non-runnable tasks on a CPU. It represents
@@ -6316,6 +6483,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
* the running time on this CPU scaled by capacity_curr.
*
+ * The estimated utilization of a CPU is defined to be the maximum between its
+ * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
+ * currently RUNNABLE on that CPU.
+ * This allows to properly represent the expected utilization of a CPU which
+ * has just got a big task running since a long sleep period. At the same time
+ * however it preserves the benefits of the "blocked utilization" in
+ * describing the potential for other tasks waking up on the same CPU.
+ *
* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
* higher than capacity_orig because of unfortunate rounding in
* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
@@ -6326,18 +6501,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
* available capacity. We allow utilization to overshoot capacity_curr (but not
* capacity_orig) as it useful for predicting the capacity required after task
* migrations (scheduler-driven DVFS).
+ *
+ * Return: the (estimated) utilization for the specified CPU
*/
-static unsigned long cpu_util(int cpu)
+static inline unsigned long cpu_util(int cpu)
{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
+ struct cfs_rq *cfs_rq;
+ unsigned int util;
- return (util >= capacity) ? capacity : util;
-}
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
-static inline unsigned long task_util(struct task_struct *p)
-{
- return p->se.avg.util_avg;
+ if (sched_feat(UTIL_EST))
+ util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+ return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
@@ -6346,16 +6524,54 @@ static inline unsigned long task_util(struct task_struct *p)
*/
static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
{
- unsigned long util, capacity;
+ struct cfs_rq *cfs_rq;
+ unsigned int util;
/* Task has no contribution or is new */
- if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util(cpu);
- capacity = capacity_orig_of(cpu);
- util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
+
+ /* Discount task's blocked util from CPU's util */
+ util -= min_t(unsigned int, util, task_util(p));
+
+ /*
+ * Covered cases:
+ *
+ * a) if *p is the only task sleeping on this CPU, then:
+ * cpu_util (== task_util) > util_est (== 0)
+ * and thus we return:
+ * cpu_util_wake = (cpu_util - task_util) = 0
+ *
+ * b) if other tasks are SLEEPING on this CPU, which is now exiting
+ * IDLE, then:
+ * cpu_util >= task_util
+ * cpu_util > util_est (== 0)
+ * and thus we discount *p's blocked utilization to return:
+ * cpu_util_wake = (cpu_util - task_util) >= 0
+ *
+ * c) if other tasks are RUNNABLE on that CPU and
+ * util_est > cpu_util
+ * then we use util_est since it returns a more restrictive
+ * estimation of the spare capacity on that CPU, by just
+ * considering the expected utilization of tasks already
+ * runnable on that CPU.
+ *
+ * Cases a) and b) are covered by the above code, while case c) is
+ * covered by the following code when estimated utilization is
+ * enabled.
+ */
+ if (sched_feat(UTIL_EST))
+ util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
- return (util >= capacity) ? capacity : util;
+ /*
+ * Utilization (estimated) can exceed the CPU capacity, thus let's
+ * clamp to the maximum CPU capacity to ensure consistency with
+ * the cpu_util call.
+ */
+ return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
@@ -6797,7 +7013,7 @@ simple:
p = task_of(se);
-done: __maybe_unused
+done: __maybe_unused;
#ifdef CONFIG_SMP
/*
* Move the next running task to the front of
@@ -7022,6 +7238,8 @@ enum fbq_type { regular, remote, all };
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
#define LBF_SOME_PINNED 0x08
+#define LBF_NOHZ_STATS 0x10
+#define LBF_NOHZ_AGAIN 0x20
struct lb_env {
struct sched_domain *sd;
@@ -7406,6 +7624,17 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->avg.load_avg)
+ return true;
+
+ if (cfs_rq->avg.util_avg)
+ return true;
+
+ return false;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7430,6 +7659,7 @@ static void update_blocked_averages(int cpu)
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq, *pos;
struct rq_flags rf;
+ bool done = true;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -7459,7 +7689,17 @@ static void update_blocked_averages(int cpu)
*/
if (cfs_rq_is_decayed(cfs_rq))
list_del_leaf_cfs_rq(cfs_rq);
+
+ /* Don't need periodic decay once load/util_avg are null */
+ if (cfs_rq_has_blocked(cfs_rq))
+ done = false;
}
+
+#ifdef CONFIG_NO_HZ_COMMON
+ rq->last_blocked_load_update_tick = jiffies;
+ if (done)
+ rq->has_blocked_load = 0;
+#endif
rq_unlock_irqrestore(rq, &rf);
}
@@ -7519,6 +7759,11 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+#ifdef CONFIG_NO_HZ_COMMON
+ rq->last_blocked_load_update_tick = jiffies;
+ if (!cfs_rq_has_blocked(cfs_rq))
+ rq->has_blocked_load = 0;
+#endif
rq_unlock_irqrestore(rq, &rf);
}
@@ -7853,6 +8098,28 @@ group_type group_classify(struct sched_group *group,
return group_other;
}
+static bool update_nohz_stats(struct rq *rq, bool force)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ unsigned int cpu = rq->cpu;
+
+ if (!rq->has_blocked_load)
+ return false;
+
+ if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ return false;
+
+ if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
+ return true;
+
+ update_blocked_averages(cpu);
+
+ return rq->has_blocked_load;
+#else
+ return false;
+#endif
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -7875,6 +8142,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
+ env->flags |= LBF_NOHZ_AGAIN;
+
/* Bias balancing toward CPUs of our domain: */
if (local_group)
load = target_load(i, load_idx);
@@ -8030,6 +8300,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
+#ifdef CONFIG_NO_HZ_COMMON
+ if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
+ env->flags |= LBF_NOHZ_STATS;
+#endif
+
load_idx = get_sd_load_idx(env->sd, env->idle);
do {
@@ -8083,6 +8358,15 @@ next_group:
sg = sg->next;
} while (sg != env->sd->groups);
+#ifdef CONFIG_NO_HZ_COMMON
+ if ((env->flags & LBF_NOHZ_AGAIN) &&
+ cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
+
+ WRITE_ONCE(nohz.next_blocked,
+ jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
+ }
+#endif
+
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -8833,120 +9117,6 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
}
/*
- * idle_balance is called by schedule() if this_cpu is about to become
- * idle. Attempts to pull tasks from other CPUs.
- */
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
-{
- unsigned long next_balance = jiffies + HZ;
- int this_cpu = this_rq->cpu;
- struct sched_domain *sd;
- int pulled_task = 0;
- u64 curr_cost = 0;
-
- /*
- * We must set idle_stamp _before_ calling idle_balance(), such that we
- * measure the duration of idle_balance() as idle time.
- */
- this_rq->idle_stamp = rq_clock(this_rq);
-
- /*
- * Do not pull tasks towards !active CPUs...
- */
- if (!cpu_active(this_cpu))
- return 0;
-
- /*
- * This is OK, because current is on_cpu, which avoids it being picked
- * for load-balance and preemption/IRQs are still disabled avoiding
- * further scheduler activity on it and we're being very careful to
- * re-start the picking loop.
- */
- rq_unpin_lock(this_rq, rf);
-
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
- rcu_read_lock();
- sd = rcu_dereference_check_sched_domain(this_rq->sd);
- if (sd)
- update_next_balance(sd, &next_balance);
- rcu_read_unlock();
-
- goto out;
- }
-
- raw_spin_unlock(&this_rq->lock);
-
- update_blocked_averages(this_cpu);
- rcu_read_lock();
- for_each_domain(this_cpu, sd) {
- int continue_balancing = 1;
- u64 t0, domain_cost;
-
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
-
- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
- update_next_balance(sd, &next_balance);
- break;
- }
-
- if (sd->flags & SD_BALANCE_NEWIDLE) {
- t0 = sched_clock_cpu(this_cpu);
-
- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
-
- domain_cost = sched_clock_cpu(this_cpu) - t0;
- if (domain_cost > sd->max_newidle_lb_cost)
- sd->max_newidle_lb_cost = domain_cost;
-
- curr_cost += domain_cost;
- }
-
- update_next_balance(sd, &next_balance);
-
- /*
- * Stop searching for tasks to pull if there are
- * now runnable tasks on this rq.
- */
- if (pulled_task || this_rq->nr_running > 0)
- break;
- }
- rcu_read_unlock();
-
- raw_spin_lock(&this_rq->lock);
-
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-
- /*
- * While browsing the domains, we released the rq lock, a task could
- * have been enqueued in the meantime. Since we're not going idle,
- * pretend we pulled a task.
- */
- if (this_rq->cfs.h_nr_running && !pulled_task)
- pulled_task = 1;
-
-out:
- /* Move the next balance forward */
- if (time_after(this_rq->next_balance, next_balance))
- this_rq->next_balance = next_balance;
-
- /* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
- pulled_task = -1;
-
- if (pulled_task)
- this_rq->idle_stamp = 0;
-
- rq_repin_lock(this_rq, rf);
-
- return pulled_task;
-}
-
-/*
* active_load_balance_cpu_stop is run by the CPU stopper. It pushes
* running tasks off the busiest CPU onto idle CPUs. It requires at
* least 1 task to be running on each physical CPU where possible, and
@@ -9037,137 +9207,6 @@ out_unlock:
return 0;
}
-static inline int on_null_domain(struct rq *rq)
-{
- return unlikely(!rcu_dereference_sched(rq->sd));
-}
-
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * idle load balancing details
- * - When one of the busy CPUs notice that there may be an idle rebalancing
- * needed, they will kick the idle load balancer, which then does idle
- * load balancing for all the idle CPUs.
- */
-static struct {
- cpumask_var_t idle_cpus_mask;
- atomic_t nr_cpus;
- unsigned long next_balance; /* in jiffy units */
-} nohz ____cacheline_aligned;
-
-static inline int find_new_ilb(void)
-{
- int ilb = cpumask_first(nohz.idle_cpus_mask);
-
- if (ilb < nr_cpu_ids && idle_cpu(ilb))
- return ilb;
-
- return nr_cpu_ids;
-}
-
-/*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
- * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
- * CPU (if there is one).
- */
-static void nohz_balancer_kick(void)
-{
- int ilb_cpu;
-
- nohz.next_balance++;
-
- ilb_cpu = find_new_ilb();
-
- if (ilb_cpu >= nr_cpu_ids)
- return;
-
- if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
- return;
- /*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target CPU which
- * is idle. And the softirq performing nohz idle load balance
- * will be run before returning from the IPI.
- */
- smp_send_reschedule(ilb_cpu);
- return;
-}
-
-void nohz_balance_exit_idle(unsigned int cpu)
-{
- if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
- /*
- * Completely isolated CPUs don't ever set, so we must test.
- */
- if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
- }
- clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
- }
-}
-
-static inline void set_cpu_sd_state_busy(void)
-{
- struct sched_domain *sd;
- int cpu = smp_processor_id();
-
- rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
- if (!sd || !sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 0;
-
- atomic_inc(&sd->shared->nr_busy_cpus);
-unlock:
- rcu_read_unlock();
-}
-
-void set_cpu_sd_state_idle(void)
-{
- struct sched_domain *sd;
- int cpu = smp_processor_id();
-
- rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
-
- if (!sd || sd->nohz_idle)
- goto unlock;
- sd->nohz_idle = 1;
-
- atomic_dec(&sd->shared->nr_busy_cpus);
-unlock:
- rcu_read_unlock();
-}
-
-/*
- * This routine will record that the CPU is going idle with tick stopped.
- * This info will be used in performing idle load balancing in the future.
- */
-void nohz_balance_enter_idle(int cpu)
-{
- /* If this CPU is going down, then nothing needs to be done: */
- if (!cpu_active(cpu))
- return;
-
- /* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
- return;
-
- if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
- return;
-
- /* If we're a completely isolated CPU, we don't play: */
- if (on_null_domain(cpu_rq(cpu)))
- return;
-
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
- set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
-}
-#endif
-
static DEFINE_SPINLOCK(balancing);
/*
@@ -9197,8 +9236,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
int need_serialize, need_decay = 0;
u64 max_cost = 0;
- update_blocked_averages(cpu);
-
rcu_read_lock();
for_each_domain(cpu, sd) {
/*
@@ -9267,7 +9304,7 @@ out:
/*
* next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
+ * When the cpu is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance)) {
@@ -9288,111 +9325,104 @@ out:
}
}
+static inline int on_null_domain(struct rq *rq)
+{
+ return unlikely(!rcu_dereference_sched(rq->sd));
+}
+
#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the CPUs for whom scheduler ticks are stopped.
+ * idle load balancing details
+ * - When one of the busy CPUs notice that there may be an idle rebalancing
+ * needed, they will kick the idle load balancer, which then does idle
+ * load balancing for all the idle CPUs.
*/
-static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
-{
- int this_cpu = this_rq->cpu;
- struct rq *rq;
- int balance_cpu;
- /* Earliest time when we have to do rebalance again */
- unsigned long next_balance = jiffies + 60*HZ;
- int update_next_balance = 0;
- if (idle != CPU_IDLE ||
- !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
- goto end;
+static inline int find_new_ilb(void)
+{
+ int ilb = cpumask_first(nohz.idle_cpus_mask);
- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
- continue;
+ if (ilb < nr_cpu_ids && idle_cpu(ilb))
+ return ilb;
- /*
- * If this CPU gets work to do, stop the load balancing
- * work being done for other CPUs. Next load
- * balancing owner will pick it up.
- */
- if (need_resched())
- break;
+ return nr_cpu_ids;
+}
- rq = cpu_rq(balance_cpu);
+/*
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
+ * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
+ * CPU (if there is one).
+ */
+static void kick_ilb(unsigned int flags)
+{
+ int ilb_cpu;
- /*
- * If time for next balance is due,
- * do the balance.
- */
- if (time_after_eq(jiffies, rq->next_balance)) {
- struct rq_flags rf;
+ nohz.next_balance++;
- rq_lock_irq(rq, &rf);
- update_rq_clock(rq);
- cpu_load_update_idle(rq);
- rq_unlock_irq(rq, &rf);
+ ilb_cpu = find_new_ilb();
- rebalance_domains(rq, CPU_IDLE);
- }
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
- if (time_after(next_balance, rq->next_balance)) {
- next_balance = rq->next_balance;
- update_next_balance = 1;
- }
- }
+ flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
+ if (flags & NOHZ_KICK_MASK)
+ return;
/*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to null domain for ex, it will not be
- * updated.
+ * Use smp_send_reschedule() instead of resched_cpu().
+ * This way we generate a sched IPI on the target CPU which
+ * is idle. And the softirq performing nohz idle load balance
+ * will be run before returning from the IPI.
*/
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-end:
- clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
+ smp_send_reschedule(ilb_cpu);
}
/*
* Current heuristic for kicking the idle load balancer in the presence
- * of an idle CPU in the system.
+ * of an idle cpu in the system.
* - This rq has more than one task.
* - This rq has at least one CFS task and the capacity of the CPU is
* significantly reduced because of RT tasks or IRQs.
- * - At parent of LLC scheduler domain level, this CPU's scheduler group has
- * multiple busy CPUs.
- * - For SD_ASYM_PACKING, if the lower numbered CPU's in the scheduler
+ * - At parent of LLC scheduler domain level, this cpu's scheduler group has
+ * multiple busy cpu.
+ * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
-static inline bool nohz_kick_needed(struct rq *rq)
+static void nohz_balancer_kick(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain_shared *sds;
struct sched_domain *sd;
int nr_busy, i, cpu = rq->cpu;
- bool kick = false;
+ unsigned int flags = 0;
if (unlikely(rq->idle_balance))
- return false;
+ return;
- /*
- * We may be recently in ticked or tickless idle mode. At the first
- * busy tick after returning from idle, we will update the busy stats.
- */
- set_cpu_sd_state_busy();
- nohz_balance_exit_idle(cpu);
+ /*
+ * We may be recently in ticked or tickless idle mode. At the first
+ * busy tick after returning from idle, we will update the busy stats.
+ */
+ nohz_balance_exit_idle(rq);
/*
* None are in tickless mode and hence no need for NOHZ idle load
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
- return false;
+ return;
+
+ if (READ_ONCE(nohz.has_blocked) &&
+ time_after(now, READ_ONCE(nohz.next_blocked)))
+ flags = NOHZ_STATS_KICK;
if (time_before(now, nohz.next_balance))
- return false;
+ goto out;
- if (rq->nr_running >= 2)
- return true;
+ if (rq->nr_running >= 2) {
+ flags = NOHZ_KICK_MASK;
+ goto out;
+ }
rcu_read_lock();
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
@@ -9403,7 +9433,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
*/
nr_busy = atomic_read(&sds->nr_busy_cpus);
if (nr_busy > 1) {
- kick = true;
+ flags = NOHZ_KICK_MASK;
goto unlock;
}
@@ -9413,7 +9443,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
if (sd) {
if ((rq->cfs.h_nr_running >= 1) &&
check_cpu_capacity(rq, sd)) {
- kick = true;
+ flags = NOHZ_KICK_MASK;
goto unlock;
}
}
@@ -9426,18 +9456,421 @@ static inline bool nohz_kick_needed(struct rq *rq)
continue;
if (sched_asym_prefer(i, cpu)) {
- kick = true;
+ flags = NOHZ_KICK_MASK;
goto unlock;
}
}
}
unlock:
rcu_read_unlock();
- return kick;
+out:
+ if (flags)
+ kick_ilb(flags);
+}
+
+static void set_cpu_sd_state_busy(int cpu)
+{
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+ if (!sd || !sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 0;
+
+ atomic_inc(&sd->shared->nr_busy_cpus);
+unlock:
+ rcu_read_unlock();
+}
+
+void nohz_balance_exit_idle(struct rq *rq)
+{
+ SCHED_WARN_ON(rq != this_rq());
+
+ if (likely(!rq->nohz_tick_stopped))
+ return;
+
+ rq->nohz_tick_stopped = 0;
+ cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+
+ set_cpu_sd_state_busy(rq->cpu);
+}
+
+static void set_cpu_sd_state_idle(int cpu)
+{
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+
+ if (!sd || sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 1;
+
+ atomic_dec(&sd->shared->nr_busy_cpus);
+unlock:
+ rcu_read_unlock();
+}
+
+/*
+ * This routine will record that the CPU is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
+ */
+void nohz_balance_enter_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ SCHED_WARN_ON(cpu != smp_processor_id());
+
+ /* If this CPU is going down, then nothing needs to be done: */
+ if (!cpu_active(cpu))
+ return;
+
+ /* Spare idle load balancing on CPUs that don't want to be disturbed: */
+ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+ return;
+
+ /*
+ * Can be set safely without rq->lock held
+ * If a clear happens, it will have evaluated last additions because
+ * rq->lock is held during the check and the clear
+ */
+ rq->has_blocked_load = 1;
+
+ /*
+ * The tick is still stopped but load could have been added in the
+ * meantime. We set the nohz.has_blocked flag to trig a check of the
+ * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
+ * of nohz.has_blocked can only happen after checking the new load
+ */
+ if (rq->nohz_tick_stopped)
+ goto out;
+
+ /* If we're a completely isolated CPU, we don't play: */
+ if (on_null_domain(rq))
+ return;
+
+ rq->nohz_tick_stopped = 1;
+
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_inc(&nohz.nr_cpus);
+
+ /*
+ * Ensures that if nohz_idle_balance() fails to observe our
+ * @idle_cpus_mask store, it must observe the @has_blocked
+ * store.