From 488603b815a7514c7009e6fc339d74ed4a30f343 Mon Sep 17 00:00:00 2001
From: Scott Wood <swood@redhat.com>
Date: Sat, 11 Jan 2020 04:53:38 -0500
Subject: sched/core: Don't skip remote tick for idle CPUs

This will be used in the next patch to get a loadavg update from
nohz cpus.  The delta check is skipped because idle_sched_class
doesn't update se.exec_start.

Signed-off-by: Scott Wood <swood@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/1578736419-14628-2-git-send-email-swood@redhat.com
---
 kernel/sched/core.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fc1dfc007604..cf8b33dc4513 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3669,22 +3669,24 @@ static void sched_tick_remote(struct work_struct *work)
 	 * statistics and checks timeslices in a time-independent way, regardless
 	 * of when exactly it is running.
 	 */
-	if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+	if (!tick_nohz_tick_stopped_cpu(cpu))
 		goto out_requeue;
 
 	rq_lock_irq(rq, &rf);
 	curr = rq->curr;
-	if (is_idle_task(curr) || cpu_is_offline(cpu))
+	if (cpu_is_offline(cpu))
 		goto out_unlock;
 
 	update_rq_clock(rq);
-	delta = rq_clock_task(rq) - curr->se.exec_start;
 
-	/*
-	 * Make sure the next tick runs within a reasonable
-	 * amount of time.
-	 */
-	WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+	if (!is_idle_task(curr)) {
+		/*
+		 * Make sure the next tick runs within a reasonable
+		 * amount of time.
+		 */
+		delta = rq_clock_task(rq) - curr->se.exec_start;
+		WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+	}
 	curr->sched_class->task_tick(rq, curr, 0);
 
 out_unlock:
-- 
cgit v1.2.3


From ebc0f83c78a2d26384401ecf2d2fa48063c0ee27 Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Sat, 11 Jan 2020 04:53:39 -0500
Subject: timers/nohz: Update NOHZ load in remote tick

The way loadavg is tracked during nohz only pays attention to the load
upon entering nohz.  This can be particularly noticeable if full nohz is
entered while non-idle, and then the cpu goes idle and stays that way for
a long time.

Use the remote tick to ensure that full nohz cpus report their deltas
within a reasonable time.

[ swood: Added changelog and removed recheck of stopped tick. ]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Scott Wood <swood@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/1578736419-14628-3-git-send-email-swood@redhat.com
---
 kernel/sched/core.c    |  4 +++-
 kernel/sched/loadavg.c | 33 +++++++++++++++++++++++----------
 2 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cf8b33dc4513..4ff03c27779e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3677,6 +3677,7 @@ static void sched_tick_remote(struct work_struct *work)
 	if (cpu_is_offline(cpu))
 		goto out_unlock;
 
+	curr = rq->curr;
 	update_rq_clock(rq);
 
 	if (!is_idle_task(curr)) {
@@ -3689,10 +3690,11 @@ static void sched_tick_remote(struct work_struct *work)
 	}
 	curr->sched_class->task_tick(rq, curr, 0);
 
+	calc_load_nohz_remote(rq);
 out_unlock:
 	rq_unlock_irq(rq, &rf);
-
 out_requeue:
+
 	/*
 	 * Run the remote tick once per second (1Hz). This arbitrary
 	 * frequency is large enough to avoid overload but short enough
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 28a516575c18..de22da666ac7 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void)
 	return calc_load_idx & 1;
 }
 
-void calc_load_nohz_start(void)
+static void calc_load_nohz_fold(struct rq *rq)
 {
-	struct rq *this_rq = this_rq();
 	long delta;
 
-	/*
-	 * We're going into NO_HZ mode, if there's any pending delta, fold it
-	 * into the pending NO_HZ delta.
-	 */
-	delta = calc_load_fold_active(this_rq, 0);
+	delta = calc_load_fold_active(rq, 0);
 	if (delta) {
 		int idx = calc_load_write_idx();
 
@@ -248,6 +243,24 @@ void calc_load_nohz_start(void)
 	}
 }
 
+void calc_load_nohz_start(void)
+{
+	/*
+	 * We're going into NO_HZ mode, if there's any pending delta, fold it
+	 * into the pending NO_HZ delta.
+	 */
+	calc_load_nohz_fold(this_rq());
+}
+
+/*
+ * Keep track of the load for NOHZ_FULL, must be called between
+ * calc_load_nohz_{start,stop}().
+ */
+void calc_load_nohz_remote(struct rq *rq)
+{
+	calc_load_nohz_fold(rq);
+}
+
 void calc_load_nohz_stop(void)
 {
 	struct rq *this_rq = this_rq();
@@ -268,7 +281,7 @@ void calc_load_nohz_stop(void)
 		this_rq->calc_load_update += LOAD_FREQ;
 }
 
-static long calc_load_nohz_fold(void)
+static long calc_load_nohz_read(void)
 {
 	int idx = calc_load_read_idx();
 	long delta = 0;
@@ -323,7 +336,7 @@ static void calc_global_nohz(void)
 }
 #else /* !CONFIG_NO_HZ_COMMON */
 
-static inline long calc_load_nohz_fold(void) { return 0; }
+static inline long calc_load_nohz_read(void) { return 0; }
 static inline void calc_global_nohz(void) { }
 
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks)
 	/*
 	 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
 	 */
-	delta = calc_load_nohz_fold();
+	delta = calc_load_nohz_read();
 	if (delta)
 		atomic_long_add(delta, &calc_load_tasks);
 
-- 
cgit v1.2.3


From b396f52326de20ec974471b7b19168867b365cbf Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 14 Jan 2020 10:13:20 +0000
Subject: sched/fair: Allow a small load imbalance between low utilisation
 SD_NUMA domains

The CPU load balancer balances between different domains to spread load
and strives to have equal balance everywhere. Communicating tasks can
migrate so they are topologically close to each other but these decisions
are independent. On a lightly loaded NUMA machine, two communicating tasks
pulled together at wakeup time can be pushed apart by the load balancer.
In isolation, the load balancer decision is fine but it ignores the tasks
data locality and the wakeup/LB paths continually conflict. NUMA balancing
is also a factor but it also simply conflicts with the load balancer.

This patch allows a fixed degree of imbalance of two tasks to exist
between NUMA domains regardless of utilisation levels. In many cases,
this prevents communicating tasks being pulled apart. It was evaluated
whether the imbalance should be scaled to the domain size. However, no
additional benefit was measured across a range of workloads and machines
and scaling adds the risk that lower domains have to be rebalanced. While
this could change again in the future, such a change should specify the
use case and benefit.

The most obvious impact is on netperf TCP_STREAM -- two simple
communicating tasks with some softirq offload depending on the
transmission rate.

 2-socket Haswell machine 48 core, HT enabled
 netperf-tcp -- mmtests config config-network-netperf-unbound
			      baseline              lbnuma-v3
 Hmean     64         568.73 (   0.00%)      577.56 *   1.55%*
 Hmean     128       1089.98 (   0.00%)     1128.06 *   3.49%*
 Hmean     256       2061.72 (   0.00%)     2104.39 *   2.07%*
 Hmean     1024      7254.27 (   0.00%)     7557.52 *   4.18%*
 Hmean     2048     11729.20 (   0.00%)    13350.67 *  13.82%*
 Hmean     3312     15309.08 (   0.00%)    18058.95 *  17.96%*
 Hmean     4096     17338.75 (   0.00%)    20483.66 *  18.14%*
 Hmean     8192     25047.12 (   0.00%)    27806.84 *  11.02%*
 Hmean     16384    27359.55 (   0.00%)    33071.88 *  20.88%*
 Stddev    64           2.16 (   0.00%)        2.02 (   6.53%)
 Stddev    128          2.31 (   0.00%)        2.19 (   5.05%)
 Stddev    256         11.88 (   0.00%)        3.22 (  72.88%)
 Stddev    1024        23.68 (   0.00%)        7.24 (  69.43%)
 Stddev    2048        79.46 (   0.00%)       71.49 (  10.03%)
 Stddev    3312        26.71 (   0.00%)       57.80 (-116.41%)
 Stddev    4096       185.57 (   0.00%)       96.15 (  48.19%)
 Stddev    8192       245.80 (   0.00%)      100.73 (  59.02%)
 Stddev    16384      207.31 (   0.00%)      141.65 (  31.67%)

In this case, there was a sizable improvement to performance and
a general reduction in variance. However, this is not univeral.
For most machines, the impact was roughly a 3% performance gain.

 Ops NUMA base-page range updates       19796.00         292.00
 Ops NUMA PTE updates                   19796.00         292.00
 Ops NUMA PMD updates                       0.00           0.00
 Ops NUMA hint faults                   16113.00         143.00
 Ops NUMA hint local faults %            8407.00         142.00
 Ops NUMA hint local percent               52.18          99.30
 Ops NUMA pages migrated                 4244.00           1.00

Without the patch, only 52.18% of sampled accesses are local.  In an
earlier changelog, 100% of sampled accesses are local and indeed on
most machines, this was still the case. In this specific case, the
local sampled rates was 99.3% but note the "base-page range updates"
and "PTE updates".  The activity with the patch is negligible as were
the number of faults. The small number of pages migrated were related to
shared libraries.  A 2-socket Broadwell showed better results on average
but are not presented for brevity as the performance was similar except
it showed 100% of the sampled NUMA hints were local. The patch holds up
for a 4-socket Haswell, an AMD EPYC and AMD Epyc 2 machine.

For dbench, the impact depends on the filesystem used and the number of
clients. On XFS, there is little difference as the clients typically
communicate with workqueues which have a separate class of scheduler
problem at the moment. For ext4, performance is generally better,
particularly for small numbers of clients as NUMA balancing activity is
negligible with the patch applied.

A more interesting example is the Facebook schbench which uses a
number of messaging threads to communicate with worker threads. In this
configuration, one messaging thread is used per NUMA node and the number of
worker threads is varied. The 50, 75, 90, 95, 99, 99.5 and 99.9 percentiles
for response latency is then reported.

 Lat 50.00th-qrtle-1        44.00 (   0.00%)       37.00 (  15.91%)
 Lat 75.00th-qrtle-1        53.00 (   0.00%)       41.00 (  22.64%)
 Lat 90.00th-qrtle-1        57.00 (   0.00%)       42.00 (  26.32%)
 Lat 95.00th-qrtle-1        63.00 (   0.00%)       43.00 (  31.75%)
 Lat 99.00th-qrtle-1        76.00 (   0.00%)       51.00 (  32.89%)
 Lat 99.50th-qrtle-1        89.00 (   0.00%)       52.00 (  41.57%)
 Lat 99.90th-qrtle-1        98.00 (   0.00%)       55.00 (  43.88%)
 Lat 50.00th-qrtle-2        42.00 (   0.00%)       42.00 (   0.00%)
 Lat 75.00th-qrtle-2        48.00 (   0.00%)       47.00 (   2.08%)
 Lat 90.00th-qrtle-2        53.00 (   0.00%)       52.00 (   1.89%)
 Lat 95.00th-qrtle-2        55.00 (   0.00%)       53.00 (   3.64%)
 Lat 99.00th-qrtle-2        62.00 (   0.00%)       60.00 (   3.23%)
 Lat 99.50th-qrtle-2        63.00 (   0.00%)       63.00 (   0.00%)
 Lat 99.90th-qrtle-2        68.00 (   0.00%)       66.00 (   2.94%

For higher worker threads, the differences become negligible but it's
interesting to note the difference in wakeup latency at low utilisation
and mpstat confirms that activity was almost all on one node until
the number of worker threads increase.

Hackbench generally showed neutral results across a range of machines.
This is different to earlier versions of the patch which allowed imbalances
for higher degrees of utilisation. perf bench pipe showed negligible
differences in overall performance as the differences are very close to
the noise.

An earlier prototype of the patch showed major regressions for NAS C-class
when running with only half of the available CPUs -- 20-30% performance
hits were measured at the time. With this version of the patch, the impact
is negligible with small gains/losses within the noise measured. This is
because the number of threads far exceeds the small imbalance the aptch
cares about. Similarly, there were report of regressions for the autonuma
benchmark against earlier versions but again, normal load balancing now
applies for that workload.

In general, the patch simply seeks to avoid unnecessary cross-node
migrations in the basic case where imbalances are very small.  For low
utilisation communicating workloads, this patch generally behaves better
with less NUMA balancing activity. For high utilisation, there is no
change in behaviour.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Valentin Schneider <valentin.schneider@arm.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Phil Auld <pauld@redhat.com>
Tested-by: Phil Auld <pauld@redhat.com>
Link: https://lkml.kernel.org/r/20200114101319.GO3466@techsingularity.net
---
 kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe4e0d775375..25dffc03f0f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8658,10 +8658,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	/*
 	 * Try to use spare capacity of local group without overloading it or
 	 * emptying busiest.
-	 * XXX Spreading tasks across NUMA nodes is not always the best policy
-	 * and special care should be taken for SD_NUMA domain level before
-	 * spreading the tasks. For now, load_balance() fully relies on
-	 * NUMA_BALANCING and fbq_classify_group/rq to override the decision.
 	 */
 	if (local->group_type == group_has_spare) {
 		if (busiest->group_type > group_fully_busy) {
@@ -8701,16 +8697,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 			env->migration_type = migrate_task;
 			lsub_positive(&nr_diff, local->sum_nr_running);
 			env->imbalance = nr_diff >> 1;
-			return;
-		}
+		} else {
 
-		/*
-		 * If there is no overload, we just want to even the number of
-		 * idle cpus.
-		 */
-		env->migration_type = migrate_task;
-		env->imbalance = max_t(long, 0, (local->idle_cpus -
+			/*
+			 * If there is no overload, we just want to even the number of
+			 * idle cpus.
+			 */
+			env->migration_type = migrate_task;
+			env->imbalance = max_t(long, 0, (local->idle_cpus -
 						 busiest->idle_cpus) >> 1);
+		}
+
+		/* Consider allowing a small imbalance between NUMA groups */
+		if (env->sd->flags & SD_NUMA) {
+			unsigned int imbalance_min;
+
+			/*
+			 * Compute an allowed imbalance based on a simple
+			 * pair of communicating tasks that should remain
+			 * local and ignore them.
+			 *
+			 * NOTE: Generally this would have been based on
+			 * the domain size and this was evaluated. However,
+			 * the benefit is similar across a range of workloads
+			 * and machines but scaling by the domain size adds
+			 * the risk that lower domains have to be rebalanced.
+			 */
+			imbalance_min = 2;
+			if (busiest->sum_nr_running <= imbalance_min)
+				env->imbalance = 0;
+		}
+
 		return;
 	}
 
-- 
cgit v1.2.3


From b562d140649966d4daedd0483a8fe59ad3bb465a Mon Sep 17 00:00:00 2001
From: Qais Yousef <qais.yousef@arm.com>
Date: Tue, 14 Jan 2020 21:09:47 +0000
Subject: sched/uclamp: Reject negative values in cpu_uclamp_write()

The check to ensure that the new written value into cpu.uclamp.{min,max}
is within range, [0:100], wasn't working because of the signed
comparison

 7301                 if (req.percent > UCLAMP_PERCENT_SCALE) {
 7302                         req.ret = -ERANGE;
 7303                         return req;
 7304                 }

	# echo -1 > cpu.uclamp.min
	# cat cpu.uclamp.min
	42949671.96

Cast req.percent into u64 to force the comparison to be unsigned and
work as intended in capacity_from_percent().

	# echo -1 > cpu.uclamp.min
	sh: write error: Numerical result out of range

Fixes: 2480c093130f ("sched/uclamp: Extend CPU's cgroup controller")
Signed-off-by: Qais Yousef <qais.yousef@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200114210947.14083-1-qais.yousef@arm.com
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4ff03c27779e..55b9a9c53b91 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7264,7 +7264,7 @@ capacity_from_percent(char *buf)
 					     &req.percent);
 		if (req.ret)
 			return req;
-		if (req.percent > UCLAMP_PERCENT_SCALE) {
+		if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
 			req.ret = -ERANGE;
 			return req;
 		}
-- 
cgit v1.2.3


From e938b9c94164e4d981039f1cf6007d7453883e5a Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpengli@tencent.com>
Date: Mon, 13 Jan 2020 08:50:27 +0800
Subject: sched/nohz: Optimize get_nohz_timer_target()

On a machine, CPU 0 is used for housekeeping, the other 39 CPUs in the
same socket are in nohz_full mode. We can observe huge time burn in the
loop for seaching nearest busy housekeeper cpu by ftrace.

  2)               |                        get_nohz_timer_target() {
  2)   0.240 us    |                          housekeeping_test_cpu();
  2)   0.458 us    |                          housekeeping_test_cpu();

  ...

  2)   0.292 us    |                          housekeeping_test_cpu();
  2)   0.240 us    |                          housekeeping_test_cpu();
  2)   0.227 us    |                          housekeeping_any_cpu();
  2) + 43.460 us   |                        }

This patch optimizes the searching logic by finding a nearest housekeeper
CPU in the housekeeping cpumask, it can minimize the worst searching time
from ~44us to < 10us in my testing. In addition, the last iterated busy
housekeeper can become a random candidate while current CPU is a better
fallback if it is a housekeeper.

Signed-off-by: Wanpeng Li <wanpengli@tencent.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lkml.kernel.org/r/1578876627-11938-1-git-send-email-wanpengli@tencent.com
---
 kernel/sched/core.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 55b9a9c53b91..a8a5d5b6f5cf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -552,27 +552,32 @@ void resched_cpu(int cpu)
  */
 int get_nohz_timer_target(void)
 {
-	int i, cpu = smp_processor_id();
+	int i, cpu = smp_processor_id(), default_cpu = -1;
 	struct sched_domain *sd;
 
-	if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
-		return cpu;
+	if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+		if (!idle_cpu(cpu))
+			return cpu;
+		default_cpu = cpu;
+	}
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
-		for_each_cpu(i, sched_domain_span(sd)) {
+		for_each_cpu_and(i, sched_domain_span(sd),
+			housekeeping_cpumask(HK_FLAG_TIMER)) {
 			if (cpu == i)
 				continue;
 
-			if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
+			if (!idle_cpu(i)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
 
-	if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
-		cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+	if (default_cpu == -1)
+		default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+	cpu = default_cpu;
 unlock:
 	rcu_read_unlock();
 	return cpu;
-- 
cgit v1.2.3


From 2a4b03ffc69f2dedc6388e9a6438b5f4c133a40d Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 14 Jan 2020 15:13:56 +0100
Subject: sched/fair: Prevent unlimited runtime on throttled group

When a running task is moved on a throttled task group and there is no
other task enqueued on the CPU, the task can keep running using 100% CPU
whatever the allocated bandwidth for the group and although its cfs rq is
throttled. Furthermore, the group entity of the cfs_rq and its parents are
not enqueued but only set as curr on their respective cfs_rqs.

We have the following sequence:

sched_move_task
  -dequeue_task: dequeue task and group_entities.
  -put_prev_task: put task and group entities.
  -sched_change_group: move task to new group.
  -enqueue_task: enqueue only task but not group entities because cfs_rq is
    throttled.
  -set_next_task : set task and group_entities as current sched_entity of
    their cfs_rq.

Another impact is that the root cfs_rq runnable_load_avg at root rq stays
null because the group_entities are not enqueued. This situation will stay
the same until an "external" event triggers a reschedule. Let trigger it
immediately instead.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Ben Segall <bsegall@google.com>
Link: https://lkml.kernel.org/r/1579011236-31256-1-git-send-email-vincent.guittot@linaro.org
---
 kernel/sched/core.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a8a5d5b6f5cf..89e54f3ed571 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7072,8 +7072,15 @@ void sched_move_task(struct task_struct *tsk)
 
 	if (queued)
 		enqueue_task(rq, tsk, queue_flags);
-	if (running)
+	if (running) {
 		set_next_task(rq, tsk);
+		/*
+		 * After changing group, the running task may have joined a
+		 * throttled one but it's still the running task. Trigger a
+		 * resched to make sure that task can still run.
+		 */
+		resched_curr(rq);
+	}
 
 	task_rq_unlock(rq, tsk, &rf);
 }
-- 
cgit v1.2.3


From 52262ee567ad14c9606be25f3caddcefa3c514e4 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Tue, 28 Jan 2020 15:40:06 +0000
Subject: sched/fair: Allow a per-CPU kthread waking a task to stack on the
 same CPU, to fix XFS performance regression

The following XFS commit:

  8ab39f11d974 ("xfs: prevent CIL push holdoff in log recovery")

changed the logic from using bound workqueues to using unbound
workqueues. Functionally this makes sense but it was observed at the
time that the dbench performance dropped quite a lot and CPU migrations
were increased.

The current pattern of the task migration is straight-forward. With XFS,
an IO issuer delegates work to xlog_cil_push_work ()on an unbound kworker.
This runs on a nearby CPU and on completion, dbench wakes up on its old CPU
as it is still idle and no migration occurs. dbench then queues the real
IO on the blk_mq_requeue_work() work item which runs on a bound kworker
which is forced to run on the same CPU as dbench. When IO completes,
the bound kworker wakes dbench but as the kworker is a bound but,
real task, the CPU is not considered idle and dbench gets migrated by
select_idle_sibling() to a new CPU. dbench may ping-pong between two CPUs
for a while but ultimately it starts a round-robin of all CPUs sharing
the same LLC. High-frequency migration on each IO completion has poor
performance overall. It has negative implications both in commication
costs and power management. mpstat confirmed that at low thread counts
that all CPUs sharing an LLC has low level of activity.

Note that even if the CIL patch was reverted, there still would
be migrations but the impact is less noticeable. It turns out that
individually the scheduler, XFS, blk-mq and workqueues all made sensible
decisions but in combination, the overall effect was sub-optimal.

This patch special cases the IO issue/completion pattern and allows
a bound kworker waker and a task wakee to stack on the same CPU if
there is a strong chance they are directly related. The expectation
is that the kworker is likely going back to sleep shortly. This is not
guaranteed as the IO could be queued asynchronously but there is a very
strong relationship between the task and kworker in this case that would
justify stacking on the same CPU instead of migrating. There should be
few concerns about kworker starvation given that the special casing is
only when the kworker is the waker.

DBench on XFS
MMTests config: io-dbench4-async modified to run on a fresh XFS filesystem

UMA machine with 8 cores sharing LLC
                          5.5.0-rc7              5.5.0-rc7
                  tipsched-20200124           kworkerstack
Amean     1        22.63 (   0.00%)       20.54 *   9.23%*
Amean     2        25.56 (   0.00%)       23.40 *   8.44%*
Amean     4        28.63 (   0.00%)       27.85 *   2.70%*
Amean     8        37.66 (   0.00%)       37.68 (  -0.05%)
Amean     64      469.47 (   0.00%)      468.26 (   0.26%)
Stddev    1         1.00 (   0.00%)        0.72 (  28.12%)
Stddev    2         1.62 (   0.00%)        1.97 ( -21.54%)
Stddev    4         2.53 (   0.00%)        3.58 ( -41.19%)
Stddev    8         5.30 (   0.00%)        5.20 (   1.92%)
Stddev    64       86.36 (   0.00%)       94.53 (  -9.46%)

NUMA machine, 48 CPUs total, 24 CPUs share cache
                           5.5.0-rc7              5.5.0-rc7
                   tipsched-20200124      kworkerstack-v1r2
Amean     1         58.69 (   0.00%)       30.21 *  48.53%*
Amean     2         60.90 (   0.00%)       35.29 *  42.05%*
Amean     4         66.77 (   0.00%)       46.55 *  30.28%*
Amean     8         81.41 (   0.00%)       68.46 *  15.91%*
Amean     16       113.29 (   0.00%)      107.79 *   4.85%*
Amean     32       199.10 (   0.00%)      198.22 *   0.44%*
Amean     64       478.99 (   0.00%)      477.06 *   0.40%*
Amean     128     1345.26 (   0.00%)     1372.64 *  -2.04%*
Stddev    1          2.64 (   0.00%)        4.17 ( -58.08%)
Stddev    2          4.35 (   0.00%)        5.38 ( -23.73%)
Stddev    4          6.77 (   0.00%)        6.56 (   3.00%)
Stddev    8         11.61 (   0.00%)       10.91 (   6.04%)
Stddev    16        18.63 (   0.00%)       19.19 (  -3.01%)
Stddev    32        38.71 (   0.00%)       38.30 (   1.06%)
Stddev    64       100.28 (   0.00%)       91.24 (   9.02%)
Stddev    128      186.87 (   0.00%)      160.34 (  14.20%)

Dbench has been modified to report the time to complete a single "load
file". This is a more meaningful metric for dbench that a throughput
metric as the benchmark makes many different system calls that are not
throughput-related

Patch shows a 9.23% and 48.53% reduction in the time to process a load
file with the difference partially explained by the number of CPUs sharing
a LLC. In a separate run, task migrations were almost eliminated by the
patch for low client counts. In case people have issue with the metric
used for the benchmark, this is a comparison of the throughputs as
reported by dbench on the NUMA machine.

dbench4 Throughput (misleading but traditional)
                           5.5.0-rc7              5.5.0-rc7
                   tipsched-20200124      kworkerstack-v1r2
Hmean     1        321.41 (   0.00%)      617.82 *  92.22%*
Hmean     2        622.87 (   0.00%)     1066.80 *  71.27%*
Hmean     4       1134.56 (   0.00%)     1623.74 *  43.12%*
Hmean     8       1869.96 (   0.00%)     2212.67 *  18.33%*
Hmean     16      2673.11 (   0.00%)     2806.13 *   4.98%*
Hmean     32      3032.74 (   0.00%)     3039.54 (   0.22%)
Hmean     64      2514.25 (   0.00%)     2498.96 *  -0.61%*
Hmean     128     1778.49 (   0.00%)     1746.05 *  -1.82%*

Note that this is somewhat specific to XFS and ext4 shows no performance
difference as it does not rely on kworkers in the same way. No major
problem was observed running other workloads on different machines although
not all tests have completed yet.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200128154006.GD3466@techsingularity.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 11 -----------
 kernel/sched/fair.c  | 14 ++++++++++++++
 kernel/sched/sched.h | 13 +++++++++++++
 3 files changed, 27 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 89e54f3ed571..1a9983da4408 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1447,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
-static inline bool is_per_cpu_kthread(struct task_struct *p)
-{
-	if (!(p->flags & PF_KTHREAD))
-		return false;
-
-	if (p->nr_cpus_allowed != 1)
-		return false;
-
-	return true;
-}
-
 /*
  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
  * __set_cpus_allowed_ptr() and select_fallback_rq().
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25dffc03f0f6..94c3b8469cf6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5912,6 +5912,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
 		return prev;
 
+	/*
+	 * Allow a per-cpu kthread to stack with the wakee if the
+	 * kworker thread and the tasks previous CPUs are the same.
+	 * The assumption is that the wakee queued work for the
+	 * per-cpu kthread that is now complete and the wakeup is
+	 * essentially a sync wakeup. An obvious example of this
+	 * pattern is IO completions.
+	 */
+	if (is_per_cpu_kthread(current) &&
+	    prev == smp_processor_id() &&
+	    this_rq()->nr_running <= 1) {
+		return prev;
+	}
+
 	/* Check a recently used CPU as a potential idle candidate: */
 	recent_used_cpu = p->recent_used_cpu;
 	if (recent_used_cpu != prev &&
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1a88dc8ad11b..5876e6ba5903 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2479,3 +2479,16 @@ static inline void membarrier_switch_mm(struct rq *rq,
 {
 }
 #endif
+
+#ifdef CONFIG_SMP
+static inline bool is_per_cpu_kthread(struct task_struct *p)
+{
+	if (!(p->flags & PF_KTHREAD))
+		return false;
+
+	if (p->nr_cpus_allowed != 1)
+		return false;
+
+	return true;
+}
+#endif
-- 
cgit v1.2.3


From 6fcca0fa48118e6d63733eb4644c6cd880c15b8f Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 3 Feb 2020 13:22:16 -0800
Subject: sched/psi: Fix OOB write when writing 0 bytes to PSI files

Issuing write() with count parameter set to 0 on any file under
/proc/pressure/ will cause an OOB write because of the access to
buf[buf_size-1] when NUL-termination is performed. Fix this by checking
for buf_size to be non-zero.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lkml.kernel.org/r/20200203212216.7076-1-surenb@google.com
---
 kernel/sched/psi.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index db7b50bba3f1..38ccd49b9bf6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 	if (static_branch_likely(&psi_disabled))
 		return -EOPNOTSUPP;
 
+	if (!nbytes)
+		return -EINVAL;
+
 	buf_size = min(nbytes, sizeof(buf));
 	if (copy_from_user(buf, user_buf, buf_size))
 		return -EFAULT;
-- 
cgit v1.2.3


From 4104a562e0ca62e971089db9d3c47794a0d7d4eb Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Date: Sat, 1 Feb 2020 18:28:03 +0530
Subject: sched/core: Annotate curr pointer in rq with __rcu

This patch fixes the following sparse warnings in sched/core.c
and sched/membarrier.c:

  kernel/sched/core.c:2372:27: error: incompatible types in comparison expression
  kernel/sched/core.c:4061:17: error: incompatible types in comparison expression
  kernel/sched/core.c:6067:9: error: incompatible types in comparison expression
  kernel/sched/membarrier.c:108:21: error: incompatible types in comparison expression
  kernel/sched/membarrier.c:177:21: error: incompatible types in comparison expression
  kernel/sched/membarrier.c:243:21: error: incompatible types in comparison expression

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200201125803.20245-1-madhuparnabhowmik10@gmail.com
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5876e6ba5903..9ea647835fd6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -896,7 +896,7 @@ struct rq {
 	 */
 	unsigned long		nr_uninterruptible;
 
-	struct task_struct	*curr;
+	struct task_struct __rcu	*curr;
 	struct task_struct	*idle;
 	struct task_struct	*stop;
 	unsigned long		next_balance;
-- 
cgit v1.2.3


From e9f5490c3574b435ce7fe7a71724aa3866babc7f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 9 Feb 2020 19:29:12 -0800
Subject: sched/fair: Fix kernel-doc warning in attach_entity_load_avg()

Fix kernel-doc warning in kernel/sched/fair.c, caused by a recent
function parameter removal:

  ../kernel/sched/fair.c:3526: warning: Excess function parameter 'flags' description in 'attach_entity_load_avg'

Fixes: a4f9a0e51bbf ("sched/fair: Remove redundant call to cpufreq_update_util()")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lkml.kernel.org/r/cbe964e4-6879-fd08-41c9-ef1917414af4@infradead.org
---
 kernel/sched/fair.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 94c3b8469cf6..3c8a379c357e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
  * @cfs_rq: cfs_rq to attach to
  * @se: sched_entity to attach
- * @flags: migration hints
  *
  * Must call update_cfs_rq_load_avg() before this, since we rely on
  * cfs_rq->avg.last_update_time being current.
-- 
cgit v1.2.3