author	Linus Torvalds <torvalds@linux-foundation.org>	2018-02-06 19:57:31 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-02-06 19:57:31 -0800
commit	ab2d92ad881da11331280aedf612d82e61cb6d41 (patch)
tree	ea1bc914b9dfc510b7ea7ec21219f0f831c442cc
parent	4b0dda4f86c5c87698f7228a4f65ef834dc79252 (diff)
parent	82845079160817cc6ac64e5321bbd935e0a47b3a (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - membarrier updates (Mathieu Desnoyers)
 - SMP balancing optimizations (Mel Gorman)
 - stats update optimizations (Peter Zijlstra)
 - RT scheduler race fixes (Steven Rostedt)
 - misc fixes and updates

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Use a recently used CPU as an idle candidate and the basis for SIS
  sched/fair: Do not migrate if the prev_cpu is idle
  sched/fair: Restructure wake_affine*() to return a CPU id
  sched/fair: Remove unnecessary parameters from wake_affine_idle()
  sched/rt: Make update_curr_rt() more accurate
  sched/rt: Up the root domain ref count when passing it around via IPIs
  sched/rt: Use container_of() to get root domain in rto_push_irq_work_func()
  sched/core: Optimize update_stats_*()
  sched/core: Optimize ttwu_stat()
  membarrier/selftest: Test private expedited sync core command
  membarrier/arm64: Provide core serializing command
  membarrier/x86: Provide core serializing command
  membarrier: Provide core serializing command, *_SYNC_CORE
  lockin/x86: Implement sync_core_before_usermode()
  locking: Introduce sync_core_before_usermode()
  membarrier/selftest: Test global expedited command
  membarrier: Provide GLOBAL_EXPEDITED command
  membarrier: Document scheduler barrier requirements
  powerpc, membarrier: Skip memory barrier in switch_mm()
  membarrier/selftest: Test private expedited command
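The membarrier commits above add new commands to the membarrier(2) system call. Userspace discovers which commands a kernel supports through MEMBARRIER_CMD_QUERY, which returns a bitmask of the enum values documented in include/uapi/linux/membarrier.h in the diff below. A minimal probe sketch, not part of the merged patches; it goes through the raw syscall(2) interface, and the command names assume the updated uapi header from this series:

    #include <linux/membarrier.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdio.h>

    /* Thin wrapper: the membarrier system call takes a command and flags. */
    static int membarrier(int cmd, int flags)
    {
            return syscall(__NR_membarrier, cmd, flags);
    }

    int main(void)
    {
            int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

            if (mask < 0) {
                    perror("membarrier");
                    return 1;
            }
            if (mask & MEMBARRIER_CMD_GLOBAL_EXPEDITED)
                    printf("GLOBAL_EXPEDITED supported\n");
            if (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE)
                    printf("PRIVATE_EXPEDITED_SYNC_CORE supported\n");
            return 0;
    }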
-rw-r--r--	MAINTAINERS	1
-rw-r--r--	arch/arm64/Kconfig	1
-rw-r--r--	arch/arm64/kernel/entry.S	4
-rw-r--r--	arch/powerpc/Kconfig	1
-rw-r--r--	arch/powerpc/include/asm/membarrier.h	27
-rw-r--r--	arch/powerpc/mm/mmu_context.c	7
-rw-r--r--	arch/x86/Kconfig	2
-rw-r--r--	arch/x86/entry/entry_32.S	5
-rw-r--r--	arch/x86/entry/entry_64.S	4
-rw-r--r--	arch/x86/include/asm/sync_core.h	28
-rw-r--r--	arch/x86/mm/tlb.c	6
-rw-r--r--	include/linux/sched.h	8
-rw-r--r--	include/linux/sched/mm.h	35
-rw-r--r--	include/linux/sync_core.h	21
-rw-r--r--	include/uapi/linux/membarrier.h	74
-rw-r--r--	init/Kconfig	9
-rw-r--r--	kernel/fork.c	5
-rw-r--r--	kernel/sched/core.c	74
-rw-r--r--	kernel/sched/fair.c	101
-rw-r--r--	kernel/sched/membarrier.c	177
-rw-r--r--	kernel/sched/rt.c	29
-rw-r--r--	kernel/sched/sched.h	2
-rw-r--r--	kernel/sched/stats.h	6
-rw-r--r--	kernel/sched/topology.c	13
-rw-r--r--	tools/testing/selftests/membarrier/membarrier_test.c	237
25 files changed, 750 insertions(+), 127 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index b0760a8f1369..d1212226cbba 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9025,6 +9025,7 @@ L: linux-kernel@vger.kernel.org
S: Supported
F: kernel/sched/membarrier.c
F: include/uapi/linux/membarrier.h
+F: arch/powerpc/include/asm/membarrier.h
MEMORY MANAGEMENT
L: linux-mm@kvack.org
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 53612879fe56..7381eeb7ef8e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -16,6 +16,7 @@ config ARM64
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_HAS_KCOV
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index b34e717d7597..cccd2788e631 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -324,6 +324,10 @@ alternative_else_nop_endif
ldp x28, x29, [sp, #16 * 14]
ldr lr, [sp, #S_LR]
add sp, sp, #S_FRAME_SIZE // restore sp
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on eret context synchronization
+ * when returning from IPI handler, and when returning to user-space.
+ */
.if \el == 0
alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 9d3329811cc1..73ce5dd07642 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,6 +141,7 @@ config PPC
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_PHYS_TO_DMA
select ARCH_HAS_PMEM_API if PPC64
+ select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !RELOCATABLE && !HIBERNATION)
diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h
new file mode 100644
index 000000000000..6e20bb5c74ea
--- /dev/null
+++ b/arch/powerpc/include/asm/membarrier.h
@@ -0,0 +1,27 @@
+#ifndef _ASM_POWERPC_MEMBARRIER_H
+#define _ASM_POWERPC_MEMBARRIER_H
+
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ /*
+ * Only need the full barrier when switching between processes.
+ * Barrier when switching from kernel to userspace is not
+ * required here, given that it is implied by mmdrop(). Barrier
+ * when switching from userspace to kernel is not needed after
+ * store to rq->curr.
+ */
+ if (likely(!(atomic_read(&next->membarrier_state) &
+ (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
+ return;
+
+ /*
+ * The membarrier system call requires a full memory barrier
+ * after storing to rq->curr, before going back to user-space.
+ */
+ smp_mb();
+}
+
+#endif /* _ASM_POWERPC_MEMBARRIER_H */
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index d60a62bf4fc7..0ab297c4cfad 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -12,6 +12,7 @@
#include <linux/mm.h>
#include <linux/cpu.h>
+#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
@@ -58,6 +59,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*
* On the read side the barrier is in pte_xchg(), which orders
* the store to the PTE vs the load of mm_cpumask.
+ *
+ * This full barrier is needed by membarrier when switching
+ * between processes after store to rq->curr, before user-space
+ * memory accesses.
*/
smp_mb();
@@ -80,6 +85,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (new_on_cpu)
radix_kvm_prefetch_workaround(next);
+ else
+ membarrier_arch_switch_mm(prev, next, tsk);
/*
* The actual HW switching method differs between the various
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b0771ceabb4b..cefa6dbe80ae 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -55,6 +55,7 @@ config X86
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_PHYS_TO_DMA
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_REFCOUNT
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
@@ -62,6 +63,7 @@ config X86
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
+ select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if X86_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 2a35b1e0fb90..abee6d2b9311 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -566,6 +566,11 @@ restore_all:
.Lrestore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
.Lirq_return:
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
+ * when returning from IPI handler and when returning from
+ * scheduler to user-space.
+ */
INTERRUPT_RETURN
.section .fixup, "ax"
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index c752abe89d80..4a9bef6aca34 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -691,6 +691,10 @@ GLOBAL(restore_regs_and_return_to_kernel)
POP_EXTRA_REGS
POP_C_REGS
addq $8, %rsp /* skip regs->orig_ax */
+ /*
+ * ARCH_HAS_MEMBARRIER_SYNC_CORE relies on IRET core serialization
+ * when returning from IPI handler.
+ */
INTERRUPT_RETURN
ENTRY(native_iret)
diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h
new file mode 100644
index 000000000000..c67caafd3381
--- /dev/null
+++ b/arch/x86/include/asm/sync_core.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SYNC_CORE_H
+#define _ASM_X86_SYNC_CORE_H
+
+#include <linux/preempt.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+
+/*
+ * Ensure that a core serializing instruction is issued before returning
+ * to user-mode. x86 implements return to user-space through sysexit,
+ * sysretl, and sysretq, which are not core serializing.
+ */
+static inline void sync_core_before_usermode(void)
+{
+ /* With PTI, we unconditionally serialize before running user code. */
+ if (static_cpu_has(X86_FEATURE_PTI))
+ return;
+ /*
+ * Return from interrupt and NMI is done through iret, which is core
+ * serializing.
+ */
+ if (in_irq() || in_nmi())
+ return;
+ sync_core();
+}
+
+#endif /* _ASM_X86_SYNC_CORE_H */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 012d02624848..8dcc0607f805 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -229,6 +229,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
#endif
this_cpu_write(cpu_tlbstate.is_lazy, false);
+ /*
+ * The membarrier system call requires a full memory barrier and
+ * core serialization before returning to user-space, after
+ * storing to rq->curr. Writing to CR3 provides that full
+ * memory barrier and core serializing instruction.
+ */
if (real_prev == next) {
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 166144c04ef6..92744e3f1556 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -555,6 +555,14 @@ struct task_struct {
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;
+ /*
+ * recent_used_cpu is initially set as the last CPU used by a task
+ * that wakes affine another task. Waker/wakee relationships can
+ * push tasks around a CPU where each wakeup moves to the next one.
+ * Tracking a recently used CPU allows a quick search for a recently
+ * used CPU that may be idle.
+ */
+ int recent_used_cpu;
int wake_cpu;
#endif
int on_rq;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index bd422561a75e..1149533aa2fa 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
+#include <linux/sync_core.h>
/*
* Routines for handling mm_structs
@@ -194,18 +195,48 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
#ifdef CONFIG_MEMBARRIER
enum {
- MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
- MEMBARRIER_STATE_SWITCH_MM = (1U << 1),
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0),
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1),
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2),
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3),
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4),
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5),
};
+enum {
+ MEMBARRIER_FLAG_SYNC_CORE = (1U << 0),
+};
+
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
+#include <asm/membarrier.h>
+#endif
+
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+ if (likely(!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE)))
+ return;
+ sync_core_before_usermode();
+}
+
static inline void membarrier_execve(struct task_struct *t)
{
atomic_set(&t->mm->membarrier_state, 0);
}
#else
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
+{
+}
+#endif
static inline void membarrier_execve(struct task_struct *t)
{
}
+static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
+{
+}
#endif
#endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h
new file mode 100644
index 000000000000..013da4b8b327
--- /dev/null
+++ b/include/linux/sync_core.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SYNC_CORE_H
+#define _LINUX_SYNC_CORE_H
+
+#ifdef CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
+#include <asm/sync_core.h>
+#else
+/*
+ * This is a dummy sync_core_before_usermode() implementation that can be used
+ * on all architectures which return to user-space through core serializing
+ * instructions.
+ * If your architecture returns to user-space through non-core-serializing
+ * instructions, you need to write your own functions.
+ */
+static inline void sync_core_before_usermode(void)
+{
+}
+#endif
+
+#endif /* _LINUX_SYNC_CORE_H */
+
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 4e01ad7ffe98..5891d7614c8c 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -31,7 +31,7 @@
* enum membarrier_cmd - membarrier system call command
* @MEMBARRIER_CMD_QUERY: Query the set of supported commands. It returns
* a bitmask of valid commands.
- * @MEMBARRIER_CMD_SHARED: Execute a memory barrier on all running threads.
+ * @MEMBARRIER_CMD_GLOBAL: Execute a memory barrier on all running threads.
* Upon return from system call, the caller thread
* is ensured that all running threads have passed
* through a state where all memory accesses to
@@ -40,6 +40,28 @@
* (non-running threads are de facto in such a
* state). This covers threads from all processes
* running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+ * Execute a memory barrier on all running threads
+ * of all processes which previously registered
+ * with MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
+ * Upon return from system call, the caller thread
+ * is ensured that all running threads have passed
+ * through a state where all memory accesses to
+ * user-space addresses match program order between
+ * entry to and return from the system call
+ * (non-running threads are de facto in such a
+ * state). This only covers threads from processes
+ * which registered with
+ * MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED.
+ * This command returns 0. Given that
+ * registration is about the intent to receive
+ * the barriers, it is valid to invoke
+ * MEMBARRIER_CMD_GLOBAL_EXPEDITED from a
+ * non-registered process.
+ * @MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+ * Register the process intent to receive
+ * MEMBARRIER_CMD_GLOBAL_EXPEDITED memory
+ * barriers. Always returns 0.
* @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
* Execute a memory barrier on each running
* thread belonging to the same process as the current
@@ -51,7 +73,7 @@
* to and return from the system call
* (non-running threads are de facto in such a
* state). This only covers threads from the
- * same processes as the caller thread. This
+ * same process as the caller thread. This
* command returns 0 on success. The
* "expedited" commands complete faster than
* the non-expedited ones, they never block,
@@ -64,18 +86,54 @@
* Register the process intent to use
* MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
* returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+ * In addition to providing the memory ordering
+ * guarantees described in
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED, ensure that,
+ * upon return from the system call, all running
+ * thread siblings of the caller thread have
+ * executed a core serializing
+ * instruction. (Architectures are required to
+ * guarantee that non-running threads issue
+ * core serializing instructions before they
+ * resume user-space execution). This only
+ * covers threads from the same process as the
+ * caller thread. This command returns 0 on
+ * success. The "expedited" commands complete
+ * faster than the non-expedited ones, they
+ * never block, but have the downside of
+ * causing extra overhead. If this command is
+ * not implemented by an architecture, -EINVAL
+ * is returned. A process needs to register its
+ * intent to use the private expedited sync
+ * core command prior to using it, otherwise
+ * this command returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+ * Register the process intent to use
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE.
+ * If this command is not implemented by an
+ * architecture, -EINVAL is returned.
+ * Returns 0 on success.
+ * @MEMBARRIER_CMD_SHARED:
+ * Alias to MEMBARRIER_CMD_GLOBAL. Provided for
+ * header backward compatibility.
*
* Command to be passed to the membarrier system call. The commands need to
* be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
* the value 0.
*/
enum membarrier_cmd {
- MEMBARRIER_CMD_QUERY = 0,
- MEMBARRIER_CMD_SHARED = (1 << 0),
- /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
- /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
- MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
- MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
+ MEMBARRIER_CMD_QUERY = 0,
+ MEMBARRIER_CMD_GLOBAL = (1 << 0),
+ MEMBARRIER_CMD_GLOBAL_EXPEDITED = (1 << 1),
+ MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = (1 << 2),
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3),
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = (1 << 4),
+ MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 5),
+ MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = (1 << 6),
+
+ /* Alias for header backward compatibility. */
+ MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL,
};
#endif /* _UAPI_LINUX_MEMBARRIER_H */
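The kerneldoc above spells out the registration contract: the private expedited commands fail with -EPERM unless the process has registered its intent, and the SYNC_CORE variants return -EINVAL on architectures that do not select ARCH_HAS_MEMBARRIER_SYNC_CORE. A hedged userspace sketch of that flow, not part of the merged patches; the cross-modifying-code scenario is only illustrative of the intended use:

    #include <linux/membarrier.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <errno.h>
    #include <stdio.h>

    static int membarrier(int cmd, int flags)
    {
            return syscall(__NR_membarrier, cmd, flags);
    }

    int main(void)
    {
            /* Register intent first; issuing the command without it returns -EPERM. */
            if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0)) {
                    if (errno == EINVAL)
                            fprintf(stderr, "sync-core membarrier not implemented on this arch\n");
                    else
                            perror("membarrier register");
                    return 1;
            }

            /* ... modify code that sibling threads may be executing ... */

            /*
             * On return, all running sibling threads have executed a core
             * serializing instruction, and non-running ones will do so
             * before they resume user-space execution.
             */
            if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0)) {
                    perror("membarrier");
                    return 1;
            }
            return 0;
    }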
diff --git a/init/Kconfig b/init/Kconfig
index a9a2e2c86671..e37f4b2a6445 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1412,6 +1412,12 @@ config USERFAULTFD
Enable the userfaultfd() system call that allows to intercept and
handle page faults in userland.
+config ARCH_HAS_MEMBARRIER_CALLBACKS
+ bool
+
+config ARCH_HAS_MEMBARRIER_SYNC_CORE
+ bool
+
config EMBEDDED
bool "Embedded system"
option allnoconfig_y
@@ -1915,3 +1921,6 @@ config ASN1
functions to call on what tags.
source "kernel/Kconfig.locks"
+
+config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
+ bool
diff --git a/kernel/fork.c b/kernel/fork.c
index 5c372c954f3b..c7c112391d79 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -606,6 +606,11 @@ static void __mmdrop(struct mm_struct *mm)
void mmdrop(struct mm_struct *mm)
{
+ /*
+ * The full memory barrier implied by atomic_dec_and_test() is
+ * required by the membarrier system call before returning to
+ * user-space, after storing to rq->curr.
+ */
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
__mmdrop(mm);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3da7a2444a91..36f113ac6353 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1630,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
- schedstat_inc(rq->ttwu_local);
- schedstat_inc(p->se.statistics.nr_wakeups_local);
+ __schedstat_inc(rq->ttwu_local);
+ __schedstat_inc(p->se.statistics.nr_wakeups_local);
} else {
struct sched_domain *sd;
- schedstat_inc(p->se.statistics.nr_wakeups_remote);
+ __schedstat_inc(p->se.statistics.nr_wakeups_remote);
rcu_read_lock();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd->ttwu_wake_remote);
+ __schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
@@ -1647,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
if (wake_flags & WF_MIGRATED)
- schedstat_inc(p->se.statistics.nr_wakeups_migrate);
+ __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
- schedstat_inc(rq->ttwu_count);
- schedstat_inc(p->se.statistics.nr_wakeups);
+ __schedstat_inc(rq->ttwu_count);
+ __schedstat_inc(p->se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
- schedstat_inc(p->se.statistics.nr_wakeups_sync);
+ __schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
@@ -2461,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p)
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
+ p->recent_used_cpu = task_cpu(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
@@ -2698,23 +2699,27 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
- /*
- * The membarrier system call requires a full memory barrier
- * after storing to rq->curr, before going back to user-space.
- *
- * TODO: This smp_mb__after_unlock_lock can go away if PPC end
- * up adding a full barrier to switch_mm(), or we should figure
- * out if a smp_mb__after_unlock_lock is really the proper API
- * to use.
- */
- smp_mb__after_unlock_lock();
finish_task(prev);
finish_lock_switch(rq);
finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
- if (mm)
+ /*
+ * When switching through a kernel thread, the loop in
+ * membarrier_{private,global}_expedited() may have observed that
+ * kernel thread and not issued an IPI. It is therefore possible to
+ * schedule between user->kernel->user threads without passing through
+ * switch_mm(). Membarrier requires a barrier after storing to
+ * rq->curr, before returning to userspace, so provide them here:
+ *
+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
+ * provided by mmdrop(),
+ * - a sync_core for SYNC_CORE.
+ */
+ if (mm) {
+ membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
+ }
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -2818,6 +2823,13 @@ context_switch(struct rq *rq, struct task_struct *prev,
*/
arch_start_context_switch(prev);
+ /*
+ * If mm is non-NULL, we pass through switch_mm(). If mm is
+ * NULL, we will pass through mmdrop() in finish_task_switch().
+ * Both of these contain the full memory barrier required by
+ * membarrier after storing to rq->curr, before returning to
+ * user-space.
+ */
if (!mm) {
next->active_mm = oldmm;
mmgrab(oldmm);
@@ -3354,6 +3366,9 @@ static void __sched notrace __schedule(bool preempt)
* Make sure that signal_pending_state()->signal_pending() below
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
* done by the caller to avoid the race with signal_wake_up().
+ *
+ * The membarrier system call requires a full memory barrier
+ * after coming from user-space, before storing to rq->curr.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@@ -3401,17 +3416,16 @@ static void __sched notrace __schedule(bool preempt)
/*
* The membarrier system call requires each architecture
* to have a full memory barrier after updating
- * rq->curr, before returning to user-space. For TSO
- * (e.g. x86), the architecture must provide its own
- * barrier in switch_mm(). For weakly ordered machines
- * for which spin_unlock() acts as a full memory
- * barrier, finish_lock_switch() in common code takes
- * care of this barrier. For weakly ordered machines for
- * which spin_unlock() acts as a RELEASE barrier (only
- * arm64 and PowerPC), arm64 has a full barrier in
- * switch_to(), and PowerPC has
- * smp_mb__after_unlock_lock() before
- * finish_lock_switch().
+ * rq->curr, before returning to user-space.
+ *
+ * Here are the schemes providing that barrier on the
+ * various architectures:
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
+ * switch_mm() relies on membarrier_arch_switch_mm() on PowerPC.
+ * - finish_lock_switch() for weakly-ordered
+ * architectures where spin_unlock is a full barrier,
+ * - switch_to() for arm64 (weakly-ordered, spin_unlock
+ * is a RELEASE barrier),
*/
++*switch_count;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7b6535987500..5eb3ffc9be84 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
likely(wait_start > prev_wait_start))
wait_start -= prev_wait_start;
- schedstat_set(se->statistics.wait_start, wait_start);
+ __schedstat_set(se->statistics.wait_start, wait_start);
}
static inline void
@@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
* time stamp can be adjusted to accumulate wait time
* prior to migration.
*/
- schedstat_set(se->statistics.wait_start, delta);
+ __schedstat_set(se->statistics.wait_start, delta);
return;
}
trace_sched_stat_wait(p, delta);
}
- schedstat_set(se->statistics.wait_max,
+ __schedstat_set(se->statistics.wait_max,
max(schedstat_val(se->statistics.wait_max), delta));
- schedstat_inc(se->statistics.wait_count);
- schedstat_add(se->statistics.wait_sum, delta);
- schedstat_set(se->statistics.wait_start, 0);
+ __schedstat_inc(se->statistics.wait_count);
+ __schedstat_add(se->statistics.wait_sum, delta);
+ __schedstat_set(se->statistics.wait_start, 0);
}
static inline void
@@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
- schedstat_set(se->statistics.sleep_max, delta);
+ __schedstat_set(se->statistics.sleep_max, delta);
- schedstat_set(se->statistics.sleep_start, 0);
- schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ __schedstat_set(se->statistics.sleep_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
account_scheduler_latency(tsk, delta >> 10, 1);
@@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
delta = 0;
if (unlikely(delta > schedstat_val(se->statistics.block_max)))
- schedstat_set(se->statistics.block_max, delta);
+ __schedstat_set(se->statistics.block_max, delta);
- schedstat_set(se->statistics.block_start, 0);
- schedstat_add(se->statistics.sum_sleep_runtime, delta);
+ __schedstat_set(se->statistics.block_start, 0);
+ __schedstat_add(se->statistics.sum_sleep_runtime, delta);
if (tsk) {
if (tsk->in_iowait) {
- schedstat_add(se->statistics.iowait_sum, delta);
- schedstat_inc(se->statistics.iowait_count);
+ __schedstat_add(se->statistics.iowait_sum, delta);
+ __schedstat_inc(se->statistics.iowait_count);
trace_sched_stat_iowait(tsk, delta);
}
@@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
- schedstat_set(se->statistics.sleep_start,
+ __schedstat_set(se->statistics.sleep_start,
rq_clock(rq_of(cfs_rq)));
if (tsk->state & TASK_UNINTERRUPTIBLE)
- schedstat_set(se->statistics.block_start,
+ __schedstat_set(se->statistics.block_start,
rq_clock(rq_of(cfs_rq)));
}
}
@@ -5692,27 +5692,31 @@ static int wake_wide(struct task_struct *p)
* scheduling latency of the CPUs. This seems to work
* for the overloaded case.
*/
-
-static bool
-wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int prev_cpu, int sync)
+static int
+wake_affine_idle(int this_cpu, int prev_cpu, int sync)
{
/*
* If this_cpu is idle, it implies the wakeup is from interrupt
* context. Only allow the move if cache is shared. Otherwise an
* interrupt intensive workload could force all tasks onto one
* node depending on the IO topology or IRQ affinity settings.
+ *
+ * If the prev_cpu is idle and cache affine then avoid a migration.
+ * There is no guarantee that the cache hot data from an interrupt
+ * is more important than cache hot data on the prev_cpu and from
+ * a cpufreq perspective, it's better to have higher utilisation
+ * on one CPU.
*/
if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
- return true;
+ return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;