summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-06-10 10:17:09 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2018-06-10 10:17:09 -0700
commitd82991a8688ad128b46db1b42d5d84396487a508 (patch)
treea0743d346c23df3dd057c89b83100c09a376224a
parentf4e5b30d809d3882c69f43b5c90779af033d40c4 (diff)
parentccba8b64452b8dbf2c9670de026d00f519bb5da0 (diff)
Merge branch 'core-rseq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull restartable sequence support from Thomas Gleixner: "The restartable sequences syscall (finally): After a lot of back and forth discussion and massive delays caused by the speculative distraction of maintainers, the core set of restartable sequences has finally reached a consensus. It comes with the basic non disputed core implementation along with support for arm, powerpc and x86 and a full set of selftests. It was exposed to linux-next earlier this week, so it does not fully comply with the merge window requirements, but there is really no point to drag it out for yet another cycle" * 'core-rseq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: rseq/selftests: Provide Makefile, scripts, gitignore rseq/selftests: Provide parametrized tests rseq/selftests: Provide basic percpu ops test rseq/selftests: Provide basic test rseq/selftests: Provide rseq library selftests/lib.mk: Introduce OVERRIDE_TARGETS powerpc: Wire up restartable sequences system call powerpc: Add syscall detection for restartable sequences powerpc: Add support for restartable sequences x86: Wire up restartable sequence system call x86: Add support for restartable sequences arm: Wire up restartable sequences system call arm: Add syscall detection for restartable sequences arm: Add restartable sequences support rseq: Introduce restartable sequences system call uapi/headers: Provide types_32_64.h
-rw-r--r--MAINTAINERS12
-rw-r--r--arch/Kconfig7
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/kernel/entry-common.S25
-rw-r--r--arch/arm/kernel/signal.c14
-rw-r--r--arch/arm/tools/syscall.tbl1
-rw-r--r--arch/powerpc/Kconfig1
-rw-r--r--arch/powerpc/include/asm/systbl.h1
-rw-r--r--arch/powerpc/include/asm/unistd.h2
-rw-r--r--arch/powerpc/include/uapi/asm/unistd.h1
-rw-r--r--arch/powerpc/kernel/entry_32.S7
-rw-r--r--arch/powerpc/kernel/entry_64.S8
-rw-r--r--arch/powerpc/kernel/signal.c3
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/entry/common.c3
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--fs/exec.c1
-rw-r--r--include/linux/sched.h134
-rw-r--r--include/linux/syscalls.h4
-rw-r--r--include/trace/events/rseq.h57
-rw-r--r--include/uapi/linux/rseq.h133
-rw-r--r--include/uapi/linux/types_32_64.h50
-rw-r--r--init/Kconfig23
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/rseq.c357
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--tools/testing/selftests/Makefile1
-rw-r--r--tools/testing/selftests/lib.mk4
-rw-r--r--tools/testing/selftests/rseq/.gitignore6
-rw-r--r--tools/testing/selftests/rseq/Makefile30
-rw-r--r--tools/testing/selftests/rseq/basic_percpu_ops_test.c312
-rw-r--r--tools/testing/selftests/rseq/basic_test.c56
-rw-r--r--tools/testing/selftests/rseq/param_test.c1260
-rw-r--r--tools/testing/selftests/rseq/rseq-arm.h715
-rw-r--r--tools/testing/selftests/rseq/rseq-ppc.h671
-rw-r--r--tools/testing/selftests/rseq/rseq-skip.h65
-rw-r--r--tools/testing/selftests/rseq/rseq-x86.h1132
-rw-r--r--tools/testing/selftests/rseq/rseq.c117
-rw-r--r--tools/testing/selftests/rseq/rseq.h147
-rw-r--r--tools/testing/selftests/rseq/run_param_test.sh121
44 files changed, 5491 insertions, 8 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 3aa1f81922ff..70cd162d6487 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12134,6 +12134,18 @@ F: include/dt-bindings/reset/
F: include/linux/reset.h
F: include/linux/reset-controller.h
+RESTARTABLE SEQUENCES SUPPORT
+M: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+M: Peter Zijlstra <peterz@infradead.org>
+M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+M: Boqun Feng <boqun.feng@gmail.com>
+L: linux-kernel@vger.kernel.org
+S: Supported
+F: kernel/rseq.c
+F: include/uapi/linux/rseq.h
+F: include/trace/events/rseq.h
+F: tools/testing/selftests/rseq/
+
RFKILL
M: Johannes Berg <johannes@sipsolutions.net>
L: linux-wireless@vger.kernel.org
diff --git a/arch/Kconfig b/arch/Kconfig
index 8a7f7e1f2ca7..86ae4c4edd6f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -272,6 +272,13 @@ config HAVE_REGS_AND_STACK_ACCESS_API
declared in asm/ptrace.h
For example the kprobes-based event tracer needs this API.
+config HAVE_RSEQ
+ bool
+ depends on HAVE_REGS_AND_STACK_ACCESS_API
+ help
+ This symbol should be selected by an architecture if it
+ supports an implementation of restartable sequences.
+
config HAVE_CLK
bool
help
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 534563ac7f5f..94d222545920 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -91,6 +91,7 @@ config ARM
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UID16
select HAVE_VIRT_CPU_ACCOUNTING_GEN
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index 20df608bf343..106a1466518d 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -39,12 +39,13 @@ saved_pc .req lr
.section .entry.text,"ax",%progbits
.align 5
-#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING))
+#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING) || \
+ IS_ENABLED(CONFIG_DEBUG_RSEQ))
/*
* This is the fast syscall return path. We do as little as possible here,
* such as avoiding writing r0 to the stack. We only use this path if we
- * have tracing and context tracking disabled - the overheads from those
- * features make this path too inefficient.
+ * have tracing, context tracking and rseq debug disabled - the overheads
+ * from those features make this path too inefficient.
*/
ret_fast_syscall:
UNWIND(.fnstart )
@@ -71,14 +72,20 @@ fast_work_pending:
/* fall through to work_pending */
#else
/*
- * The "replacement" ret_fast_syscall for when tracing or context tracking
- * is enabled. As we will need to call out to some C functions, we save
- * r0 first to avoid needing to save registers around each C function call.
+ * The "replacement" ret_fast_syscall for when tracing, context tracking,
+ * or rseq debug is enabled. As we will need to call out to some C functions,
+ * we save r0 first to avoid needing to save registers around each C function
+ * call.
*/
ret_fast_syscall:
UNWIND(.fnstart )
UNWIND(.cantunwind )
str r0, [sp, #S_R0 + S_OFF]! @ save returned r0
+#if IS_ENABLED(CONFIG_DEBUG_RSEQ)
+ /* do_rseq_syscall needs interrupts enabled. */
+ mov r0, sp @ 'regs'
+ bl do_rseq_syscall
+#endif
disable_irq_notrace @ disable interrupts
ldr r2, [tsk, #TI_ADDR_LIMIT]
cmp r2, #TASK_SIZE
@@ -113,6 +120,12 @@ ENDPROC(ret_fast_syscall)
*/
ENTRY(ret_to_user)
ret_slow_syscall:
+#if IS_ENABLED(CONFIG_DEBUG_RSEQ)
+ /* do_rseq_syscall needs interrupts enabled. */
+ enable_irq_notrace @ enable interrupts
+ mov r0, sp @ 'regs'
+ bl do_rseq_syscall
+#endif
disable_irq_notrace @ disable interrupts
ENTRY(ret_to_user_from_irq)
ldr r2, [tsk, #TI_ADDR_LIMIT]
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index bd8810d4acb3..f09e9d66d605 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -541,6 +541,12 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
int ret;
/*
+ * Increment event counter and perform fixup for the pre-signal
+ * frame.
+ */
+ rseq_signal_deliver(regs);
+
+ /*
* Set up the stack frame
*/
if (ksig->ka.sa.sa_flags & SA_SIGINFO)
@@ -660,6 +666,7 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
} else {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
+ rseq_handle_notify_resume(regs);
}
}
local_irq_disable();
@@ -703,3 +710,10 @@ asmlinkage void addr_limit_check_failed(void)
{
addr_limit_user_check();
}
+
+#ifdef CONFIG_DEBUG_RSEQ
+asmlinkage void do_rseq_syscall(struct pt_regs *regs)
+{
+ rseq_syscall(regs);
+}
+#endif
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 0bb0e9c6376c..fbc74b5fa3ed 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -412,3 +412,4 @@
395 common pkey_alloc sys_pkey_alloc
396 common pkey_free sys_pkey_free
397 common statx sys_statx
+398 common rseq sys_rseq
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8f959df2de7a..eaba5920234d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -220,6 +220,7 @@ config PPC
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_IRQ_TIME_ACCOUNTING
+ select HAVE_RSEQ
select IOMMU_HELPER if PPC64
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 79a3b47e4839..cfcf6a874cfa 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -392,3 +392,4 @@ SYSCALL(statx)
SYSCALL(pkey_alloc)
SYSCALL(pkey_free)
SYSCALL(pkey_mprotect)
+SYSCALL(rseq)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index daf1ba97a00c..1e9708632dce 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,7 +12,7 @@
#include <uapi/asm/unistd.h>
-#define NR_syscalls 387
+#define NR_syscalls 388
#define __NR__exit __NR_exit
diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h
index 389c36fd8299..ac5ba55066dd 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -398,5 +398,6 @@
#define __NR_pkey_alloc 384
#define __NR_pkey_free 385
#define __NR_pkey_mprotect 386
+#define __NR_rseq 387
#endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index eb8d01bae8c6..973577f2141c 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -365,6 +365,13 @@ syscall_dotrace_cont:
blrl /* Call handler */
.globl ret_from_syscall
ret_from_syscall:
+#ifdef CONFIG_DEBUG_RSEQ
+ /* Check whether the syscall is issued inside a restartable sequence */
+ stw r3,GPR3(r1)
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl rseq_syscall
+ lwz r3,GPR3(r1)
+#endif
mr r6,r3
CURRENT_THREAD_INFO(r12, r1)
/* disable interrupts so current_thread_info()->flags can't change */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index b10e01021214..729e9ef4d3bb 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -194,6 +194,14 @@ system_call: /* label this so stack traces look sane */
.Lsyscall_exit:
std r3,RESULT(r1)
+
+#ifdef CONFIG_DEBUG_RSEQ
+ /* Check whether the syscall is issued inside a restartable sequence */
+ addi r3,r1,STACK_FRAME_OVERHEAD
+ bl rseq_syscall
+ ld r3,RESULT(r1)
+#endif
+
CURRENT_THREAD_INFO(r12, r1)
ld r8,_MSR(r1)
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index fb932f1202c7..17fe4339ba59 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -134,6 +134,8 @@ static void do_signal(struct task_struct *tsk)
/* Re-enable the breakpoints for the signal stack */
thread_change_pc(tsk, tsk->thread.regs);
+ rseq_signal_deliver(tsk->thread.regs);
+
if (is32) {
if (ksig.ka.sa.sa_flags & SA_SIGINFO)
ret = handle_rt_signal32(&ksig, oldset, tsk);
@@ -168,6 +170,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
+ rseq_handle_notify_resume(regs);
}
user_enter();
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0527060b2710..297789aef9fa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -183,6 +183,7 @@ config X86
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
select HAVE_STACK_VALIDATION if X86_64
+ select HAVE_RSEQ
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index fbf6a6c3fd2d..92190879b228 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -164,6 +164,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
if (cached_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
+ rseq_handle_notify_resume(regs);
}
if (cached_flags & _TIF_USER_RETURN_NOTIFY)
@@ -254,6 +255,8 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
local_irq_enable();
+ rseq_syscall(regs);
+
/*
* First do one-time work. If these work items are enabled, we
* want to run them exactly once per syscall exit with IRQs on.
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 14a2f996e543..3cf7b533b3d1 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -397,3 +397,4 @@
383 i386 statx sys_statx __ia32_sys_statx
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
+386 i386 rseq sys_rseq __ia32_sys_rseq
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index cd36232ab62f..f0b1709a5ffb 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -342,6 +342,7 @@
331 common pkey_free __x64_sys_pkey_free
332 common statx __x64_sys_statx
333 common io_pgetevents __x64_sys_io_pgetevents
+334 common rseq __x64_sys_rseq
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index da270b95fe4d..445ca11ff863 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -688,6 +688,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
sigset_t *set = sigmask_to_save();
compat_sigset_t *cset = (compat_sigset_t *) set;
+ /*
+ * Increment event counter and perform fixup for the pre-signal
+ * frame.
+ */
+ rseq_signal_deliver(regs);
+
/* Set up the stack frame */
if (is_ia32_frame(ksig)) {
if (ksig->ka.sa.sa_flags & SA_SIGINFO)
diff --git a/fs/exec.c b/fs/exec.c
index 30a36c2a39bf..2d4e0075bd24 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1824,6 +1824,7 @@ static int __do_execve_file(int fd, struct filename *filename,
current->fs->in_exec = 0;
current->in_execve = 0;
membarrier_execve(current);
+ rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current);
free_bprm(bprm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 14e4f9c12337..3aa4fcb74e76 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -27,6 +27,7 @@
#include <linux/signal_types.h>
#include <linux/mm_types_task.h>
#include <linux/task_io_accounting.h>
+#include <linux/rseq.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -1047,6 +1048,17 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_RSEQ
+ struct rseq __user *rseq;
+ u32 rseq_len;
+ u32 rseq_sig;
+ /*
+ * RmW on rseq_event_mask must be performed atomically
+ * with respect to preemption.
+ */
+ unsigned long rseq_event_mask;
+#endif
+
struct tlbflush_unmap_batch tlb_ubc;
struct rcu_head rcu;
@@ -1757,4 +1769,126 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
#define TASK_SIZE_OF(tsk) TASK_SIZE
#endif
+#ifdef CONFIG_RSEQ
+
+/*
+ * Map the event mask on the user-space ABI enum rseq_cs_flags
+ * for direct mask checks.
+ */
+enum rseq_event_mask_bits {
+ RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
+ RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
+ RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
+};
+
+enum rseq_event_mask {
+ RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
+ RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
+ RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
+};
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+ if (t->rseq)
+ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
+}
+
+void __rseq_handle_notify_resume(struct pt_regs *regs);
+
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+{
+ if (current->rseq)
+ __rseq_handle_notify_resume(regs);
+}
+
+static inline void rseq_signal_deliver(struct pt_regs *regs)
+{
+ preempt_disable();
+ __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
+ preempt_enable();
+ rseq_handle_notify_resume(regs);
+}
+
+/* rseq_preempt() requires preemption to be disabled. */
+static inline void rseq_preempt(struct task_struct *t)
+{
+ __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
+ rseq_set_notify_resume(t);
+}
+
+/* rseq_migrate() requires preemption to be disabled. */
+static inline void rseq_migrate(struct task_struct *t)
+{
+ __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
+ rseq_set_notify_resume(t);
+}
+
+/*
+ * If parent process has a registered restartable sequences area, the
+ * child inherits it, unless the child shares the parent's address space
+ * (CLONE_VM): then rseq is unregistered. If the parent forks in the middle
+ * of a restartable sequence, the resume notifier forces the child to retry.
+ */
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+ if (clone_flags & CLONE_VM) {
+ t->rseq = NULL;
+ t->rseq_len = 0;
+ t->rseq_sig = 0;
+ t->rseq_event_mask = 0;
+ } else {
+ t->rseq = current->rseq;
+ t->rseq_len = current->rseq_len;
+ t->rseq_sig = current->rseq_sig;
+ t->rseq_event_mask = current->rseq_event_mask;
+ rseq_preempt(t);
+ }
+}
+
+static inline void rseq_execve(struct task_struct *t)
+{
+ t->rseq = NULL;
+ t->rseq_len = 0;
+ t->rseq_sig = 0;
+ t->rseq_event_mask = 0;
+}
+
+#else
+
+static inline void rseq_set_notify_resume(struct task_struct *t)
+{
+}
+static inline void rseq_handle_notify_resume(struct pt_regs *regs)
+{
+}
+static inline void rseq_signal_deliver(struct pt_regs *regs)
+{
+}
+static inline void rseq_preempt(struct task_struct *t)
+{
+}
+static inline void rseq_migrate(struct task_struct *t)
+{
+}
+static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+{
+}
+static inline void rseq_execve(struct task_struct *t)
+{
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_RSEQ
+
+void rseq_syscall(struct pt_regs *regs);
+
+#else
+
+static inline void rseq_syscall(struct pt_regs *regs)
+{
+}
+
+#endif
+
#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 390e814fdc8d..73810808cdf2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -66,6 +66,7 @@ struct old_linux_dirent;
struct perf_event_attr;
struct file_handle;
struct sigaltstack;
+struct rseq;
union bpf_attr;
#include <linux/types.h>
@@ -897,7 +898,8 @@ asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
asmlinkage long sys_pkey_free(int pkey);
asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
-
+asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
+ int flags, uint32_t sig);
/*
* Architecture-specific system calls
diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h
new file mode 100644
index 000000000000..a04a64bc1a00
--- /dev/null
+++ b/include/trace/events/rseq.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rseq
+
+#if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RSEQ_H
+
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+
+TRACE_EVENT(rseq_update,
+
+ TP_PROTO(struct task_struct *t),
+
+ TP_ARGS(t),
+
+ TP_STRUCT__entry(
+ __field(s32, cpu_id)
+ ),
+
+ TP_fast_assign(
+ __entry->cpu_id = raw_smp_processor_id();
+ ),
+
+ TP_printk("cpu_id=%d", __entry->cpu_id)
+);
+
+TRACE_EVENT(rseq_ip_fixup,
+
+ TP_PROTO(unsigned long regs_ip, unsigned long start_ip,
+ unsigned long post_commit_offset, unsigned long abort_ip),
+
+ TP_ARGS(regs_ip, start_ip, post_commit_offset, abort_ip),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, regs_ip)
+ __field(unsigned long, start_ip)
+ __field(unsigned long, post_commit_offset)
+ __field(unsigned long, abort_ip)
+ ),
+
+ TP_fast_assign(
+ __entry->regs_ip = regs_ip;
+ __entry->start_ip = start_ip;
+ __entry->post_commit_offset = post_commit_offset;
+ __entry->abort_ip = abort_ip;
+ ),
+
+ TP_printk("regs_ip=0x%lx start_ip=0x%lx post_commit_offset=%lu abort_ip=0x%lx",
+ __entry->regs_ip, __entry->start_ip,
+ __entry->post_commit_offset, __entry->abort_ip)
+);
+
+#endif /* _TRACE_RSEQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
new file mode 100644
index 000000000000..d620fa43756c
--- /dev/null
+++ b/include/uapi/linux/rseq.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RSEQ_H
+#define _UAPI_LINUX_RSEQ_H
+
+/*
+ * linux/rseq.h
+ *
+ * Restartable sequences system call API
+ *
+ * Copyright (c) 2015-2018 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include <stdint.h>
+#endif
+
+#include <linux/types_32_64.h>
+
+enum rseq_cpu_id_state {
+ RSEQ_CPU_ID_UNINITIALIZED = -1,
+ RSEQ_CPU_ID_REGISTRATION_FAILED = -2,
+};
+
+enum rseq_flags {
+ RSEQ_FLAG_UNREGISTER = (1 << 0),
+};
+
+enum rseq_cs_flags_bit {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+};
+
+enum rseq_cs_flags {
+ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT =
+ (1U << RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT),
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL =
+ (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
+ (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+};
+
+/*
+ * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line. It is usually declared as
+ * link-time constant data.
+ */
+struct rseq_cs {
+ /* Version of this structure. */
+ __u32 version;
+ /* enum rseq_cs_flags */
+ __u32 flags;
+ LINUX_FIELD_u32_u64(start_ip);
+ /* Offset from start_ip. */
+ LINUX_FIELD_u32_u64(post_commit_offset);
+ LINUX_FIELD_u32_u64(abort_ip);
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+/*
+ * struct rseq is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line.
+ *
+ * A single struct rseq per thread is allowed.
+ */
+struct rseq {
+ /*
+ * Restartable sequences cpu_id_start field. Updated by the
+ * kernel, and read by user-space with single-copy atomicity
+ * semantics. Aligned on 32-bit. Always contains a value in the
+ * range of possible CPUs, although the value may not be the
+ * actual current CPU (e.g. if rseq is not initialized). This
+ * CPU number value should always be compared against the value
+ * of the cpu_id field before performing a rseq commit or
+ * returning a value read from a data structure indexed using
+ * the cpu_id_start value.
+ */
+ __u32 cpu_id_start;
+ /*
+ * Restartable sequences cpu_id field. Updated by the kernel,
+ * and read by user-space with single-copy atomicity semantics.
+ * Aligned on 32-bit. Values RSEQ_CPU_ID_UNINITIALIZED and
+ * RSEQ_CPU_ID_REGISTRATION_FAILED have a special semantic: the
+ * former means "rseq uninitialized", and latter means "rseq
+ * initialization failed". This value is meant to be read within
+ * rseq critical sections and compared with the cpu_id_start
+ * value previously read, before performing the commit instruction,
+ * or read and compared with the cpu_id_start value before returning
+ * a value loaded from a data structure indexed using the
+ * cpu_id_start value.
+ */
+ __u32 cpu_id;
+ /*
+ * Restartable sequences rseq_cs field.
+ *
+ * Contains NULL when no critical section is active for the current
+ * thread, or holds a pointer to the currently active struct rseq_cs.
+ *
+ * Updated by user-space, which sets the address of the currently
+ * active rseq_cs at the beginning of assembly instruction sequence
+ * block, and set to NULL by the kernel when it restarts an assembly
+ * instruction sequence block, as well as when the kernel