summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/arm/kernel/ptrace.c2
-rw-r--r--arch/arm64/kernel/ptrace.c2
-rw-r--r--arch/parisc/kernel/ptrace.c2
-rw-r--r--arch/riscv/kernel/ptrace.c2
-rw-r--r--arch/s390/kernel/ptrace.c2
-rw-r--r--arch/um/kernel/skas/syscall.c2
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c2
-rw-r--r--include/linux/seccomp.h6
-rw-r--r--include/uapi/linux/seccomp.h29
-rw-r--r--kernel/seccomp.c28
-rw-r--r--tools/testing/selftests/seccomp/seccomp_bpf.c110
11 files changed, 170 insertions, 17 deletions
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 324352787aea..b606cded90cd 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -923,7 +923,7 @@ asmlinkage int syscall_trace_enter(struct pt_regs *regs, int scno)
/* Do seccomp after ptrace; syscall may have changed. */
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
- if (secure_computing(NULL) == -1)
+ if (secure_computing() == -1)
return -1;
#else
/* XXX: remove this once OABI gets fixed */
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 21176d02e21a..6771c399d40c 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -1816,7 +1816,7 @@ int syscall_trace_enter(struct pt_regs *regs)
}
/* Do the secure computing after ptrace; failures should be fast. */
- if (secure_computing(NULL) == -1)
+ if (secure_computing() == -1)
return -1;
if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index 9f6ff7bc06f9..f8c07dcbfb49 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -342,7 +342,7 @@ long do_syscall_trace_enter(struct pt_regs *regs)
}
/* Do the secure computing check after ptrace. */
- if (secure_computing(NULL) == -1)
+ if (secure_computing() == -1)
return -1;
#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index 0f84628b9385..407464201b91 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -159,7 +159,7 @@ __visible void do_syscall_trace_enter(struct pt_regs *regs)
* If this fails we might have return value in a0 from seccomp
* (via SECCOMP_RET_ERRNO/TRACE).
*/
- if (secure_computing(NULL) == -1) {
+ if (secure_computing() == -1) {
syscall_set_nr(current, regs, -1);
return;
}
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index ad71132374f0..58faa12542a1 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -856,7 +856,7 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
}
/* Do the secure computing check after ptrace. */
- if (secure_computing(NULL)) {
+ if (secure_computing()) {
/* seccomp failures shouldn't expose any additional code. */
return -1;
}
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index f574b1856bc6..40d90dddf3f1 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -35,7 +35,7 @@ void handle_syscall(struct uml_pt_regs *r)
goto out;
/* Do the seccomp check after ptrace; failures should be fast. */
- if (secure_computing(NULL) == -1)
+ if (secure_computing() == -1)
goto out;
syscall = UPT_SYSCALL_NR(r);
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index e7c596dea947..b10cbf71a8cc 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -222,7 +222,7 @@ bool emulate_vsyscall(unsigned long error_code,
*/
regs->orig_ax = syscall_nr;
regs->ax = -ENOSYS;
- tmp = secure_computing(NULL);
+ tmp = secure_computing();
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 84868d37b35d..03583b6d1416 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -33,10 +33,10 @@ struct seccomp {
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
extern int __secure_computing(const struct seccomp_data *sd);
-static inline int secure_computing(const struct seccomp_data *sd)
+static inline int secure_computing(void)
{
if (unlikely(test_thread_flag(TIF_SECCOMP)))
- return __secure_computing(sd);
+ return __secure_computing(NULL);
return 0;
}
#else
@@ -59,7 +59,7 @@ struct seccomp { };
struct seccomp_filter { };
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
-static inline int secure_computing(struct seccomp_data *sd) { return 0; }
+static inline int secure_computing(void) { return 0; }
#else
static inline void secure_computing_strict(int this_syscall) { return; }
#endif
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 90734aa5aa36..be84d87f1f46 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -76,6 +76,35 @@ struct seccomp_notif {
struct seccomp_data data;
};
+/*
+ * Valid flags for struct seccomp_notif_resp
+ *
+ * Note, the SECCOMP_USER_NOTIF_FLAG_CONTINUE flag must be used with caution!
+ * If set by the process supervising the syscalls of another process the
+ * syscall will continue. This is problematic because of an inherent TOCTOU.
+ * An attacker can exploit the time while the supervised process is waiting on
+ * a response from the supervising process to rewrite syscall arguments which
+ * are passed as pointers of the intercepted syscall.
+ * It should be absolutely clear that this means that the seccomp notifier
+ * _cannot_ be used to implement a security policy! It should only ever be used
+ * in scenarios where a more privileged process supervises the syscalls of a
+ * lesser privileged process to get around kernel-enforced security
+ * restrictions when the privileged process deems this safe. In other words,
+ * in order to continue a syscall the supervising process should be sure that
+ * another security mechanism or the kernel itself will sufficiently block
+ * syscalls if arguments are rewritten to something unsafe.
+ *
+ * Similar precautions should be applied when stacking SECCOMP_RET_USER_NOTIF
+ * or SECCOMP_RET_TRACE. For SECCOMP_RET_USER_NOTIF filters acting on the
+ * same syscall, the most recently added filter takes precedence. This means
+ * that the new SECCOMP_RET_USER_NOTIF filter can override any
+ * SECCOMP_IOCTL_NOTIF_SEND from earlier filters, essentially allowing all
+ * such filtered syscalls to be executed by sending the response
+ * SECCOMP_USER_NOTIF_FLAG_CONTINUE. Note that SECCOMP_RET_TRACE can equally
+ * be overriden by SECCOMP_USER_NOTIF_FLAG_CONTINUE.
+ */
+#define SECCOMP_USER_NOTIF_FLAG_CONTINUE (1UL << 0)
+
struct seccomp_notif_resp {
__u64 id;
__s64 val;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dba52a7db5e8..12d2227e5786 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -75,6 +75,7 @@ struct seccomp_knotif {
/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
int error;
long val;
+ u32 flags;
/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
struct completion ready;
@@ -732,11 +733,12 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
return filter->notif->next_id++;
}
-static void seccomp_do_user_notification(int this_syscall,
- struct seccomp_filter *match,
- const struct seccomp_data *sd)
+static int seccomp_do_user_notification(int this_syscall,
+ struct seccomp_filter *match,
+ const struct seccomp_data *sd)
{
int err;
+ u32 flags = 0;
long ret = 0;
struct seccomp_knotif n = {};
@@ -764,6 +766,7 @@ static void seccomp_do_user_notification(int this_syscall,
if (err == 0) {
ret = n.val;
err = n.error;
+ flags = n.flags;
}
/*
@@ -780,8 +783,14 @@ static void seccomp_do_user_notification(int this_syscall,
list_del(&n.list);
out:
mutex_unlock(&match->notify_lock);
+
+ /* Userspace requests to continue the syscall. */
+ if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+ return 0;
+
syscall_set_return_value(current, task_pt_regs(current),
err, ret);
+ return -1;
}
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
@@ -867,8 +876,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
case SECCOMP_RET_USER_NOTIF:
- seccomp_do_user_notification(this_syscall, match, sd);
- goto skip;
+ if (seccomp_do_user_notification(this_syscall, match, sd))
+ goto skip;
+
+ return 0;
case SECCOMP_RET_LOG:
seccomp_log(this_syscall, 0, action, true);
@@ -1087,7 +1098,11 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
if (copy_from_user(&resp, buf, sizeof(resp)))
return -EFAULT;
- if (resp.flags)
+ if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE)
+ return -EINVAL;
+
+ if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) &&
+ (resp.error || resp.val))
return -EINVAL;
ret = mutex_lock_interruptible(&filter->notify_lock);
@@ -1116,6 +1131,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
knotif->state = SECCOMP_NOTIFY_REPLIED;
knotif->error = resp.error;
knotif->val = resp.val;
+ knotif->flags = resp.flags;
complete(&knotif->ready);
out:
mutex_unlock(&filter->notify_lock);
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index aeb0fc37a654..6944b898bb53 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -35,6 +35,7 @@
#include <stdbool.h>
#include <string.h>
#include <time.h>
+#include <limits.h>
#include <linux/elf.h>
#include <sys/uio.h>
#include <sys/utsname.h>
@@ -43,6 +44,7 @@
#include <sys/times.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
+#include <linux/kcmp.h>
#include <unistd.h>
#include <sys/syscall.h>
@@ -206,6 +208,10 @@ struct seccomp_notif_sizes {
#define PTRACE_EVENTMSG_SYSCALL_EXIT 2
#endif
+#ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
+#define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
+#endif
+
#ifndef seccomp
int seccomp(unsigned int op, unsigned int flags, void *args)
{
@@ -3083,7 +3089,7 @@ static int user_trap_syscall(int nr, unsigned int flags)
return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);
}
-#define USER_NOTIF_MAGIC 116983961184613L
+#define USER_NOTIF_MAGIC INT_MAX
TEST(user_notification_basic)
{
pid_t pid;
@@ -3491,6 +3497,108 @@ TEST(seccomp_get_notif_sizes)
EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
}
+static int filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
+{
+#ifdef __NR_kcmp
+ return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+TEST(user_notification_continue)
+{
+ pid_t pid;
+ long ret;
+ int status, listener;
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+ struct pollfd pollfd;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int dup_fd, pipe_fds[2];
+ pid_t self;
+
+ ret = pipe(pipe_fds);
+ if (ret < 0)
+ exit(1);
+
+ dup_fd = dup(pipe_fds[0]);
+ if (dup_fd < 0)
+ exit(1);
+
+ self = getpid();
+
+ ret = filecmp(self, self, pipe_fds[0], dup_fd);
+ if (ret)
+ exit(2);
+
+ exit(0);
+ }
+
+ pollfd.fd = listener;
+ pollfd.events = POLLIN | POLLOUT;
+
+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
+ EXPECT_EQ(pollfd.revents, POLLIN);
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+ pollfd.fd = listener;
+ pollfd.events = POLLIN | POLLOUT;
+
+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
+ EXPECT_EQ(pollfd.revents, POLLOUT);
+
+ EXPECT_EQ(req.data.nr, __NR_dup);
+
+ resp.id = req.id;
+ resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+
+ /*
+ * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
+ * args be set to 0.
+ */
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ resp.error = USER_NOTIF_MAGIC;
+ resp.val = 0;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ resp.error = 0;
+ resp.val = 0;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
+ if (errno == EINVAL)
+ XFAIL(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
+ }
+
+skip:
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status)) {
+ if (WEXITSTATUS(status) == 2) {
+ XFAIL(return, "Kernel does not support kcmp() syscall");
+ return;
+ }
+ }
+}
+
/*
* TODO:
* - add microbenchmarks