From 384632e67e0829deb8015ee6ad916b180049d252 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 3 Oct 2017 16:15:38 -0700 Subject: userfaultfd: non-cooperative: fix fork use after free When reading the event from the uffd, we put it on a temporary fork_event list to detect if we can still access it after releasing and retaking the event_wqh.lock. If fork aborts and removes the event from the fork_event all is fine as long as we're still in the userfault read context and fork_event head is still alive. We've to put the event allocated in the fork kernel stack, back from fork_event list-head to the event_wqh head, before returning from userfaultfd_ctx_read, because the fork_event head lifetime is limited to the userfaultfd_ctx_read stack lifetime. Forgetting to move the event back to its event_wqh place then results in __remove_wait_queue(&ctx->event_wqh, &ewq->wq); in userfaultfd_event_wait_completion to remove it from a head that has been already freed from the reader stack. This could only happen if resolve_userfault_fork failed (for example if there are no file descriptors available to allocate the fork uffd). If it succeeded it was put back correctly. Furthermore, after find_userfault_evt receives a fork event, the forked userfault context in fork_nctx and uwq->msg.arg.reserved.reserved1 can be released by the fork thread as soon as the event_wqh.lock is released. Taking a reference on the fork_nctx before dropping the lock prevents an use after free in resolve_userfault_fork(). If the fork side aborted and it already released everything, we still try to succeed resolve_userfault_fork(), if possible. Fixes: 893e26e61d04eac9 ("userfaultfd: non-cooperative: Add fork() event") Link: http://lkml.kernel.org/r/20170920180413.26713-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Mark Rutland Tested-by: Mark Rutland Cc: Pavel Emelyanov Cc: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ef4b48d1ea42..1c713fd5b3e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -588,6 +588,12 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, break; if (ACCESS_ONCE(ctx->released) || fatal_signal_pending(current)) { + /* + * &ewq->wq may be queued in fork_event, but + * __remove_wait_queue ignores the head + * parameter. It would be a problem if it + * didn't. + */ __remove_wait_queue(&ctx->event_wqh, &ewq->wq); if (ewq->msg.event == UFFD_EVENT_FORK) { struct userfaultfd_ctx *new; @@ -1061,6 +1067,12 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, (unsigned long) uwq->msg.arg.reserved.reserved1; list_move(&uwq->wq.entry, &fork_event); + /* + * fork_nctx can be freed as soon as + * we drop the lock, unless we take a + * reference on it. + */ + userfaultfd_ctx_get(fork_nctx); spin_unlock(&ctx->event_wqh.lock); ret = 0; break; @@ -1091,19 +1103,53 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, if (!ret && msg->event == UFFD_EVENT_FORK) { ret = resolve_userfault_fork(ctx, fork_nctx, msg); + spin_lock(&ctx->event_wqh.lock); + if (!list_empty(&fork_event)) { + /* + * The fork thread didn't abort, so we can + * drop the temporary refcount. + */ + userfaultfd_ctx_put(fork_nctx); + + uwq = list_first_entry(&fork_event, + typeof(*uwq), + wq.entry); + /* + * If fork_event list wasn't empty and in turn + * the event wasn't already released by fork + * (the event is allocated on fork kernel + * stack), put the event back to its place in + * the event_wq. fork_event head will be freed + * as soon as we return so the event cannot + * stay queued there no matter the current + * "ret" value. + */ + list_del(&uwq->wq.entry); + __add_wait_queue(&ctx->event_wqh, &uwq->wq); - if (!ret) { - spin_lock(&ctx->event_wqh.lock); - if (!list_empty(&fork_event)) { - uwq = list_first_entry(&fork_event, - typeof(*uwq), - wq.entry); - list_del(&uwq->wq.entry); - __add_wait_queue(&ctx->event_wqh, &uwq->wq); + /* + * Leave the event in the waitqueue and report + * error to userland if we failed to resolve + * the userfault fork. + */ + if (likely(!ret)) userfaultfd_event_complete(ctx, uwq); - } - spin_unlock(&ctx->event_wqh.lock); + } else { + /* + * Here the fork thread aborted and the + * refcount from the fork thread on fork_nctx + * has already been released. We still hold + * the reference we took before releasing the + * lock above. If resolve_userfault_fork + * failed we've to drop it because the + * fork_nctx has to be freed in such case. If + * it succeeded we'll hold it because the new + * uffd references it. + */ + if (ret) + userfaultfd_ctx_put(fork_nctx); } + spin_unlock(&ctx->event_wqh.lock); } return ret; -- cgit v1.2.3 From c2315c187fa0d3ab363fdebe22718170b40473e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:42 -0700 Subject: exec: load_script: kill the onstack interp[BINPRM_BUF_SIZE] array Patch series "exec: binfmt_misc: fix use-after-free, kill iname[BINPRM_BUF_SIZE]". It looks like this code was always wrong, then commit 948b701a607f ("binfmt_misc: add persistent opened binary handler for containers") added more problems. This patch (of 6): load_script() can simply use i_name instead, it points into bprm->buf[] and nobody can change this memory until we call prepare_binprm(). The only complication is that we need to also change the signature of bprm_change_interp() but this change looks good too. While at it, do whitespace/style cleanups. NOTE: the real motivation for this change is that people want to increase BINPRM_BUF_SIZE, we need to change load_misc_binary() too but this looks more complicated because afaics it is very buggy. Link: http://lkml.kernel.org/r/20170918163446.GA26793@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Travis Gummels Cc: Ben Woodard Cc: Jim Foraker Cc: Cc: Al Viro Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_script.c | 17 +++++++++-------- fs/exec.c | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index afdf4e3cafc2..7cde3f46ad26 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -19,7 +19,6 @@ static int load_script(struct linux_binprm *bprm) const char *i_arg, *i_name; char *cp; struct file *file; - char interp[BINPRM_BUF_SIZE]; int retval; if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) @@ -55,7 +54,7 @@ static int load_script(struct linux_binprm *bprm) break; } for (cp = bprm->buf+2; (*cp == ' ') || (*cp == '\t'); cp++); - if (*cp == '\0') + if (*cp == '\0') return -ENOEXEC; /* No interpreter name found */ i_name = cp; i_arg = NULL; @@ -65,7 +64,6 @@ static int load_script(struct linux_binprm *bprm) *cp++ = '\0'; if (*cp) i_arg = cp; - strcpy (interp, i_name); /* * OK, we've parsed out the interpreter name and * (optional) argument. @@ -80,24 +78,27 @@ static int load_script(struct linux_binprm *bprm) if (retval) return retval; retval = copy_strings_kernel(1, &bprm->interp, bprm); - if (retval < 0) return retval; + if (retval < 0) + return retval; bprm->argc++; if (i_arg) { retval = copy_strings_kernel(1, &i_arg, bprm); - if (retval < 0) return retval; + if (retval < 0) + return retval; bprm->argc++; } retval = copy_strings_kernel(1, &i_name, bprm); - if (retval) return retval; + if (retval) + return retval; bprm->argc++; - retval = bprm_change_interp(interp, bprm); + retval = bprm_change_interp(i_name, bprm); if (retval < 0) return retval; /* * OK, now restart the process with the interpreter's dentry. */ - file = open_exec(interp); + file = open_exec(i_name); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/fs/exec.c b/fs/exec.c index ac34d9724684..5470d3c1892a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1410,7 +1410,7 @@ static void free_bprm(struct linux_binprm *bprm) kfree(bprm); } -int bprm_change_interp(char *interp, struct linux_binprm *bprm) +int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) -- cgit v1.2.3 From baba1b29731c79d605100087b8f02f9e1cf5a344 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:45 -0700 Subject: exec: binfmt_misc: don't nullify Node->dentry in kill_node() kill_node() nullifies/checks Node->dentry to avoid double free. This complicates the next changes and this is very confusing: - we do not need to check dentry != NULL under entries_lock, kill_node() is always called under inode_lock(d_inode(root)) and we rely on this inode_lock() anyway, without this lock the MISC_FMT_OPEN_FILE cleanup could race with itself. - if kill_inode() was already called and ->dentry == NULL we should not even try to close e->interp_file. We can change bm_entry_write() to simply check !list_empty(list) before kill_node. Again, we rely on inode_lock(), in particular it saves us from the race with bm_status_write(), another caller of kill_node(). Link: http://lkml.kernel.org/r/20170922143641.GA17210@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index ce7181ea60fa..6451e7520e05 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -603,11 +603,7 @@ static void kill_node(Node *e) struct dentry *dentry; write_lock(&entries_lock); - dentry = e->dentry; - if (dentry) { - list_del_init(&e->list); - e->dentry = NULL; - } + list_del_init(&e->list); write_unlock(&entries_lock); if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) { @@ -615,12 +611,11 @@ static void kill_node(Node *e) e->interp_file = NULL; } - if (dentry) { - drop_nlink(d_inode(dentry)); - d_drop(dentry); - dput(dentry); - simple_release_fs(&bm_mnt, &entry_count); - } + dentry = e->dentry; + drop_nlink(d_inode(dentry)); + d_drop(dentry); + dput(dentry); + simple_release_fs(&bm_mnt, &entry_count); } /* / */ @@ -665,7 +660,8 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, root = file_inode(file)->i_sb->s_root; inode_lock(d_inode(root)); - kill_node(e); + if (!list_empty(&e->list)) + kill_node(e); inode_unlock(d_inode(root)); break; @@ -794,7 +790,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, inode_lock(d_inode(root)); while (!list_empty(&entries)) - kill_node(list_entry(entries.next, Node, list)); + kill_node(list_first_entry(&entries, Node, list)); inode_unlock(d_inode(root)); break; -- cgit v1.2.3 From 83f918274e4b841d6fb817861ea0c896fba0c179 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:48 -0700 Subject: exec: binfmt_misc: shift filp_close(interp_file) from kill_node() to bm_evict_inode() To ensure that load_misc_binary() can't use the partially destroyed Node, see also the next patch. The current logic looks wrong in any case, once we close interp_file it doesn't make any sense to delay kfree(inode->i_private), this Node is no longer valid. Even if the MISC_FMT_OPEN_FILE/interp_file checks were not racy (they are), load_misc_binary() should not try to reopen ->interpreter if MISC_FMT_OPEN_FILE is set but ->interp_file is NULL. And I can't understand why do we use filp_close(), not fput(). Link: http://lkml.kernel.org/r/20170922143644.GA17216@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 6451e7520e05..203598ccb40a 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -594,8 +594,13 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) static void bm_evict_inode(struct inode *inode) { + Node *e = inode->i_private; + + if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) + filp_close(e->interp_file, NULL); + clear_inode(inode); - kfree(inode->i_private); + kfree(e); } static void kill_node(Node *e) @@ -606,11 +611,6 @@ static void kill_node(Node *e) list_del_init(&e->list); write_unlock(&entries_lock); - if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) { - filp_close(e->interp_file, NULL); - e->interp_file = NULL; - } - dentry = e->dentry; drop_nlink(d_inode(dentry)); d_drop(dentry); -- cgit v1.2.3 From eb23aa0317eb1f08e8d9d36b8753d42f03b32764 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:51 -0700 Subject: exec: binfmt_misc: remove the confusing e->interp_file != NULL checks If MISC_FMT_OPEN_FILE flag is set e->interp_file must be valid or we have a bug which should not be silently ignored. Link: http://lkml.kernel.org/r/20170922143647.GA17222@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 203598ccb40a..babfe7bd56f0 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -205,7 +205,7 @@ static int load_misc_binary(struct linux_binprm *bprm) if (retval < 0) goto error; - if (fmt->flags & MISC_FMT_OPEN_FILE && fmt->interp_file) { + if (fmt->flags & MISC_FMT_OPEN_FILE) { interp_file = filp_clone_open(fmt->interp_file); if (!IS_ERR(interp_file)) deny_write_access(interp_file); @@ -596,7 +596,7 @@ static void bm_evict_inode(struct inode *inode) { Node *e = inode->i_private; - if ((e->flags & MISC_FMT_OPEN_FILE) && e->interp_file) + if (e->flags & MISC_FMT_OPEN_FILE) filp_close(e->interp_file, NULL); clear_inode(inode); -- cgit v1.2.3 From 43a4f2619038002f48c78698c42c05692d4b4eb2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:55 -0700 Subject: exec: binfmt_misc: fix race between load_misc_binary() and kill_node() load_misc_binary() makes a local copy of fmt->interpreter under entries_lock to avoid the race with kill_node() but this is not enough; the whole Node can be freed after we drop entries_lock, not only the ->interpreter string. Add dget/dput(fmt->dentry) to ensure bm_evict_inode() can't destroy/free this Node. Link: http://lkml.kernel.org/r/20170922143650.GA17227@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Travis Gummels Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index babfe7bd56f0..f5f8c2541790 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -138,20 +138,23 @@ static int load_misc_binary(struct linux_binprm *bprm) retval = -ENOEXEC; if (!enabled) - goto ret; + return retval; /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); - if (fmt) + if (fmt) { + dget(fmt->dentry); strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); + } read_unlock(&entries_lock); if (!fmt) - goto ret; + return retval; /* Need to be able to load the file after exec */ + retval = -ENOENT; if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) - return -ENOENT; + goto ret; if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { retval = remove_arg_zero(bprm); @@ -238,6 +241,7 @@ static int load_misc_binary(struct linux_binprm *bprm) goto error; ret: + dput(fmt->dentry); return retval; error: if (fd_binary > 0) -- cgit v1.2.3 From 50097f74934e3ec8fb1e6f3087568b958972817d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 3 Oct 2017 16:15:58 -0700 Subject: exec: binfmt_misc: kill the onstack iname[BINPRM_BUF_SIZE] array After the previous change "fmt" can't go away, we can kill iname/iname_addr and use fmt->interpreter. Link: http://lkml.kernel.org/r/20170922143653.GA17232@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Ben Woodard Cc: James Bottomley Cc: Jim Foraker Cc: Cc: Travis Gummels Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index f5f8c2541790..2a46762def31 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -54,7 +54,7 @@ typedef struct { int size; /* size of magic/mask */ char *magic; /* magic or filename extension */ char *mask; /* mask, NULL for exact match */ - char *interpreter; /* filename of interpreter */ + const char *interpreter; /* filename of interpreter */ char *name; struct dentry *dentry; struct file *interp_file; @@ -131,8 +131,6 @@ static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; struct file *interp_file = NULL; - char iname[BINPRM_BUF_SIZE]; - const char *iname_addr = iname; int retval; int fd_binary = -1; @@ -143,10 +141,8 @@ static int load_misc_binary(struct linux_binprm *bprm) /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); - if (fmt) { + if (fmt) dget(fmt->dentry); - strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); - } read_unlock(&entries_lock); if (!fmt) return retval; @@ -198,13 +194,13 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel(1, &iname_addr, bprm); + retval = copy_strings_kernel(1, &fmt->interpreter, bprm); if (retval < 0) goto error; bprm->argc++; /* Update interp in case binfmt_script needs it. */ - retval = bprm_change_interp(iname, bprm); + retval = bprm_change_interp(fmt->interpreter, bprm); if (retval < 0) goto error; @@ -213,7 +209,7 @@ static int load_misc_binary(struct linux_binprm *bprm) if (!IS_ERR(interp_file)) deny_write_access(interp_file); } else { - interp_file = open_exec(iname); + interp_file = open_exec(fmt->interpreter); } retval = PTR_ERR(interp_file); if (IS_ERR(interp_file)) -- cgit v1.2.3