diff options
-rw-r--r-- | arch/x86/entry/syscalls/syscall_32.tbl | 7 | ||||
-rw-r--r-- | arch/x86/entry/syscalls/syscall_64.tbl | 6 | ||||
-rw-r--r-- | fs/Makefile | 2 | ||||
-rw-r--r-- | fs/file_table.c | 9 | ||||
-rw-r--r-- | fs/fs_context.c | 160 | ||||
-rw-r--r-- | fs/fsopen.c | 477 | ||||
-rw-r--r-- | fs/internal.h | 4 | ||||
-rw-r--r-- | fs/namespace.c | 477 | ||||
-rw-r--r-- | include/linux/fs.h | 7 | ||||
-rw-r--r-- | include/linux/fs_context.h | 38 | ||||
-rw-r--r-- | include/linux/lsm_hooks.h | 6 | ||||
-rw-r--r-- | include/linux/module.h | 6 | ||||
-rw-r--r-- | include/linux/security.h | 7 | ||||
-rw-r--r-- | include/linux/syscalls.h | 9 | ||||
-rw-r--r-- | include/uapi/linux/fcntl.h | 2 | ||||
-rw-r--r-- | include/uapi/linux/mount.h | 62 | ||||
-rw-r--r-- | samples/Kconfig | 9 | ||||
-rw-r--r-- | samples/Makefile | 2 | ||||
-rw-r--r-- | samples/vfs/Makefile (renamed from samples/statx/Makefile) | 5 | ||||
-rw-r--r-- | samples/vfs/test-fsmount.c | 133 | ||||
-rw-r--r-- | samples/vfs/test-statx.c (renamed from samples/statx/test-statx.c) | 11 | ||||
-rw-r--r-- | security/security.c | 5 |
22 files changed, 1353 insertions, 91 deletions
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 1f9607ed087c..4cd5f982b1e5 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -398,7 +398,12 @@ 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq -# don't use numbers 387 through 392, add new calls at the end +387 i386 open_tree sys_open_tree __ia32_sys_open_tree +388 i386 move_mount sys_move_mount __ia32_sys_move_mount +389 i386 fsopen sys_fsopen __ia32_sys_fsopen +390 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig +391 i386 fsmount sys_fsmount __ia32_sys_fsmount +392 i386 fspick sys_fspick __ia32_sys_fspick 393 i386 semget sys_semget __ia32_sys_semget 394 i386 semctl sys_semctl __ia32_compat_sys_semctl 395 i386 shmget sys_shmget __ia32_sys_shmget diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 92ee0b4378d4..64ca0d06259a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,6 +343,12 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq +335 common open_tree __x64_sys_open_tree +336 common move_mount __x64_sys_move_mount +337 common fsopen __x64_sys_fsopen +338 common fsconfig __x64_sys_fsconfig +339 common fsmount __x64_sys_fsmount +340 common fspick __x64_sys_fspick # don't use numbers 387 through 423, add new calls after the last # 'common' entry 424 common pidfd_send_signal __x64_sys_pidfd_send_signal diff --git a/fs/Makefile b/fs/Makefile index 35945f8139e6..5a51bc2489ba 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,7 +13,7 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o fs_context.o fs_parser.o + fs_types.o fs_context.o fs_parser.o fsopen.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/file_table.c b/fs/file_table.c index 155d7514a094..3f9c1b452c1d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -255,6 +255,7 @@ static void __fput(struct file *file) struct dentry *dentry = file->f_path.dentry; struct vfsmount *mnt = file->f_path.mnt; struct inode *inode = file->f_inode; + fmode_t mode = file->f_mode; if (unlikely(!(file->f_mode & FMODE_OPENED))) goto out; @@ -277,18 +278,20 @@ static void __fput(struct file *file) if (file->f_op->release) file->f_op->release(inode, file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL && - !(file->f_mode & FMODE_PATH))) { + !(mode & FMODE_PATH))) { cdev_put(inode->i_cdev); } fops_put(file->f_op); put_pid(file->f_owner.pid); - if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); - if (file->f_mode & FMODE_WRITER) { + if (mode & FMODE_WRITER) { put_write_access(inode); __mnt_drop_write(mnt); } dput(dentry); + if (unlikely(mode & FMODE_NEED_UNMOUNT)) + dissolve_on_fput(mnt); mntput(mnt); out: file_free(file); diff --git a/fs/fs_context.c b/fs/fs_context.c index 87e3546b9a52..a47ccd5a4a78 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -11,6 +11,7 @@ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fs.h> @@ -23,6 +24,7 @@ #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <net/net_namespace.h> +#include <asm/sections.h> #include "mount.h" #include "internal.h" @@ -271,6 +273,8 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, fc->cred = get_current_cred(); fc->net_ns = get_net(current->nsproxy->net_ns); + mutex_init(&fc->uapi_mutex); + switch (purpose) { case FS_CONTEXT_FOR_MOUNT: fc->user_ns = get_user_ns(fc->cred->user_ns); @@ -353,6 +357,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) if (!fc) return ERR_PTR(-ENOMEM); + mutex_init(&fc->uapi_mutex); + fc->fs_private = NULL; fc->s_fs_info = NULL; fc->source = NULL; @@ -361,6 +367,8 @@ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) get_net(fc->net_ns); get_user_ns(fc->user_ns); get_cred(fc->cred); + if (fc->log) + refcount_inc(&fc->log->usage); /* Can't call put until we've called ->dup */ ret = fc->ops->dup(fc, src_fc); @@ -378,7 +386,6 @@ err_fc: } EXPORT_SYMBOL(vfs_dup_fs_context); -#ifdef CONFIG_PRINTK /** * logfc - Log a message to a filesystem context * @fc: The filesystem context to log to. @@ -386,27 +393,100 @@ EXPORT_SYMBOL(vfs_dup_fs_context); */ void logfc(struct fs_context *fc, const char *fmt, ...) { + static const char store_failure[] = "OOM: Can't store error string"; + struct fc_log *log = fc ? fc->log : NULL; + const char *p; va_list va; + char *q; + u8 freeable; va_start(va, fmt); - - switch (fmt[0]) { - case 'w': - vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va); - break; - case 'e': - vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va); - break; - default: - vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va); - break; + if (!strchr(fmt, '%')) { + p = fmt; + goto unformatted_string; + } + if (strcmp(fmt, "%s") == 0) { + p = va_arg(va, const char *); + goto unformatted_string; } - pr_cont("\n"); + q = kvasprintf(GFP_KERNEL, fmt, va); +copied_string: + if (!q) + goto store_failure; + freeable = 1; + goto store_string; + +unformatted_string: + if ((unsigned long)p >= (unsigned long)__start_rodata && + (unsigned long)p < (unsigned long)__end_rodata) + goto const_string; + if (log && within_module_core((unsigned long)p, log->owner)) + goto const_string; + q = kstrdup(p, GFP_KERNEL); + goto copied_string; + +store_failure: + p = store_failure; +const_string: + q = (char *)p; + freeable = 0; +store_string: + if (!log) { + switch (fmt[0]) { + case 'w': + printk(KERN_WARNING "%s\n", q + 2); + break; + case 'e': + printk(KERN_ERR "%s\n", q + 2); + break; + default: + printk(KERN_NOTICE "%s\n", q + 2); + break; + } + if (freeable) + kfree(q); + } else { + unsigned int logsize = ARRAY_SIZE(log->buffer); + u8 index; + + index = log->head & (logsize - 1); + BUILD_BUG_ON(sizeof(log->head) != sizeof(u8) || + sizeof(log->tail) != sizeof(u8)); + if ((u8)(log->head - log->tail) == logsize) { + /* The buffer is full, discard the oldest message */ + if (log->need_free & (1 << index)) + kfree(log->buffer[index]); + log->tail++; + } + + log->buffer[index] = q; + log->need_free &= ~(1 << index); + log->need_free |= freeable << index; + log->head++; + } va_end(va); } EXPORT_SYMBOL(logfc); -#endif + +/* + * Free a logging structure. + */ +static void put_fc_log(struct fs_context *fc) +{ + struct fc_log *log = fc->log; + int i; + + if (log) { + if (refcount_dec_and_test(&log->usage)) { + fc->log = NULL; + for (i = 0; i <= 7; i++) + if (log->need_free & (1 << i)) + kfree(log->buffer[i]); + kfree(log); + } + } +} /** * put_fs_context - Dispose of a superblock configuration context. @@ -431,6 +511,7 @@ void put_fs_context(struct fs_context *fc) put_user_ns(fc->user_ns); put_cred(fc->cred); kfree(fc->subtype); + put_fc_log(fc); put_filesystem(fc->fs_type); kfree(fc->source); kfree(fc); @@ -640,3 +721,54 @@ int parse_monolithic_mount_data(struct fs_context *fc, void *data) return monolithic_mount_data(fc, data); } + +/* + * Clean up a context after performing an action on it and put it into a state + * from where it can be used to reconfigure a superblock. + * + * Note that here we do only the parts that can't fail; the rest is in + * finish_clean_context() below and in between those fs_context is marked + * FS_CONTEXT_AWAITING_RECONF. The reason for splitup is that after + * successful mount or remount we need to report success to userland. + * Trying to do full reinit (for the sake of possible subsequent remount) + * and failing to allocate memory would've put us into a nasty situation. + * So here we only discard the old state and reinitialization is left + * until we actually try to reconfigure. + */ +void vfs_clean_context(struct fs_context *fc) +{ + if (fc->need_free && fc->ops && fc->ops->free) + fc->ops->free(fc); + fc->need_free = false; + fc->fs_private = NULL; + fc->s_fs_info = NULL; + fc->sb_flags = 0; + security_free_mnt_opts(&fc->security); + kfree(fc->subtype); + fc->subtype = NULL; + kfree(fc->source); + fc->source = NULL; + + fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; + fc->phase = FS_CONTEXT_AWAITING_RECONF; +} + +int finish_clean_context(struct fs_context *fc) +{ + int error; + + if (fc->phase != FS_CONTEXT_AWAITING_RECONF) + return 0; + + if (fc->fs_type->init_fs_context) + error = fc->fs_type->init_fs_context(fc); + else + error = legacy_init_fs_context(fc); + if (unlikely(error)) { + fc->phase = FS_CONTEXT_FAILED; + return error; + } + fc->need_free = true; + fc->phase = FS_CONTEXT_RECONF_PARAMS; + return 0; +} diff --git a/fs/fsopen.c b/fs/fsopen.c new file mode 100644 index 000000000000..3bb9c0c8cbcc --- /dev/null +++ b/fs/fsopen.c @@ -0,0 +1,477 @@ +/* Filesystem access-by-fd. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/security.h> +#include <linux/anon_inodes.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <uapi/linux/mount.h> +#include "internal.h" +#include "mount.h" + +/* + * Allow the user to read back any error, warning or informational messages. + */ +static ssize_t fscontext_read(struct file *file, + char __user *_buf, size_t len, loff_t *pos) +{ + struct fs_context *fc = file->private_data; + struct fc_log *log = fc->log; + unsigned int logsize = ARRAY_SIZE(log->buffer); + ssize_t ret; + char *p; + bool need_free; + int index, n; + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret < 0) + return ret; + + if (log->head == log->tail) { + mutex_unlock(&fc->uapi_mutex); + return -ENODATA; + } + + index = log->tail & (logsize - 1); + p = log->buffer[index]; + need_free = log->need_free & (1 << index); + log->buffer[index] = NULL; + log->need_free &= ~(1 << index); + log->tail++; + mutex_unlock(&fc->uapi_mutex); + + ret = -EMSGSIZE; + n = strlen(p); + if (n > len) + goto err_free; + ret = -EFAULT; + if (copy_to_user(_buf, p, n) != 0) + goto err_free; + ret = n; + +err_free: + if (need_free) + kfree(p); + return ret; +} + +static int fscontext_release(struct inode *inode, struct file *file) +{ + struct fs_context *fc = file->private_data; + + if (fc) { + file->private_data = NULL; + put_fs_context(fc); + } + return 0; +} + +const struct file_operations fscontext_fops = { + .read = fscontext_read, + .release = fscontext_release, + .llseek = no_llseek, +}; + +/* + * Attach a filesystem context to a file and an fd. + */ +static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags) +{ + int fd; + + fd = anon_inode_getfd("fscontext", &fscontext_fops, fc, + O_RDWR | o_flags); + if (fd < 0) + put_fs_context(fc); + return fd; +} + +static int fscontext_alloc_log(struct fs_context *fc) +{ + fc->log = kzalloc(sizeof(*fc->log), GFP_KERNEL); + if (!fc->log) + return -ENOMEM; + refcount_set(&fc->log->usage, 1); + fc->log->owner = fc->fs_type->owner; + return 0; +} + +/* + * Open a filesystem by name so that it can be configured for mounting. + * + * We are allowed to specify a container in which the filesystem will be + * opened, thereby indicating which namespaces will be used (notably, which + * network namespace will be used for network filesystems). + */ +SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) +{ + struct file_system_type *fs_type; + struct fs_context *fc; + const char *fs_name; + int ret; + + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if (flags & ~FSOPEN_CLOEXEC) + return -EINVAL; + + fs_name = strndup_user(_fs_name, PAGE_SIZE); + if (IS_ERR(fs_name)) + return PTR_ERR(fs_name); + + fs_type = get_fs_type(fs_name); + kfree(fs_name); + if (!fs_type) + return -ENODEV; + + fc = fs_context_for_mount(fs_type, 0); + put_filesystem(fs_type); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + fc->phase = FS_CONTEXT_CREATE_PARAMS; + + ret = fscontext_alloc_log(fc); + if (ret < 0) + goto err_fc; + + return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0); + +err_fc: + put_fs_context(fc); + return ret; +} + +/* + * Pick a superblock into a context for reconfiguration. + */ +SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags) +{ + struct fs_context *fc; + struct path target; + unsigned int lookup_flags; + int ret; + + if (!ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if ((flags & ~(FSPICK_CLOEXEC | + FSPICK_SYMLINK_NOFOLLOW | + FSPICK_NO_AUTOMOUNT | + FSPICK_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; + if (flags & FSPICK_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & FSPICK_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & FSPICK_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + ret = user_path_at(dfd, path, lookup_flags, &target); + if (ret < 0) + goto err; + + ret = -EINVAL; + if (target.mnt->mnt_root != target.dentry) + goto err_path; + + fc = fs_context_for_reconfigure(target.dentry, 0, 0); + if (IS_ERR(fc)) { + ret = PTR_ERR(fc); + goto err_path; + } + + fc->phase = FS_CONTEXT_RECONF_PARAMS; + + ret = fscontext_alloc_log(fc); + if (ret < 0) + goto err_fc; + + path_put(&target); + return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? O_CLOEXEC : 0); + +err_fc: + put_fs_context(fc); +err_path: + path_put(&target); +err: + return ret; +} + +/* + * Check the state and apply the configuration. Note that this function is + * allowed to 'steal' the value by setting param->xxx to NULL before returning. + */ +static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, + struct fs_parameter *param) +{ + struct super_block *sb; + int ret; + + ret = finish_clean_context(fc); + if (ret) + return ret; + switch (cmd) { + case FSCONFIG_CMD_CREATE: + if (fc->phase != FS_CONTEXT_CREATE_PARAMS) + return -EBUSY; + fc->phase = FS_CONTEXT_CREATING; + ret = vfs_get_tree(fc); + if (ret) + break; + sb = fc->root->d_sb; + ret = security_sb_kern_mount(sb); + if (unlikely(ret)) { + fc_drop_locked(fc); + break; + } + up_write(&sb->s_umount); + fc->phase = FS_CONTEXT_AWAITING_MOUNT; + return 0; + case FSCONFIG_CMD_RECONFIGURE: + if (fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + fc->phase = FS_CONTEXT_RECONFIGURING; + sb = fc->root->d_sb; + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + ret = -EPERM; + break; + } + down_write(&sb->s_umount); + ret = reconfigure_super(fc); + up_write(&sb->s_umount); + if (ret) + break; + vfs_clean_context(fc); + return 0; + default: + if (fc->phase != FS_CONTEXT_CREATE_PARAMS && + fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + + return vfs_parse_fs_param(fc, param); + } + fc->phase = FS_CONTEXT_FAILED; + return ret; +} + +/** + * sys_fsconfig - Set parameters and trigger actions on a context + * @fd: The filesystem context to act upon + * @cmd: The action to take + * @_key: Where appropriate, the parameter key to set + * @_value: Where appropriate, the parameter value to set + * @aux: Additional information for the value + * + * This system call is used to set parameters on a context, including + * superblock settings, data source and security labelling. + * + * Actions include triggering the creation of a superblock and the + * reconfiguration of the superblock attached to the specified context. + * + * When setting a parameter, @cmd indicates the type of value being proposed + * and @_key indicates the parameter to be altered. + * + * @_value and @aux are used to specify the value, should a value be required: + * + * (*) fsconfig_set_flag: No value is specified. The parameter must be boolean + * in nature. The key may be prefixed with "no" to invert the + * setting. @_value must be NULL and @aux must be 0. + * + * (*) fsconfig_set_string: A string value is specified. The parameter can be + * expecting boolean, integer, string or take a path. A conversion to an + * appropriate type will be attempted (which may include looking up as a + * path). @_value points to a NUL-terminated string and @aux must be 0. + * + * (*) fsconfig_set_binary: A binary blob is specified. @_value points to the + * blob and @aux indicates its size. The parameter must be expecting a + * blob. + * + * (*) fsconfig_set_path: A non-empty path is specified. The parameter must be + * expecting a path object. @_value points to a NUL-terminated string that + * is the path and @aux is a file descriptor at which to start a relative + * lookup or AT_FDCWD. + * + * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH + * implied. + * + * (*) fsconfig_set_fd: An open file descriptor is specified. @_value must be + * NULL and @aux indicates the file descriptor. + */ +SYSCALL_DEFINE5(fsconfig, + int, fd, + unsigned int, cmd, + const char __user *, _key, + const void __user *, _value, + int, aux) +{ + struct fs_context *fc; + struct fd f; + int ret; + + struct fs_parameter param = { + .type = fs_value_is_undefined, + }; + + if (fd < 0) + return -EINVAL; + + switch (cmd) { + case FSCONFIG_SET_FLAG: + if (!_key || _value || aux) + return -EINVAL; + break; + case FSCONFIG_SET_STRING: + if (!_key || !_value || aux) + return -EINVAL; + break; + case FSCONFIG_SET_BINARY: + if (!_key || !_value || aux <= 0 || aux > 1024 * 1024) + return -EINVAL; + break; + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + if (!_key || !_value || (aux != AT_FDCWD && aux < 0)) + return -EINVAL; + break; + case FSCONFIG_SET_FD: + if (!_key || _value || aux < 0) + return -EINVAL; + break; + case FSCONFIG_CMD_CREATE: + case FSCONFIG_CMD_RECONFIGURE: + if (_key || _value || aux) + return -EINVAL; + break; + default: + return -EOPNOTSUPP; + } + + f = fdget(fd); + if (!f.file) + return -EBADF; + ret = -EINVAL; + if (f.file->f_op != &fscontext_fops) + goto out_f; + + fc = f.file->private_data; + if (fc->ops == &legacy_fs_context_ops) { + switch (cmd) { + case FSCONFIG_SET_BINARY: + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + case FSCONFIG_SET_FD: + ret = -EOPNOTSUPP; + goto out_f; + } + } + + if (_key) { + param.key = strndup_user(_key, 256); + if (IS_ERR(param.key)) { + ret = PTR_ERR(param.key); + goto out_f; + } + } + + switch (cmd) { + case FSCONFIG_SET_FLAG: + param.type = fs_value_is_flag; + break; + case FSCONFIG_SET_STRING: + param.type = fs_value_is_string; + param.string = strndup_user(_value, 256); + if (IS_ERR(param.string)) { + ret = PTR_ERR(param.string); + goto out_key; + } + param.size = strlen(param.string); + break; + case FSCONFIG_SET_BINARY: + param.type = fs_value_is_blob; + param.size = aux; + param.blob = memdup_user_nul(_value, aux); + if (IS_ERR(param.blob)) { + ret = PTR_ERR(param.blob); + goto out_key; + } + break; + case FSCONFIG_SET_PATH: + param.type = fs_value_is_filename; + param.name = getname_flags(_value, 0, NULL); + if (IS_ERR(param.name)) { + ret = PTR_ERR(param.name); + goto out_key; + } + param.dirfd = aux; + param.size = strlen(param.name->name); + break; + case FSCONFIG_SET_PATH_EMPTY: + param.type = fs_value_is_filename_empty; + param.name = getname_flags(_value, LOOKUP_EMPTY, NULL); + if (IS_ERR(param.name)) { + ret = PTR_ERR(param.name); + goto out_key; + } + param.dirfd = aux; + param.size = strlen(param.name->name); + break; + case FSCONFIG_SET_FD: + param.type = fs_value_is_file; + ret = -EBADF; + param.file = fget(aux); + if (!param.file) + goto out_key; + break; + default: + break; + } + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret == 0) { + ret = vfs_fsconfig_locked(fc, cmd, ¶m); + mutex_unlock(&fc->uapi_mutex); + } + + /* Clean up the our record of any value that we obtained from + * userspace. Note that the value may have been stolen by the LSM or + * filesystem, in which case the value pointer will have been cleared. + */ + switch (cmd) { + case FSCONFIG_SET_STRING: + case FSCONFIG_SET_BINARY: + kfree(param.string); + break; + case FSCONFIG_SET_PATH: + case FSCONFIG_SET_PATH_EMPTY: + if (param.name) + putname(param.name); + break; + case FSCONFIG_SET_FD: + if (param.file) + fput(param.file); + break; + default: + break; + } +out_key: + kfree(param.key); +out_f: + fdput(f); + return ret; +} diff --git a/fs/internal.h b/fs/internal.h index 17a8ae967493..0010889f2e85 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,8 +55,11 @@ extern void __init chrdev_init(void); /* * fs_context.c */ +extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void fc_drop_locked(struct fs_context *); +extern void vfs_clean_context(struct fs_context *fc); +extern int finish_clean_context(struct fs_context *fc); /* * namei.c @@ -92,6 +95,7 @@ extern void __init mnt_init(void); extern int __mnt_want_write_file(struct file *); extern void __mnt_drop_write_file(struct file *); +extern void dissolve_on_fput(struct vfsmount *); /* * fs_struct.c */ diff --git a/fs/namespace.c b/fs/namespace.c index c9cab307fa77..3357c3d65475 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -20,6 +20,7 @@ #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ +#include <linux/file.h> #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> @@ -1832,6 +1833,27 @@ struct vfsmount *collect_mounts(const struct path *path) return &tree->mnt; } +static void free_mnt_ns(struct mnt_namespace *); +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); + +void dissolve_on_fput(struct vfsmount *mnt) +{ + struct mnt_namespace *ns; + namespace_lock(); + lock_mount_hash(); + ns = real_mount(mnt)->mnt_ns; + if (ns) { + if (is_anon_ns(ns)) + umount_tree(real_mount(mnt), UMOUNT_CONNECTED); + else + ns = NULL; + } + unlock_mount_hash(); + namespace_unlock(); + if (ns) + free_mnt_ns(ns); +} + void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); @@ -2065,6 +2087,10 @@ static int attach_recursive_mnt(struct mount *source_mnt, attach_mnt(source_mnt, dest_mnt, dest_mp); touch_mnt_namespace(source_mnt->mnt_ns); } else { + if (source_mnt->mnt_ns) { + /* move from anon - the caller will destroy */ + list_del_init(&source_mnt->mnt_ns->list); + } mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } @@ -2222,6 +2248,30 @@ static bool has_locked_children(struct mount *mnt, struct dentry *dentry) return false; } +static struct mount *__do_loopback(struct path *old_path, int recurse) +{ + struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); + + if (IS_MNT_UNBINDABLE(old)) + return mnt; + + if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) + return mnt; + + if (!recurse && has_locked_children(old, old_path->dentry)) + return mnt; + + if (recurse) + mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); + else + mnt = clone_mnt(old, old_path->dentry, 0); + + if (!IS_ERR(mnt)) + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + + return mnt; +} + /* * do loopback mount. */ @@ -2229,7 +2279,7 @@ static int do_loopback(struct path *path, const char *old_name, int recurse) { struct path old_path; - struct mount *mnt = NULL, *old, *parent; + struct mount *mnt = NULL, *parent; struct mountpoint *mp; int err; if (!old_name || !*old_name) @@ -2243,38 +2293,21 @@ static int do_loopback(struct path *path, const char *old_name, goto out; mp = lock_mount(path); - err = PTR_ERR(mp); - if (IS_ERR(mp)) + if (IS_ERR(mp)) { + err = PTR_ERR(mp); goto out; + } - old = real_mount(old_path.mnt); parent = real_mount(path->mnt); - - err = -EINVAL; - if (IS_MNT_UNBINDABLE(old)) - goto out2; - if (!check_mnt(parent)) goto out2; - if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) - goto out2; - - if (!recurse && has_locked_children(old, old_path.dentry)) - goto out2; - - if (recurse) - mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE); - else - mnt = clone_mnt(old, old_path.dentry, 0); - + mnt = __do_loopback(&old_path, recurse); if (IS_ERR(mnt)) { err = PTR_ERR(mnt); goto out2; } - mnt->mnt.mnt_flags &= ~MNT_LOCKED; - err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); @@ -2288,6 +2321,96 @@ out: return err; } +static struct file *open_detached_copy(struct path *path, bool recursive) +{ + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); + struct mount *mnt, *p; + struct file *file; + + if (IS_ERR(ns)) + return ERR_CAST(ns); + + namespace_lock(); + mnt = __do_loopback(path, recursive); + if (IS_ERR(mnt)) { + namespace_unlock(); + free_mnt_ns(ns); + return ERR_CAST(mnt); + } + + lock_mount_hash(); + for (p = mnt; p; p = next_mnt(p, mnt)) { + p->mnt_ns = ns; + ns->mounts++; + } + ns->root = mnt; + list_add_tail(&ns->list, &mnt->mnt_list); + mntget(&mnt->mnt); + unlock_mount_hash(); + namespace_unlock(); + + mntput(path->mnt); + path->mnt = &mnt->mnt; + file = dentry_open(path, O_PATH, current_cred()); + if (IS_ERR(file)) + dissolve_on_fput(path->mnt); + else + file->f_mode |= FMODE_NEED_UNMOUNT; + return file; +} + +SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags) +{ + struct file *file; + struct path path; + int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + bool detached = flags & OPEN_TREE_CLONE; + int error; + int fd; + + BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); + + if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | + |