diff options
-rw-r--r-- | Documentation/admin-guide/cgroup-v1/cpusets.rst | 11 | ||||
-rw-r--r-- | fs/kernfs/inode.c | 91 | ||||
-rw-r--r-- | fs/kernfs/kernfs-internal.h | 2 | ||||
-rw-r--r-- | fs/xattr.c | 17 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 5 | ||||
-rw-r--r-- | include/linux/cgroup.h | 23 | ||||
-rw-r--r-- | include/linux/kernfs.h | 11 | ||||
-rw-r--r-- | include/linux/sched/task.h | 4 | ||||
-rw-r--r-- | include/linux/xattr.h | 3 | ||||
-rw-r--r-- | include/uapi/linux/sched.h | 5 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-v1.c | 34 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 361 | ||||
-rw-r--r-- | kernel/cgroup/cpuset.c | 8 | ||||
-rw-r--r-- | kernel/cgroup/pids.c | 15 | ||||
-rw-r--r-- | kernel/fork.c | 19 | ||||
-rw-r--r-- | mm/shmem.c | 2 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/Makefile | 6 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/cgroup_util.c | 126 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/cgroup_util.h | 4 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/test_core.c | 177 | ||||
-rw-r--r-- | tools/testing/selftests/clone3/clone3_selftests.h | 19 |
21 files changed, 795 insertions, 148 deletions
diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst index 86a6ae995d54..7ade3abd342a 100644 --- a/Documentation/admin-guide/cgroup-v1/cpusets.rst +++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst @@ -223,6 +223,17 @@ cpu_online_mask using a CPU hotplug notifier, and the mems file automatically tracks the value of node_states[N_MEMORY]--i.e., nodes with memory--using the cpuset_track_online_nodes() hook. +The cpuset.effective_cpus and cpuset.effective_mems files are +normally read-only copies of cpuset.cpus and cpuset.mems files +respectively. If the cpuset cgroup filesystem is mounted with the +special "cpuset_v2_mode" option, the behavior of these files will become +similar to the corresponding files in cpuset v2. In other words, hotplug +events will not change cpuset.cpus and cpuset.mems. Those events will +only affect cpuset.effective_cpus and cpuset.effective_mems which show +the actual cpus and memory nodes that are currently used by this cpuset. +See Documentation/admin-guide/cgroup-v2.rst for more information about +cpuset v2 behavior. + 1.4 What are exclusive cpusets ? -------------------------------- diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index d0f7a5abd9a9..fc2469a20fed 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -53,6 +53,8 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc) kn->iattr->ia_ctime = kn->iattr->ia_atime; simple_xattrs_init(&kn->iattr->xattrs); + atomic_set(&kn->iattr->nr_user_xattrs, 0); + atomic_set(&kn->iattr->user_xattr_size, 0); out_unlock: ret = kn->iattr; mutex_unlock(&iattr_mutex); @@ -303,7 +305,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name, if (!attrs) return -ENOMEM; - return simple_xattr_set(&attrs->xattrs, name, value, size, flags); + return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL); } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, @@ -327,6 +329,86 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, return kernfs_xattr_set(kn, name, value, size, flags); } +static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, + const char *full_name, + struct simple_xattrs *xattrs, + const void *value, size_t size, int flags) +{ + atomic_t *sz = &kn->iattr->user_xattr_size; + atomic_t *nr = &kn->iattr->nr_user_xattrs; + ssize_t removed_size; + int ret; + + if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { + ret = -ENOSPC; + goto dec_count_out; + } + + if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { + ret = -ENOSPC; + goto dec_size_out; + } + + ret = simple_xattr_set(xattrs, full_name, value, size, flags, + &removed_size); + + if (!ret && removed_size >= 0) + size = removed_size; + else if (!ret) + return 0; +dec_size_out: + atomic_sub(size, sz); +dec_count_out: + atomic_dec(nr); + return ret; +} + +static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, + const char *full_name, + struct simple_xattrs *xattrs, + const void *value, size_t size, int flags) +{ + atomic_t *sz = &kn->iattr->user_xattr_size; + atomic_t *nr = &kn->iattr->nr_user_xattrs; + ssize_t removed_size; + int ret; + + ret = simple_xattr_set(xattrs, full_name, value, size, flags, + &removed_size); + + if (removed_size >= 0) { + atomic_sub(removed_size, sz); + atomic_dec(nr); + } + + return ret; +} + +static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, const void *value, + size_t size, int flags) +{ + const char *full_name = xattr_full_name(handler, suffix); + struct kernfs_node *kn = inode->i_private; + struct kernfs_iattrs *attrs; + + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) + return -EOPNOTSUPP; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (value) + return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs, + value, size, flags); + else + return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs, + value, size, flags); + +} + static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_vfs_xattr_get, @@ -339,8 +421,15 @@ static const struct xattr_handler kernfs_security_xattr_handler = { .set = kernfs_vfs_xattr_set, }; +static const struct xattr_handler kernfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = kernfs_vfs_xattr_get, + .set = kernfs_vfs_user_xattr_set, +}; + const struct xattr_handler *kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, + &kernfs_user_xattr_handler, NULL }; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 2f3c51d55261..7ee97ef59184 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -26,6 +26,8 @@ struct kernfs_iattrs { struct timespec64 ia_ctime; struct simple_xattrs xattrs; + atomic_t nr_user_xattrs; + atomic_t user_xattr_size; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/fs/xattr.c b/fs/xattr.c index 90dd78f0eb27..e13265e65871 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -817,7 +817,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) if (len < sizeof(*new_xattr)) return NULL; - new_xattr = kmalloc(len, GFP_KERNEL); + new_xattr = kvmalloc(len, GFP_KERNEL); if (!new_xattr) return NULL; @@ -860,6 +860,7 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * @value: value of the xattr. If %NULL, will remove the attribute. * @size: size of the new xattr * @flags: %XATTR_{CREATE|REPLACE} + * @removed_size: returns size of the removed xattr, -1 if none removed * * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; @@ -868,7 +869,8 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * Returns 0 on success, -errno on failure. */ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags) + const void *value, size_t size, int flags, + ssize_t *removed_size) { struct simple_xattr *xattr; struct simple_xattr *new_xattr = NULL; @@ -882,7 +884,7 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, new_xattr->name = kstrdup(name, GFP_KERNEL); if (!new_xattr->name) { - kfree(new_xattr); + kvfree(new_xattr); return -ENOMEM; } } @@ -895,8 +897,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, err = -EEXIST; } else if (new_xattr) { list_replace(&xattr->list, &new_xattr->list); + if (removed_size) + *removed_size = xattr->size; } else { list_del(&xattr->list); + if (removed_size) + *removed_size = xattr->size; } goto out; } @@ -908,11 +914,14 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, list_add(&new_xattr->list, &xattrs->head); xattr = NULL; } + + if (removed_size) + *removed_size = -1; out: spin_unlock(&xattrs->lock); if (xattr) { kfree(xattr->name); - kfree(xattr); + kvfree(xattr); } return err; diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index e1fafed22db1..52661155f85f 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -633,8 +633,9 @@ struct cgroup_subsys { void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); void (*post_attach)(void); - int (*can_fork)(struct task_struct *task); - void (*cancel_fork)(struct task_struct *task); + int (*can_fork)(struct task_struct *task, + struct css_set *cset); + void (*cancel_fork)(struct task_struct *task, struct css_set *cset); void (*fork)(struct task_struct *task); void (*exit)(struct task_struct *task); void (*release)(struct task_struct *task); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e75d2191226b..4598e4da6b1b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,8 @@ #include <linux/cgroup-defs.h> +struct kernel_clone_args; + #ifdef CONFIG_CGROUPS /* @@ -58,9 +60,6 @@ struct css_task_iter { struct list_head *tcset_head; struct list_head *task_pos; - struct list_head *tasks_head; - struct list_head *mg_tasks_head; - struct list_head *dying_tasks_head; struct list_head *cur_tasks_head; struct css_set *cur_cset; @@ -122,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); -extern int cgroup_can_fork(struct task_struct *p); -extern void cgroup_cancel_fork(struct task_struct *p); -extern void cgroup_post_fork(struct task_struct *p); +extern int cgroup_can_fork(struct task_struct *p, + struct kernel_clone_args *kargs); +extern void cgroup_cancel_fork(struct task_struct *p, + struct kernel_clone_args *kargs); +extern void cgroup_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); @@ -708,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} -static inline int cgroup_can_fork(struct task_struct *p) { return 0; } -static inline void cgroup_cancel_fork(struct task_struct *p) {} -static inline void cgroup_post_fork(struct task_struct *p) {} +static inline int cgroup_can_fork(struct task_struct *p, + struct kernel_clone_args *kargs) { return 0; } +static inline void cgroup_cancel_fork(struct task_struct *p, + struct kernel_clone_args *kargs) {} +static inline void cgroup_post_fork(struct task_struct *p, + struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index dded2e5a9f42..89f6a4214a70 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -37,8 +37,10 @@ enum kernfs_node_type { KERNFS_LINK = 0x0004, }; -#define KERNFS_TYPE_MASK 0x000f -#define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK +#define KERNFS_TYPE_MASK 0x000f +#define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK +#define KERNFS_MAX_USER_XATTRS 128 +#define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, @@ -78,6 +80,11 @@ enum kernfs_root_flag { * fhandle to access nodes of the fs. */ KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, + + /* + * Support user xattrs to be written to nodes rooted at this root. + */ + KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, }; /* type-specific structures for kernfs_node union members */ diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index f1879884238e..38359071236a 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -13,6 +13,7 @@ struct task_struct; struct rusage; union thread_union; +struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL @@ -29,6 +30,9 @@ struct kernel_clone_args { pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; + int cgroup; + struct cgroup *cgrp; + struct css_set *cset; }; /* diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 6dad031be3c2..4cf6e11f4a3c 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -102,7 +102,8 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags); + const void *value, size_t size, int flags, + ssize_t *removed_size); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_list_add(struct simple_xattrs *xattrs, diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 2e3bc22c6f20..3bac0a8ceab2 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -35,6 +35,7 @@ /* Flags for the clone3() syscall. */ #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 @@ -81,6 +82,8 @@ * @set_tid_size: This defines the size of the array referenced * in @set_tid. This cannot be larger than the * kernel's limit of nested PID namespaces. + * @cgroup: If CLONE_INTO_CGROUP is specified set this to + * a file descriptor for the cgroup. * * The structure is versioned by size and thus extensible. * New struct members must go at the end of the struct and @@ -97,11 +100,13 @@ struct clone_args { __aligned_u64 tls; __aligned_u64 set_tid; __aligned_u64 set_tid_size; + __aligned_u64 cgroup; }; #endif #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ /* * Scheduling policies diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index f2d7cea86ffe..191c329e482a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -38,10 +38,7 @@ static bool cgroup_no_v1_named; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Protects cgroup_subsys->release_agent_path. Modifying it also requires - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. - */ +/* protects cgroup_subsys->release_agent_path */ static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup1_ssid_disabled(int ssid) @@ -775,22 +772,29 @@ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; - mutex_lock(&cgroup_mutex); + /* snoop agent path and exit early if empty */ + if (!cgrp->root->release_agent_path[0]) + return; + /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); - if (!pathbuf || !agentbuf || !strlen(agentbuf)) - goto out; + agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathbuf || !agentbuf) + goto out_free; - spin_lock_irq(&css_set_lock); - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_irq(&css_set_lock); + spin_lock(&release_agent_path_lock); + strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); + spin_unlock(&release_agent_path_lock); + if (!agentbuf[0]) + goto out_free; + + ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0 || ret >= PATH_MAX) - goto out; + goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; @@ -801,11 +805,7 @@ void cgroup1_release_agent(struct work_struct *work) envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; - mutex_unlock(&cgroup_mutex); call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); - goto out_free; -out: - mutex_unlock(&cgroup_mutex); out_free: kfree(agentbuf); kfree(pathbuf); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 755c07d845ce..06b5ea9d899d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1966,7 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root->kf_root = kernfs_create_root(kf_sops, KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_SUPPORT_EXPORTOP, + KERNFS_ROOT_SUPPORT_EXPORTOP | + KERNFS_ROOT_SUPPORT_USER_XATTR, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); @@ -2726,11 +2727,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, { DEFINE_CGROUP_MGCTX(mgctx); struct task_struct *task; - int ret; - - ret = cgroup_migrate_vet_dst(dst_cgrp); - if (ret) - return ret; + int ret = 0; /* look up all src csets */ spin_lock_irq(&css_set_lock); @@ -4160,7 +4157,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, } else if (likely(!(pos->flags & CSS_RELEASED))) { next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); } else { - list_for_each_entry_rcu(next, &parent->children, sibling) + list_for_each_entry_rcu(next, &parent->children, sibling, + lockdep_is_held(&cgroup_mutex)) if (next->serial_nr > pos->serial_nr) break; } @@ -4403,29 +4401,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) lockdep_assert_held(&css_set_lock); - /* Advance to the next non-empty css_set */ - do { - cset = css_task_iter_next_css_set(it); - if (!cset) { - it->task_pos = NULL; - return; + /* Advance to the next non-empty css_set and find first non-empty tasks list*/ + while ((cset = css_task_iter_next_css_set(it))) { + if (!list_empty(&cset->tasks)) { + it->cur_tasks_head = &cset->tasks; + break; + } else if (!list_empty(&cset->mg_tasks)) { + it->cur_tasks_head = &cset->mg_tasks; + break; + } else if (!list_empty(&cset->dying_tasks)) { + it->cur_tasks_head = &cset->dying_tasks; + break; } - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); - - if (!list_empty(&cset->tasks)) { - it->task_pos = cset->tasks.next; - it->cur_tasks_head = &cset->tasks; - } else if (!list_empty(&cset->mg_tasks)) { - it->task_pos = cset->mg_tasks.next; - it->cur_tasks_head = &cset->mg_tasks; - } else { - it->task_pos = cset->dying_tasks.next; - it->cur_tasks_head = &cset->dying_tasks; } - - it->tasks_head = &cset->tasks; - it->mg_tasks_head = &cset->mg_tasks; - it->dying_tasks_head = &cset->dying_tasks; + if (!cset) { + it->task_pos = NULL; + return; + } + it->task_pos = it->cur_tasks_head->next; /* * We don't keep css_sets locked across iteration steps and thus @@ -4470,24 +4463,24 @@ static void css_task_iter_advance(struct css_task_iter *it) repeat: if (it->task_pos) { /* - * Advance iterator to find next entry. cset->tasks is - * consumed first and then ->mg_tasks. After ->mg_tasks, - * we move onto the next cset. + * Advance iterator to find next entry. We go through cset + * tasks, mg_tasks and dying_tasks, when consumed we move onto + * the next cset. */ if (it->flags & CSS_TASK_ITER_SKIPPED) it->flags &= ~CSS_TASK_ITER_SKIPPED; else it->task_pos = it->task_pos->next; - if (it->task_pos == it->tasks_head) { - it->task_pos = it->mg_tasks_head->next; - it->cur_tasks_head = it->mg_tasks_head; + if (it->task_pos == &it->cur_cset->tasks) { + it->cur_tasks_head = &it->cur_cset->mg_tasks; + it->task_pos = it->cur_tasks_head->next; } - if (it->task_pos == it->mg_tasks_head) { - it->task_pos = it->dying_tasks_head->next; - it->cur_tasks_head = it->dying_tasks_head; + if (it->task_pos == &it->cur_cset->mg_tasks) { + it->cur_tasks_head = &it->cur_cset->dying_tasks; + it->task_pos = it->cur_tasks_head->next; } - if (it->task_pos == it->dying_tasks_head) + if (it->task_pos == &it->cur_cset->dying_tasks) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ @@ -4505,12 +4498,12 @@ repeat: goto repeat; /* and dying leaders w/o live member threads */ - if (it->cur_tasks_head == it->dying_tasks_head && + if (it->cur_tasks_head == &it->cur_cset->dying_tasks && !atomic_read(&task->signal->live)) goto repeat; } else { /* skip all dying ones */ - if (it->cur_tasks_head == it->dying_tasks_head) + if (it->cur_tasks_head == &it->cur_cset->dying_tasks) goto repeat; } } @@ -4674,13 +4667,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v) return 0; } +static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) +{ + int ret; + struct inode *inode; + + lockdep_assert_held(&cgroup_mutex); + + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); + if (!inode) + return -ENOMEM; + + ret = inode_permission(inode, MAY_WRITE); + iput(inode); + return ret; +} + static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, struct super_block *sb) { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; - struct inode *inode; int ret; lockdep_assert_held(&cgroup_mutex); @@ -4690,12 +4698,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, com_cgrp = cgroup_parent(com_cgrp); /* %current should be authorized to migrate to the common ancestor */ - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); - if (!inode) - return -ENOMEM; - - ret = inode_permission(inode, MAY_WRITE); - iput(inode); + ret = cgroup_may_write(com_cgrp, sb); if (ret) return ret; @@ -4711,6 +4714,26 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, return 0; } +static int cgroup_attach_permissions(struct cgroup *src_cgrp, + struct cgroup *dst_cgrp, + struct super_block *sb, bool threadgroup) +{ + int ret = 0; + + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + if (ret) + return ret; + + ret = cgroup_migrate_vet_dst(dst_cgrp); + if (ret) + return ret; + + if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) + ret = -EOPNOTSUPP; + + return ret; +} + static ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -4733,8 +4756,8 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, true); if (ret) goto out_finish; @@ -4778,16 +4801,11 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of, spin_unlock_irq(&css_set_lock); /* thread migrations follow the cgroup.procs delegation rule */ - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb); + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, false); if (ret) goto out_finish; - /* and must be contained in the same domain */ - ret = -EOPNOTSUPP; - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) - goto out_finish; - ret = cgroup_attach_task(dst_cgrp, task, false); out_finish: @@ -5876,8 +5894,7 @@ out: * @child: pointer to task_struct of forking parent process. * * A task is associated with the init_css_set until cgroup_post_fork() - * attaches it to the parent's css_set. Empty cg_list indicates that - * @child isn't holding reference to its css_set. + * attaches it to the target css_set. */ void cgroup_fork(struct task_struct *child) { @@ -5885,21 +5902,172 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup_subsys_state *css; + struct cgroup *cgrp; + + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); + if (IS_ERR(css)) + return ERR_CAST(css); + + cgrp = css->cgroup; + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + + return cgrp; +} + +/** + * cgroup_css_set_fork - find or create a css_set for a child process + * @kargs: the arguments passed to create the child process + * + * This functions finds or creates a new css_set which the child + * process will be attached to in cgroup_post_fork(). By default, + * the child process will be given the same css_set as its parent. + * + * If CLONE_INTO_CGROUP is specified this function will try to find an + * existing css_set which includes the requested cgroup and if not create + * a new css_set that the child will be attached to later. If this function + * succeeds it will hold cgroup_threadgroup_rwsem on return. If + * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex + * before grabbing cgroup_threadgroup_rwsem and will hold a reference + * to the target cgroup. + */ +static int cgroup_css_set_fork(struct kernel_clone_args *kargs) + __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) +{ + int ret; + struct cgroup *dst_cgrp = NULL; + struct css_set *cset; + struct super_block *sb; + struct file *f; + + if (kargs->flags & CLONE_INTO_CGROUP) + mutex_lock(&cgroup_mutex); + + cgroup_threadgroup_change_begin(current); + + spin_lock_irq(&css_set_lock); + cset = task_css_set(current); + get_css_set(cset); + spin_unlock_irq(&css_set_lock); + + if (!(kargs->flags & CLONE_INTO_CGROUP)) { + kargs->cset = cset; + return 0; + } + + f = fget_raw(kargs->cgroup); + if (!f) { + ret = -EBADF; + goto err; + } + sb = f->f_path.dentry->d_sb; + + dst_cgrp = cgroup_get_from_file(f); + if (IS_ERR(dst_cgrp)) { + ret = PTR_ERR(dst_cgrp); + dst_cgrp = NULL; + goto err; + } + + if (cgroup_is_dead(dst_cgrp)) { + ret = -ENODEV; + goto err; + } + + /* + * Verify that we the target cgroup is writable for us. This is + * usually done by the vfs layer but since we're not going through + * the vfs layer here we need to do it "manually". + */ + ret = cgroup_may_write(dst_cgrp, sb); + if (ret) + goto err; + + ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, + !(kargs->flags & CLONE_THREAD)); + if (ret) + goto err; + + kargs->cset = find_css_set(cset, dst_cgrp); + if (!kargs->cset) { + ret = -ENOMEM; + goto err; + } + + put_css_set(cset); + fput(f); + kargs->cgrp = dst_cgrp; + return ret; + +err: + cgroup_threadgroup_change_end(current); + mutex_unlock(&cgroup_mutex); + if (f) + fput(f); + if (dst_cgrp) + cgroup_put(dst_cgrp); + put_css_set(cset); + if (kargs->cset) + put_css_set(kargs->cset); + return ret; +} + +/** + * cgroup_css_set_put_fork - drop references we took during fork + * @kargs: the arguments passed to create the child process + * + * Drop references to the prepared css_set and target cgroup if + * CLONE_INTO_CGROUP was requested. + */ +static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) +{ + cgroup_threadgroup_change_end(current); + + if (kargs->flags & CLONE_INTO_CGROUP) { + struct cgroup *cgrp = kargs->cgrp; + struct css_set *cset = kargs->cset; + + mutex_unlock(&cgroup_mutex); + + if (cset) { + put_css_set(cset); + kargs->cset = NULL; + } + + if (cgrp) { + cgroup_put(cgrp); + kargs->cgrp = NULL; + } + } +} + /** * cgroup_can_fork - called on a new task before the process is exposed - * @child: the task in question. + * @child: the child process * - * This calls the subsystem can_fork() callbacks. If the can_fork() callback - * returns an error, the fork aborts with that error code. This allows for - * a cgroup subsystem to conditionally allow or deny new forks. + * This prepares a new css_set for the child process which the child will + * be attached to in cgroup_post_fork(). + * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() + * callback returns an error, the fork aborts with that error code. This + * allows for a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child) +int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) { struct cgroup_subsys *ss; int i, j, ret; + ret = cgroup_css_set_fork(kargs); + if (ret) + return ret; + do_each_subsys_mask(ss, i, have_canfork_callback) { - ret = ss->can_fork(child); + ret = ss->can_fork(child, kargs->cset); if (ret) goto out_revert; } while_each_subsys_mask(); @@ -5911,54 +6079,64 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child); + ss->cancel_fork(child, kargs->cset); } + cgroup_css_set_put_fork(kargs); + return ret; } /** * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() |