summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/cgroup.h44
-rw-r--r--kernel/cgroup.c684
2 files changed, 608 insertions, 120 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c2515851c1aa..ada239253ec7 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -21,6 +21,7 @@
#include <linux/percpu-refcount.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
+#include <linux/wait.h>
#ifdef CONFIG_CGROUPS
@@ -164,6 +165,7 @@ struct cgroup {
struct cgroup *parent; /* my parent */
struct kernfs_node *kn; /* cgroup kernfs entry */
+ struct kernfs_node *control_kn; /* kn for "cgroup.subtree_control" */
/*
* Monotonically increasing unique serial number which defines a
@@ -173,8 +175,8 @@ struct cgroup {
*/
u64 serial_nr;
- /* The bitmask of subsystems attached to this cgroup */
- unsigned long subsys_mask;
+ /* the bitmask of subsystems enabled on the child cgroups */
+ unsigned long child_subsys_mask;
/* Private pointers for each registered subsystem */
struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
@@ -188,6 +190,15 @@ struct cgroup {
struct list_head cset_links;
/*
+ * On the default hierarchy, a css_set for a cgroup with some
+ * susbsys disabled will point to css's which are associated with
+ * the closest ancestor which has the subsys enabled. The
+ * following lists all css_sets which point to this cgroup's css
+ * for the given subsystem.
+ */
+ struct list_head e_csets[CGROUP_SUBSYS_COUNT];
+
+ /*
* Linked list running through all cgroups that can
* potentially be reaped by the release agent. Protected by
* release_list_lock
@@ -207,6 +218,9 @@ struct cgroup {
/* For css percpu_ref killing and RCU-protected deletion */
struct rcu_head rcu_head;
struct work_struct destroy_work;
+
+ /* used to wait for offlining of csses */
+ wait_queue_head_t offline_waitq;
};
#define MAX_CGROUP_ROOT_NAMELEN 64
@@ -282,6 +296,9 @@ enum {
struct cgroup_root {
struct kernfs_root *kf_root;
+ /* The bitmask of subsystems attached to this hierarchy */
+ unsigned long subsys_mask;
+
/* Unique id for this hierarchy. */
int hierarchy_id;
@@ -342,6 +359,9 @@ struct css_set {
*/
struct list_head cgrp_links;
+ /* the default cgroup associated with this css_set */
+ struct cgroup *dfl_cgrp;
+
/*
* Set of subsystem states, one for each subsystem. This array is
* immutable after creation apart from the init_css_set during
@@ -366,6 +386,15 @@ struct css_set {
struct cgroup *mg_src_cgrp;
struct css_set *mg_dst_cset;
+ /*
+ * On the default hierarhcy, ->subsys[ssid] may point to a css
+ * attached to an ancestor instead of the cgroup this css_set is
+ * associated with. The following node is anchored at
+ * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+ * iterate through all css's attached to a given cgroup.
+ */
+ struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
+
/* For RCU-protected deletion */
struct rcu_head rcu_head;
};
@@ -821,9 +850,14 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
/* A css_task_iter should be treated as an opaque object */
struct css_task_iter {
- struct cgroup_subsys_state *origin_css;
- struct list_head *cset_link;
- struct list_head *task;
+ struct cgroup_subsys *ss;
+
+ struct list_head *cset_pos;
+ struct list_head *cset_head;
+
+ struct list_head *task_pos;
+ struct list_head *tasks_head;
+ struct list_head *mg_tasks_head;
};
void css_task_iter_start(struct cgroup_subsys_state *css,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..809dd903ceb8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -182,6 +182,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned long ss_mask);
static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
@@ -208,6 +210,34 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
return &cgrp->dummy_css;
}
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest (%NULL returns the dummy_css)
+ *
+ * Similar to cgroup_css() but returns the effctive css, which is defined
+ * as the matching css of the nearest ancestor including self which has @ss
+ * enabled. If @ss is associated with the hierarchy @cgrp is on, this
+ * function is guaranteed to return non-NULL css.
+ */
+static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+ struct cgroup_subsys *ss)
+{
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!ss)
+ return &cgrp->dummy_css;
+
+ if (!(cgrp->root->subsys_mask & (1 << ss->id)))
+ return NULL;
+
+ while (cgrp->parent &&
+ !(cgrp->parent->child_subsys_mask & (1 << ss->id)))
+ cgrp = cgrp->parent;
+
+ return cgroup_css(cgrp, ss);
+}
+
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
@@ -273,7 +303,7 @@ static int notify_on_release(const struct cgroup *cgrp)
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
- * Should be called under cgroup_mutex.
+ * Should be called under cgroup_[tree_]mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
@@ -284,6 +314,20 @@ static int notify_on_release(const struct cgroup *cgrp)
else
/**
+ * for_each_e_css - iterate all effective css's of a cgroup
+ * @css: the iteration cursor
+ * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
+ * @cgrp: the target cgroup to iterate css's of
+ *
+ * Should be called under cgroup_[tree_]mutex.
+ */
+#define for_each_e_css(css, ssid, cgrp) \
+ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
+ if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
+ ; \
+ else
+
+/**
* for_each_subsys - iterate all enabled cgroup subsystems
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
@@ -296,6 +340,14 @@ static int notify_on_release(const struct cgroup *cgrp)
#define for_each_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
+/* iterate over child cgrps, lock should be held throughout iteration */
+#define cgroup_for_each_live_child(child, cgrp) \
+ list_for_each_entry((child), &(cgrp)->children, sibling) \
+ if (({ lockdep_assert_held(&cgroup_tree_mutex); \
+ cgroup_is_dead(child); })) \
+ ; \
+ else
+
/**
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
* @cgrp: the cgroup to be checked for liveness
@@ -383,6 +435,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
static void put_css_set_locked(struct css_set *cset, bool taskexit)
{
struct cgrp_cset_link *link, *tmp_link;
+ struct cgroup_subsys *ss;
+ int ssid;
lockdep_assert_held(&css_set_rwsem);
@@ -390,6 +444,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
return;
/* This css_set is dead. unlink it and release cgroup refcounts */
+ for_each_subsys(ss, ssid)
+ list_del(&cset->e_cset_node[ssid]);
hash_del(&cset->hlist);
css_set_count--;
@@ -452,20 +508,20 @@ static bool compare_css_sets(struct css_set *cset,
{
struct list_head *l1, *l2;
- if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
- /* Not all subsystems matched */
+ /*
+ * On the default hierarchy, there can be csets which are
+ * associated with the same set of cgroups but different csses.
+ * Let's first ensure that csses match.
+ */
+ if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
- }
/*
* Compare cgroup pointers in order to distinguish between
- * different cgroups in heirarchies with no subsystems. We
- * could get by with just this check alone (and skip the
- * memcmp above) but on most setups the memcmp check will
- * avoid the need for this more expensive check on almost all
- * candidates.
+ * different cgroups in hierarchies. As different cgroups may
+ * share the same effective css, this comparison is always
+ * necessary.
*/
-
l1 = &cset->cgrp_links;
l2 = &old_cset->cgrp_links;
while (1) {
@@ -529,14 +585,17 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
- if (root->cgrp.subsys_mask & (1UL << i)) {
- /* Subsystem is in this hierarchy. So we want
- * the subsystem state from the new
- * cgroup */
- template[i] = cgroup_css(cgrp, ss);
+ if (root->subsys_mask & (1UL << i)) {
+ /*
+ * @ss is in this hierarchy, so we want the
+ * effective css from @cgrp.
+ */
+ template[i] = cgroup_e_css(cgrp, ss);
} else {
- /* Subsystem is not in this hierarchy, so we
- * don't want to change the subsystem state */
+ /*
+ * @ss is not in this hierarchy, so we don't want
+ * to change the css.
+ */
template[i] = old_cset->subsys[i];
}
}
@@ -602,6 +661,10 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
struct cgrp_cset_link *link;
BUG_ON(list_empty(tmp_links));
+
+ if (cgroup_on_dfl(cgrp))
+ cset->dfl_cgrp = cgrp;
+
link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
link->cset = cset;
link->cgrp = cgrp;
@@ -628,7 +691,9 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
+ struct cgroup_subsys *ss;
unsigned long key;
+ int ssid;
lockdep_assert_held(&cgroup_mutex);
@@ -679,10 +744,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
css_set_count++;
- /* Add this cgroup group to the hash table */
+ /* Add @cset to the hash table */
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
+ for_each_subsys(ss, ssid)
+ list_add_tail(&cset->e_cset_node[ssid],
+ &cset->subsys[ssid]->cgroup->e_csets[ssid]);
+
up_write(&css_set_rwsem);
return cset;
@@ -742,7 +811,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
BUG_ON(!list_empty(&cgrp->children));
/* Rebind all subsystems back to the default hierarchy */
- rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
+ rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
/*
* Release all the links from cset_links to this hierarchy's
@@ -983,7 +1052,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned long ss_mask)
{
struct cgroup_subsys *ss;
- int ssid, ret;
+ int ssid, i, ret;
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
@@ -992,16 +1061,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
if (!(ss_mask & (1 << ssid)))
continue;
- /* if @ss is on the dummy_root, we can always move it */
- if (ss->root == &cgrp_dfl_root)
- continue;
-
- /* if @ss has non-root cgroups attached to it, can't move */
- if (!list_empty(&ss->root->cgrp.children))
+ /* if @ss has non-root csses attached to it, can't move */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
return -EBUSY;
/* can't move between two non-dummy roots either */
- if (dst_root != &cgrp_dfl_root)
+ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
}
@@ -1036,6 +1101,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
for_each_subsys(ss, ssid) {
struct cgroup_root *src_root;
struct cgroup_subsys_state *css;
+ struct css_set *cset;
if (!(ss_mask & (1 << ssid)))
continue;
@@ -1050,8 +1116,19 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
ss->root = dst_root;
css->cgroup = &dst_root->cgrp;
- src_root->cgrp.subsys_mask &= ~(1 << ssid);
- dst_root->cgrp.subsys_mask |= 1 << ssid;
+ down_write(&css_set_rwsem);
+ hash_for_each(css_set_table, i, cset, hlist)
+ list_move_tail(&cset->e_cset_node[ss->id],
+ &dst_root->cgrp.e_csets[ss->id]);
+ up_write(&css_set_rwsem);
+
+ src_root->subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+
+ /* default hierarchy doesn't enable controllers by default */
+ dst_root->subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root)
+ dst_root->cgrp.child_subsys_mask |= 1 << ssid;
if (ss->bind)
ss->bind(css);
@@ -1069,7 +1146,7 @@ static int cgroup_show_options(struct seq_file *seq,
int ssid;
for_each_subsys(ss, ssid)
- if (root->cgrp.subsys_mask & (1 << ssid))
+ if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
seq_puts(seq, ",sane_behavior");
@@ -1273,12 +1350,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
if (ret)
goto out_unlock;
- if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
+ if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
- added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
- removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
+ added_mask = opts.subsys_mask & ~root->subsys_mask;
+ removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
@@ -1369,6 +1446,9 @@ out_unlock:
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
+ struct cgroup_subsys *ss;
+ int ssid;
+
atomic_set(&cgrp->refcnt, 1);
INIT_LIST_HEAD(&cgrp->sibling);
INIT_LIST_HEAD(&cgrp->children);
@@ -1377,6 +1457,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->dummy_css.cgroup = cgrp;
+
+ for_each_subsys(ss, ssid)
+ INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
+
+ init_waitqueue_head(&cgrp->offline_waitq);
}
static void init_cgroup_root(struct cgroup_root *root,
@@ -1495,7 +1580,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
*/
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
-retry:
+
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
@@ -1503,7 +1588,7 @@ retry:
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
-
+retry:
/* look for a matching existing root */
if (!opts.subsys_mask && !opts.none && !opts.name) {
cgrp_dfl_root_visible = true;
@@ -1535,7 +1620,7 @@ retry:
* subsystems) then they must match.
*/
if ((opts.subsys_mask || opts.none) &&
- (opts.subsys_mask != root->cgrp.subsys_mask)) {
+ (opts.subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
ret = -EBUSY;
@@ -1562,9 +1647,9 @@ retry:
if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
- kfree(opts.release_agent);
- kfree(opts.name);
msleep(10);
+ mutex_lock(&cgroup_tree_mutex);
+ mutex_lock(&cgroup_mutex);
goto retry;
}
@@ -1829,10 +1914,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
- /* nothing to do if this cset already belongs to the cgroup */
- if (src_cgrp == dst_cgrp)
- return;
-
if (!list_empty(&src_cset->mg_preload_node))
return;
@@ -1847,13 +1928,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @dst_cgrp: the destination cgroup
+ * @dst_cgrp: the destination cgroup (may be %NULL)
* @preloaded_csets: list of preloaded source css_sets
*
* Tasks are about to be moved to @dst_cgrp and all the source css_sets
* have been preloaded to @preloaded_csets. This function looks up and
- * pins all destination css_sets, links each to its source, and put them on
- * @preloaded_csets.
+ * pins all destination css_sets, links each to its source, and append them
+ * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each
+ * source css_set is assumed to be its cgroup on the default hierarchy.
*
* This function must be called after cgroup_migrate_add_src() has been
* called on each migration source css_set. After migration is performed
@@ -1864,19 +1946,42 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
struct list_head *preloaded_csets)
{
LIST_HEAD(csets);
- struct css_set *src_cset;
+ struct css_set *src_cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent &&
+ dst_cgrp->child_subsys_mask)
+ return -EBUSY;
+
/* look up the dst cset for each src cset and link it to src */
- list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
+ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
struct css_set *dst_cset;
- dst_cset = find_css_set(src_cset, dst_cgrp);
+ dst_cset = find_css_set(src_cset,
+ dst_cgrp ?: src_cset->dfl_cgrp);
if (!dst_cset)
goto err;
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
+
+ /*
+ * If src cset equals dst, it's noop. Drop the src.
+ * cgroup_migrate() will skip the cset too. Note that we
+ * can't handle src == dst as some nodes are used by both.
+ */
+ if (src_cset == dst_cset) {
+ src_cset->mg_src_cgrp = NULL;
+ list_del_init(&src_cset->mg_preload_node);
+ put_css_set(src_cset, false);
+ put_css_set(dst_cset, false);
+ continue;
+ }
+
src_cset->mg_dst_cset = dst_cset;
if (list_empty(&dst_cset->mg_preload_node))
@@ -1885,7 +1990,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
put_css_set(dst_cset, false);
}
- list_splice(&csets, preloaded_csets);
+ list_splice_tail(&csets, preloaded_csets);
return 0;
err:
cgroup_migrate_finish(&csets);
@@ -1966,7 +2071,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
return 0;
/* check that we can legitimately attach to the cgroup */
- for_each_css(css, i, cgrp) {
+ for_each_e_css(css, i, cgrp) {
if (css->ss->can_attach) {
ret = css->ss->can_attach(css, &tset);
if (ret) {
@@ -1996,7 +2101,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
*/
tset.csets = &tset.dst_csets;
- for_each_css(css, i, cgrp)
+ for_each_e_css(css, i, cgrp)
if (css->ss->attach)
css->ss->attach(css, &tset);
@@ -2004,7 +2109,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
goto out_release_tset;
out_cancel_attach:
- for_each_css(css, i, cgrp) {
+ for_each_e_css(css, i, cgrp) {
if (css == failed_css)
break;
if (css->ss->cancel_attach)
@@ -2218,6 +2323,326 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
return 0;
}
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+{
+ struct cgroup_subsys *ss;
+ bool printed = false;
+ int ssid;
+
+ for_each_subsys(ss, ssid) {
+ if (ss_mask & (1 << ssid)) {
+ if (printed)
+ seq_putc(seq, ' ');
+ seq_printf(seq, "%s", ss->name);
+ printed = true;
+ }
+ }
+ if (printed)
+ seq_putc(seq, '\n');
+}
+
+/* show controllers which are currently attached to the default hierarchy */
+static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->root->subsys_mask);
+ return 0;
+}
+
+/* show controllers which are enabled from the parent */
+static int cgroup_controllers_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask);
+ return 0;
+}
+
+/* show controllers which are enabled for a given cgroup's children */
+static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ return 0;
+}
+
+/**
+ * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
+ * @cgrp: root of the subtree to update csses for
+ *
+ * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
+ * css associations need to be updated accordingly. This function looks up
+ * all css_sets which are attached to the subtree, creates the matching
+ * updated css_sets and migrates the tasks to the new ones.
+ */
+static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+{
+ LIST_HEAD(preloaded_csets);
+ struct cgroup_subsys_state *css;
+ struct css_set *src_cset;
+ int ret;
+
+ lockdep_assert_held(&cgroup_tree_mutex);
+ lockdep_assert_held(&cgroup_mutex);
+
+ /* look up all csses currently attached to @cgrp's subtree */
+ down_read(&css_set_rwsem);
+ css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+ struct cgrp_cset_link *link;
+
+ /* self is not affected by child_subsys_mask change */
+ if (css->cgroup == cgrp)
+ continue;
+
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
+ cgroup_migrate_add_src(link->cset, cgrp,
+ &preloaded_csets);
+ }
+ up_read(&css_set_rwsem);
+
+ /* NULL dst indicates self on default hierarchy */
+ ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
+ if (ret)
+ goto out_finish;
+
+ list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
+ struct task_struct *last_task = NULL, *task;
+
+ /* src_csets precede dst_csets, break on the first dst_cset */
+ if (!src_cset->mg_src_cgrp)
+ break;
+
+ /*
+ * All tasks in src_cset need to be migrated to the
+ * matching dst_cset. Empty it process by process. We
+ * walk tasks but migrate processes. The leader might even
+ * belong to a different cset but such src_cset would also
+ * be among the target src_csets because the default
+ * hierarchy enforces per-process membership.
+ */
+ while (true) {
+ down_read(&css_set_rwsem);
+ task = list_first_entry_or_null(&src_cset->tasks,
+ struct task_struct, cg_list);
+ if (task) {
+ task = task->group_leader;
+ WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
+ get_task_struct(task);
+ }
+ up_read(&css_set_rwsem);
+
+ if (!task)
+ break;
+
+ /* guard against possible infinite loop */
+ if (WARN(last_task == task,
+ "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
+ goto out_finish;
+ last_task = task;
+
+ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
+ threadgroup_unlock(task);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+ goto out_finish;
+ }
+ }
+
+out_finish:
+ cgroup_migrate_finish(&preloaded_csets);
+ return ret;
+}
+
+/* change the enabled child controllers for a cgroup in the default hierarchy */
+static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css,
+ struct cftype *cft, char *buffer)
+{
+ unsigned long enable_req = 0, disable_req = 0, enable, disable;
+ struct cgroup *cgrp = dummy_css->cgroup, *child;
+ struct cgroup_subsys *ss;
+ char *tok, *p;
+ int ssid, ret;
+
+ /*
+ * Parse input - white space separated list of subsystem names
+ * prefixed with either + or -.
+ */
+ p = buffer;
+ while ((tok = strsep(&p, " \t\n"))) {
+ for_each_subsys(ss, ssid) {
+ if (ss->disabled || strcmp(tok + 1, ss->name))
+ continue;
+
+ if (*tok == '+') {
+ enable_req |= 1 << ssid;
+ disable_req &= ~(1 << ssid);
+ } else if (*tok == '-') {
+ disable_req |= 1 << ssid;
+ enable_req &= ~(1 << ssid);
+ } else {
+ return -EINVAL;
+ }
+ break;
+ }
+ if (ssid == CGROUP_SUBSYS_COUNT)
+ return -EINVAL;
+ }
+
+ /*
+ * We're gonna grab cgroup_tree_mutex which nests outside kernfs
+ * active_ref. cgroup_lock_live_group() already provides enough
+ * protection. Ensure @cgrp stays accessible and break the
+ * active_ref protection.
+ */
+ cgroup_get(cgrp);
+ kernfs_break_active_protection(cgrp->control_kn);
+retry:
+ enable = enable_req;
+ disable = disable_req;
+
+ mutex_lock(&cgroup_tree_mutex);
+
+ for_each_subsys(ss, ssid) {
+ if (enable & (1 << ssid)) {
+ if (cgrp->child_subsys_mask & (1 << ssid)) {
+ enable &= ~(1 << ssid);
+ continue;
+ }
+
+ /*
+ * Because css offlining is asynchronous, userland
+ * might try to re-enable the same controller while
+ * the previous instance is still around. In such
+ * cases, wait till it's gone using offline_waitq.
+ */
+ cgroup_for_each_live_child(child, cgrp) {
+ wait_queue_t wait;
+
+ if (!cgroup_css(child, ss))
+ continue;
+
+ prepare_to_wait(&child->offline_waitq, &wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&cgroup_tree_mutex);
+ schedule();
+ finish_wait(&child->offline_waitq, &wait);
+ goto retry;
+ }
+
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgrp->parent &&
+ !(cgrp->parent->child_subsys_mask & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock_tree;
+ }
+ } else if (disable & (1 << ssid)) {
+ if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ disable &= ~(1 << ssid);
+ continue;
+ }
+
+ /* a child has it enabled? */
+ cgroup_for_each_live_child(child, cgrp) {
+ if (child->child_subsys_mask & (1 << ssid)) {
+ ret = -EBUSY;
+ goto out_unlock_tree;
+ }
+ }
+ }
+ }
+
+ if (!enable && !disable) {
+ ret = 0;
+ goto out_unlock_tree;
+ }
+
+ if (!cgroup_lock_live_group(cgrp)) {
+ ret = -ENODEV;
+ goto out_unlock_tree;
+ }
+
+ /*
+ * Except for the root, child_subsys_mask must be zero for a cgroup
+ * with tasks so that child cgroups don't compete against tasks.
+ */
+ if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ /*
+ * Create csses for enables and update child_subsys_mask. This
+ * changes cgroup_e_css() results which in turn makes the
+ * subsequent cgroup_update_dfl_csses() associate all tasks in the
+ * subtree to the updated csses.
+ */
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ ret = create_css(child, ss);
+ if (ret)
+ goto err_undo_css;
+ }
+ }
+
+ cgrp->child_subsys_mask |= enable;
+ cgrp->child_subsys_mask &= ~disable;
+
+ ret = cgroup_update_dfl_csses(cgrp);
+ if (ret)
+ goto err_undo_css;
+
+ /* all tasks are now migrated away from the old csses, kill them */
+ for_each_subsys(ss, ssid) {
+ if (!(disable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp)
+ kill_css(cgroup_css(child, ss));
+ }
+
+ kernfs_activate(cgrp->kn);
+ ret = 0;
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+out_unlock_tree:
+ mutex_unlock(&cgroup_tree_mutex);
+ kernfs_unbreak_active_protection(cgrp->control_kn);
+ cgroup_put(cgrp);
+ return ret;
+
+err_undo_css:
+ cgrp->child_subsys_mask &= ~enable;
+ cgrp->child_subsys_mask |= disable;
+
+ for_each_subsys(ss, ssid) {
+ if (!(enable & (1 << ssid)))
+ continue;
+
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+ if (css)
+ kill_css(css);
+ }
+ }
+ goto out_unlock;
+}
+
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
@@ -2377,9 +2802,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
return PTR_ERR(kn);
ret = cgroup_kn_set_ugid(kn);
- if (ret)
+ if (ret) {
kernfs_remove(kn);
- return ret;
+ return ret;
+ }
+
+ if (cft->seq_show == cgroup_subtree_control_show)
+ cgrp->control_kn = kn;
+ return 0;
}
/**
@@ -2436,10 +2866,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
lockdep_assert_held(&cgroup_tree_mutex);
- /* don't bother if @ss isn't attached */
- if (ss->root == &cgrp_dfl_root)
- return 0;
-
/* add/rm files for all cgroups created before */
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
struct cgroup *cgrp = css->cgroup;
@@ -2641,10 +3067,19 @@ css_next_child(struct cgroup_subsys_state *pos_css,
break;
}
- if (&next->sibling == &cgrp->children)
- return NULL;
+ /*
+ * @next, if not pointing to the head, can be dereferenced and is
+ * the next sibling; however, it might have @ss disabled. If so,
+ * fast-forward to the next enabled one.
+ */
+ while (&next->sibling != &cgrp->children) {
+ struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss);
- return cgroup_css(next, parent_css->ss);
+ if (next_css)
+ return next_css;
+ next = list_entry_rcu(next->sibling.next, struct cgroup, sibling);
+ }
+ return NULL;
}
/**
@@ -2781,27 +3216,36 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
*/
static void css_advance_task_iter(struct css_task_iter *it)
{
- struct list_head *l = it->cset_link;
+ struct list_head *l = it->cset_pos;
struct cgrp_cset_link *link;
struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
- if (l == &it->origin_css->cgroup->cset_links) {
- it->cset_link = NULL;
+ if (l == it->cset_head) {
+ it->cset_pos = NULL;
return;
}
- link = list_entry(l, struct cgrp_cset_link, cset_link);
- cset = link->cset;
+
+ if (it->ss) {
+ cset = container_of(l, struct css_set,
+ e_cset_node[it->ss->id]);
+ } else {
+ link = list_entry(l, struct cgrp_cset_link, cset_link);
+ cset = link->cset;
+ }
} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
- it->cset_link = l;
+ it->cset_pos = l;
if (!list_empty(&cset->tasks))
- it->task = cset->tasks.next;
+ it->task_pos = cset->tasks.next;
else
- it->task = cset->mg_tasks.next;
+ it->task_pos = cset->mg_tasks.next;
+
+ it->tasks_head = &cset->tasks;
+ it->mg_tasks_head = &cset->mg_tasks;
}
/**
@@ -2827,8 +3271,14 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
down_read(&css_set_rwsem);
- it->origin_css = css;
- it->cset_link = &css->cgroup->cset_links;
+ it->ss = css->ss;
+
+ if (it->ss)
+ it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+ else
+ it->cset_pos = &css->cgroup->cset_links;
+
+ it->cset_head = it->cset_pos;
css_advance_task_iter(it);
}
@@ -2844,12 +3294,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
- struct list_head *l = it->task;
- struct cgrp_cset_link *link = list_entry(it->cset_link,
- struct cgrp_cset_link, cset_link);
+ struct list_head *l = it->task_pos;
/* If the iterator cg is NULL, we have no tasks */
- if (!it->cset_link)
+ if (!it->cset_pos)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
@@ -2860,13 +3308,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
*/
l = l->next;
- if (l == &link->cset->tasks)
- l = link->cset->mg_tasks.next;
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
- if (l == &link->cset->mg_tasks)
+ if (l == it->mg_tasks_head)
css_advance_task_iter(it);
else
- it->task = l;
+ it->task_pos = l;
return res;
}
@@ -3454,6 +3902,22 @@ static struct cftype cgroup_base_files[] = {
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_sane_behavior_show,
},
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_root_controllers_show,
+ },
+ {
+ .name = "cgroup.controllers",
+ .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_controllers_show,
+ },
+ {
+ .name = "cgroup.subtree_control",
+ .flags = CFTYPE_ONLY_ON_DFL,
+ .seq_show = cgroup_subtree_control_show,
+ .write_string = cgroup_subtree_control_write,
+ },
/*
* Historical crazy stuff. These don't have "cgroup." prefix and
@@ -3621,7 +4085,9 @@ static void offline_css(struct cgroup_subsys_state *css)
css->flags &= ~CSS_ONLINE;
css->cgroup->nr_css--;
- RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
+ RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
+
+ wake_up_all(&css->cgroup->offline_waitq);
}
/**
@@ -3662,8 +4128,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
cgroup_get(cgrp);
css_get(css->parent);
- cgrp->subsys_mask |= 1 << ss->id;
-
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
@@ -3699,13 +4163,6 @@ static long cgroup_create(struct cgroup *parent, const char *name,
struct cgroup_subsys *ss;
struct kernfs_node *kn;
- /*
- * XXX: The default hierarchy isn't fully implemented yet. Block
- * !root cgroup creation on it for now.
- */
- if (root == &cgrp_dfl_root)
- return -EINVAL;
-
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
<