From ea8fd3b47ff4ed4b1b5942bf3e0cb8d8f590ec59 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: cgroup_apply_cftypes() shouldn't skip the default hierarchy cgroup_apply_cftypes() skips creating or removing files if the subsystem is attached to the default hierarchy, which led to missing files in the root of the default hierarchy. Skipping made sense when the default hierarchy was dummy; however, now that the default hierarchy is fully functional and planned to be used as the unified hierarchy, it shouldn't be skipped over. Reported-by: Li Zefan Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11a03d67635a..a6894272353b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2436,10 +2436,6 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) lockdep_assert_held(&cgroup_tree_mutex); - /* don't bother if @ss isn't attached */ - if (ss->root == &cgrp_dfl_root) - return 0; - /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; -- cgit v1.2.3 From f392e51cd6ae6f6ee5b9b6d611cdc282b4c1711e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: update cgroup->subsys_mask to ->child_subsys_mask and restore cgroup_root->subsys_mask 944196278d3d ("cgroup: move ->subsys_mask from cgroupfs_root to cgroup") moved ->subsys_mask from cgroup_root to cgroup to prepare for the unified hierarchy; however, it turns out that carrying the subsys_mask of the children in the parent, instead of itself, is a lot more natural. This patch restores cgroup_root->subsys_mask and morphs cgroup->subsys_mask into cgroup->child_subsys_mask. * Uses of root->cgrp.subsys_mask are restored to root->subsys_mask. 
* Remove automatic setting and clearing of cgrp->subsys_mask and instead just inherit ->child_subsys_mask from the parent during cgroup creation. Note that this doesn't affect any current behaviors. * Undo __kill_css() separation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 64 ++++++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 37 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a6894272353b..f944619077f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -529,7 +529,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->cgrp.subsys_mask & (1UL << i)) { + if (root->subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -742,7 +742,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); + rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -1050,8 +1050,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; - src_root->cgrp.subsys_mask &= ~(1 << ssid); - dst_root->cgrp.subsys_mask |= 1 << ssid; + src_root->subsys_mask &= ~(1 << ssid); + src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + + dst_root->subsys_mask |= 1 << ssid; + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -1069,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, int ssid; for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ 
-1273,12 +1276,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; - removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1535,7 +1538,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->cgrp.subsys_mask)) { + (opts.subsys_mask != root->subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -3658,8 +3661,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) cgroup_get(cgrp); css_get(css->parent); - cgrp->subsys_mask |= 1 << ss->id; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. 
Nested cgroups may change behavior in the future.\n", @@ -3780,13 +3781,15 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->cgrp.subsys_mask & (1 << ssid)) { + if (parent->child_subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; } } + cgrp->child_subsys_mask = parent->child_subsys_mask; + kernfs_activate(kn); mutex_unlock(&cgroup_mutex); @@ -3882,7 +3885,16 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -static void __kill_css(struct cgroup_subsys_state *css) +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) { lockdep_assert_held(&cgroup_tree_mutex); @@ -3911,28 +3923,6 @@ static void __kill_css(struct cgroup_subsys_state *css) percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } -/** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. 
- */ -static void kill_css(struct cgroup_subsys_state *css) -{ - struct cgroup *cgrp = css->cgroup; - - lockdep_assert_held(&cgroup_tree_mutex); - - /* if already killed, noop */ - if (cgrp->subsys_mask & (1 << css->ss->id)) { - cgrp->subsys_mask &= ~(1 << css->ss->id); - __kill_css(css); - } -} - /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4145,7 +4135,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; + cgrp_dfl_root.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4302,7 +4292,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->cgrp.subsys_mask & (1 << ssid)) + if (root->subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", -- cgit v1.2.3 From aec3dfcb2e43892180ee053e8c260dcdeccf4392 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:14 -0400 Subject: cgroup: introduce effective cgroup_subsys_state In the planned default unified hierarchy, controllers may get dynamically attached to and detached from a cgroup and a cgroup may not have csses for all the controllers associated with the hierarchy. When a cgroup doesn't have its own css for a given controller, the css of the nearest ancestor with the controller enabled will be used, which is called the effective css. This patch introduces cgroup_e_css() and for_each_e_css() to access the effective csses and convert compare_css_sets(), find_existing_css_set() and cgroup_migrate() to use the effective csses so that they can handle cgroups with partial csses correctly. This means that for two css_sets to be considered identical, they should have both matching csses and cgroups. 
compare_css_sets() already compares both, not for correctness but for optimization. As this now becomes a matter of correctness, update the comments accordingly. For all !default hierarchies, cgroup_e_css() always equals cgroup_css(), so this patch doesn't change behavior. While at it, fix incorrect locking comment for for_each_css(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 83 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 19 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f944619077f4..4eb2dd1bb5b1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -208,6 +208,34 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, return &cgrp->dummy_css; } +/** + * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) + * + * Similar to cgroup_css() but returns the effctive css, which is defined + * as the matching css of the nearest ancestor including self which has @ss + * enabled. If @ss is associated with the hierarchy @cgrp is on, this + * function is guaranteed to return non-NULL css. + */ +static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + lockdep_assert_held(&cgroup_mutex); + + if (!ss) + return &cgrp->dummy_css; + + if (!(cgrp->root->subsys_mask & (1 << ss->id))) + return NULL; + + while (cgrp->parent && + !(cgrp->parent->child_subsys_mask & (1 << ss->id))) + cgrp = cgrp->parent; + + return cgroup_css(cgrp, ss); +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { @@ -273,7 +301,7 @@ static int notify_on_release(const struct cgroup *cgrp) * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end * @cgrp: the target cgroup to iterate css's of * - * Should be called under cgroup_mutex. 
+ * Should be called under cgroup_[tree_]mutex. */ #define for_each_css(css, ssid, cgrp) \ for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ @@ -283,6 +311,20 @@ static int notify_on_release(const struct cgroup *cgrp) lockdep_is_held(&cgroup_mutex)))) { } \ else +/** + * for_each_e_css - iterate all effective css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_[tree_]mutex. + */ +#define for_each_e_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ + ; \ + else + /** * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor @@ -452,20 +494,20 @@ static bool compare_css_sets(struct css_set *cset, { struct list_head *l1, *l2; - if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { - /* Not all subsystems matched */ + /* + * On the default hierarchy, there can be csets which are + * associated with the same set of cgroups but different csses. + * Let's first ensure that csses match. + */ + if (memcmp(template, cset->subsys, sizeof(cset->subsys))) return false; - } /* * Compare cgroup pointers in order to distinguish between - * different cgroups in heirarchies with no subsystems. We - * could get by with just this check alone (and skip the - * memcmp above) but on most setups the memcmp check will - * avoid the need for this more expensive check on almost all - * candidates. + * different cgroups in hierarchies. As different cgroups may + * share the same effective css, this comparison is always + * necessary. */ - l1 = &cset->cgrp_links; l2 = &old_cset->cgrp_links; while (1) { @@ -530,13 +572,16 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, */ for_each_subsys(ss, i) { if (root->subsys_mask & (1UL << i)) { - /* Subsystem is in this hierarchy. 
So we want - * the subsystem state from the new - * cgroup */ - template[i] = cgroup_css(cgrp, ss); + /* + * @ss is in this hierarchy, so we want the + * effective css from @cgrp. + */ + template[i] = cgroup_e_css(cgrp, ss); } else { - /* Subsystem is not in this hierarchy, so we - * don't want to change the subsystem state */ + /* + * @ss is not in this hierarchy, so we don't want + * to change the css. + */ template[i] = old_cset->subsys[i]; } } @@ -1969,7 +2014,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, return 0; /* check that we can legitimately attach to the cgroup */ - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css->ss->can_attach) { ret = css->ss->can_attach(css, &tset); if (ret) { @@ -1999,7 +2044,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, */ tset.csets = &tset.dst_csets; - for_each_css(css, i, cgrp) + for_each_e_css(css, i, cgrp) if (css->ss->attach) css->ss->attach(css, &tset); @@ -2007,7 +2052,7 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, goto out_release_tset; out_cancel_attach: - for_each_css(css, i, cgrp) { + for_each_e_css(css, i, cgrp) { if (css == failed_css) break; if (css->ss->cancel_attach) -- cgit v1.2.3 From 2d8f243a5e6efa57fb7c46fe83fafa45b33d0ec2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: implement cgroup->e_csets[] On the default unified hierarchy, a cgroup may be associated with csses of its ancestors, which means that a css of a given cgroup may be associated with css_sets of descendant cgroups. This means that we can't walk all tasks associated with a css by iterating the css_sets associated with the cgroup as there are css_sets which are pointing to the css but linked on the descendants. This patch adds per-subsystem list heads cgroup->e_csets[]. 
Any css_set which is pointing to a css is linked to css->cgroup->e_csets[$SUBSYS_ID] through css_set->e_cset_node[$SUBSYS_ID]. The lists are protected by css_set_rwsem and will allow us to walk all css_sets associated with a given css so that we can find out all associated tasks. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4eb2dd1bb5b1..37d966289978 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -425,6 +425,8 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; + struct cgroup_subsys *ss; + int ssid; lockdep_assert_held(&css_set_rwsem); @@ -432,6 +434,8 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) return; /* This css_set is dead. unlink it and release cgroup refcounts */ + for_each_subsys(ss, ssid) + list_del(&cset->e_cset_node[ssid]); hash_del(&cset->hlist); css_set_count--; @@ -673,7 +677,9 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct css_set *cset; struct list_head tmp_links; struct cgrp_cset_link *link; + struct cgroup_subsys *ss; unsigned long key; + int ssid; lockdep_assert_held(&cgroup_mutex); @@ -724,10 +730,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_set_count++; - /* Add this cgroup group to the hash table */ + /* Add @cset to the hash table */ key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); + for_each_subsys(ss, ssid) + list_add_tail(&cset->e_cset_node[ssid], + &cset->subsys[ssid]->cgroup->e_csets[ssid]); + up_write(&css_set_rwsem); return cset; @@ -1028,7 +1038,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { struct cgroup_subsys *ss; - int ssid, ret; + int ssid, i, ret; 
lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -1081,6 +1091,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, for_each_subsys(ss, ssid) { struct cgroup_root *src_root; struct cgroup_subsys_state *css; + struct css_set *cset; if (!(ss_mask & (1 << ssid))) continue; @@ -1095,6 +1106,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->root = dst_root; css->cgroup = &dst_root->cgrp; + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + list_move_tail(&cset->e_cset_node[ss->id], + &dst_root->cgrp.e_csets[ss->id]); + up_write(&css_set_rwsem); + src_root->subsys_mask &= ~(1 << ssid); src_root->cgrp.child_subsys_mask &= ~(1 << ssid); @@ -1417,6 +1434,9 @@ out_unlock: static void init_cgroup_housekeeping(struct cgroup *cgrp) { + struct cgroup_subsys *ss; + int ssid; + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); @@ -1425,6 +1445,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; + + for_each_subsys(ss, ssid) + INIT_LIST_HEAD(&cgrp->e_csets[ssid]); } static void init_cgroup_root(struct cgroup_root *root, @@ -4249,6 +4272,9 @@ int __init cgroup_init(void) if (!ss->early_init) cgroup_init_subsys(ss); + list_add_tail(&init_css_set.e_cset_node[ssid], + &cgrp_dfl_root.cgrp.e_csets[ssid]); + /* * cftype registration needs kmalloc and can't be done * during early_init. Register base cftypes separately. -- cgit v1.2.3 From 3b281afbc3a06cd69c54e6db1a04a8e73997723f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: make css_next_child() skip missing csses css_next_child() walks the children of the specified css. It does this by finding the next cgroup and then returning the requested css. 
On the default unified hierarchy, a cgroup may not have a css associated with it even if the hierarchy has the subsystem enabled. This patch updates css_next_child() so that it skips children without the requested css associated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 37d966289978..0edc186cd545 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2708,10 +2708,19 @@ css_next_child(struct cgroup_subsys_state *pos_css, break; } - if (&next->sibling == &cgrp->children) - return NULL; + /* + * @next, if not pointing to the head, can be dereferenced and is + * the next sibling; however, it might have @ss disabled. If so, + * fast-forward to the next enabled one. + */ + while (&next->sibling != &cgrp->children) { + struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); - return cgroup_css(next, parent_css->ss); + if (next_css) + return next_css; + next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); + } + return NULL; } /** -- cgit v1.2.3 From 0f0a2b4fa6210147131082999f1f16d7fb79abf8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: reorganize css_task_iter This patch reorganizes css_task_iter so that adding effective css support is easier. * s/->cset_link/->cset_pos/ and s/->task/->task_pos/ for consistency * ->origin_css is used to determine whether the iteration reached the last css_set. Replace it with explicit ->cset_head so that css_advance_task_iter() doesn't have to know the termination condition directly. * css_task_iter_next() currently assumes that it's walking list of cgrp_cset_link and reaches into the current cset through the current link to determine the termination conditions for task walking. 
As this won't always be true for effective css walking, add ->tasks_head and ->mg_tasks_head and use them to control task walking so that css_task_iter_next() doesn't have to know how css_sets are being walked. This patch doesn't make any behavior changes. The iteration logic stays unchanged after the patch. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0edc186cd545..d48163b26196 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2857,27 +2857,30 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, */ static void css_advance_task_iter(struct css_task_iter *it) { - struct list_head *l = it->cset_link; + struct list_head *l = it->cset_pos; struct cgrp_cset_link *link; struct css_set *cset; /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_css->cgroup->cset_links) { - it->cset_link = NULL; + if (l == it->cset_head) { + it->cset_pos = NULL; return; } link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); - it->cset_link = l; + it->cset_pos = l; if (!list_empty(&cset->tasks)) - it->task = cset->tasks.next; + it->task_pos = cset->tasks.next; else - it->task = cset->mg_tasks.next; + it->task_pos = cset->mg_tasks.next; + + it->tasks_head = &cset->tasks; + it->mg_tasks_head = &cset->mg_tasks; } /** @@ -2903,8 +2906,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->origin_css = css; - it->cset_link = &css->cgroup->cset_links; + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); } @@ -2920,12 +2923,10 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; - struct 
list_head *l = it->task; - struct cgrp_cset_link *link = list_entry(it->cset_link, - struct cgrp_cset_link, cset_link); + struct list_head *l = it->task_pos; /* If the iterator cg is NULL, we have no tasks */ - if (!it->cset_link) + if (!it->cset_pos) return NULL; res = list_entry(l, struct task_struct, cg_list); @@ -2936,13 +2937,13 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) */ l = l->next; - if (l == &link->cset->tasks) - l = link->cset->mg_tasks.next; + if (l == it->tasks_head) + l = it->mg_tasks_head->next; - if (l == &link->cset->mg_tasks) + if (l == it->mg_tasks_head) css_advance_task_iter(it); else - it->task = l; + it->task_pos = l; return res; } -- cgit v1.2.3 From 3ebb2b6ef38875b866ec0118bfae7bc52afd0166 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: teach css_task_iter about effective csses Currently, css_task_iter iterates tasks associated with a css by visiting each css_set associated with the owning cgroup and walking tasks of each of them. This works fine for !unified hierarchies as each cgroup has its own css for each associated subsystem on the hierarchy; however, on the planned unified hierarchy, a cgroup may not have csses associated and its tasks would be considered associated with the matching css of the nearest ancestor which has the subsystem enabled. This means that on the default unified hierarchy, just walking all tasks associated with a cgroup isn't enough to walk all tasks which are associated with the specified css. If any of its children doesn't have the matching css enabled, task iteration should also include all tasks from the subtree. We already added cgroup->e_csets[] to list all css_sets effectively associated with a given css and walk css_sets on that list instead to achieve such iteration. This patch updates css_task_iter iteration such that it walks css_sets on cgroup->e_csets[] instead of cgroup->cset_links if iteration is requested on an non-dummy css. 
Thanks to the previous iteration update, this change can be achieved with the addition of css_task_iter->ss and minimal updates to css_advance_task_iter() and css_task_iter_start(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d48163b26196..ad28866ed44c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2868,8 +2868,14 @@ static void css_advance_task_iter(struct css_task_iter *it) it->cset_pos = NULL; return; } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; + + if (it->ss) { + cset = container_of(l, struct css_set, + e_cset_node[it->ss->id]); + } else { + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); it->cset_pos = l; @@ -2906,7 +2912,13 @@ void css_task_iter_start(struct cgroup_subsys_state *css, down_read(&css_set_rwsem); - it->cset_pos = &css->cgroup->cset_links; + it->ss = css->ss; + + if (it->ss) + it->cset_pos = &css->cgroup->e_csets[css->ss->id]; + else + it->cset_pos = &css->cgroup->cset_links; + it->cset_head = it->cset_pos; css_advance_task_iter(it); -- cgit v1.2.3 From e32978031016f56be977a9a856ba4d9f447db51f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:15 -0400 Subject: cgroup: cgroup->subsys[] should be cleared after the css is offlined After a css finishes offlining, offline_css() mistakenly performs RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css) which just sets the cgroup->subsys[] pointer to the current value. The intention was to clear it after offline is complete, not reassign the same value. Update it to assign NULL instead of the current value. This makes cgroup_css() to return NULL once offline is complete. 
All the existing users of the function either can handle NULL return already or guarantee that the css doesn't get offlined. While this is a bugfix, as css lifetime is currently tied to the cgroup it belongs to, this bug doesn't cause any actual problems. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ad28866ed44c..83a8fff43d68 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3710,7 +3710,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); } /** -- cgit v1.2.3 From bd53d617b34c781dac8e22dbc75e8f182d918ecf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: allow cgroup creation and suppress automatic css creation in the unified hierarchy Now that effective css handling has been added and iterators updated accordingly, it's safe to allow cgroup creation in the default hierarchy. Unblock cgroup creation in the default hierarchy. As the default hierarchy will implement explicit enabling and disabling of controllers on each cgroup, suppress automatic css enabling on cgroup creation. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 83a8fff43d68..2a4f88db3205 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1115,8 +1115,10 @@ static int rebind_subsystems(struct cgroup_root *dst_root, src_root->subsys_mask &= ~(1 << ssid); src_root->cgrp.child_subsys_mask &= ~(1 << ssid); + /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; - dst_root->cgrp.child_subsys_mask |= 1 << ssid; + if (dst_root != &cgrp_dfl_root) + dst_root->cgrp.child_subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -3786,13 +3788,6 @@ static long cgroup_create(struct cgroup *parent, const char *name, struct cgroup_subsys *ss; struct kernfs_node *kn; - /* - * XXX: The default hierarchy isn't fully implemented yet. Block - * !root cgroup creation on it for now. - */ - if (root == &cgrp_dfl_root) - return -EINVAL; - /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) @@ -3878,7 +3873,12 @@ static long cgroup_create(struct cgroup *parent, const char *name, } } - cgrp->child_subsys_mask = parent->child_subsys_mask; + /* + * On the default hierarchy, a child doesn't automatically inherit + * child_subsys_mask from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->child_subsys_mask = parent->child_subsys_mask; kernfs_activate(kn); -- cgit v1.2.3 From 6803c006282768ec850760766a6e4eb1a6ff87df Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: add css_set->dfl_cgrp To implement the unified hierarchy behavior, we'll need to be able to determine the associated cgroup on the default hierarchy from css_set. Let's add css_set->dfl_cgrp so that it can be accessed conveniently and efficiently. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a4f88db3205..c66bfc8ee8a7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -651,6 +651,10 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, struct cgrp_cset_link *link; BUG_ON(list_empty(tmp_links)); + + if (cgroup_on_dfl(cgrp)) + cset->dfl_cgrp = cgrp; + link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; -- cgit v1.2.3 From 7fd8c565d8a501486d63d7ee07fd6582e97db437 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: update subsystem rebind restrictions Because the default root couldn't have any non-root csses attached to it, rebinding away from it was always allowed; however, the default hierarchy will soon host the unified hierarchy and have non-root csses so the rebind restrictions need to be updated accordingly. Instead of special casing rebinding from the default hierarchy and then checking whether the source hierarchy has children cgroups, which implies non-root csses for !dfl hierarchies, simply check whether the source hierarchy has non-root csses for the subsystem using css_next_child(). 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c66bfc8ee8a7..15eb2273d80b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1051,16 +1051,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, if (!(ss_mask & (1 << ssid))) continue; - /* if @ss is on the dummy_root, we can always move it */ - if (ss->root == &cgrp_dfl_root) - continue; - - /* if @ss has non-root cgroups attached to it, can't move */ - if (!list_empty(&ss->root->cgrp.children)) + /* if @ss has non-root csses attached to it, can't move */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) return -EBUSY; /* can't move between two non-dummy roots either */ - if (dst_root != &cgrp_dfl_root) + if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; } -- cgit v1.2.3 From f817de98513d060023be4fa1d061b29a6515273e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: prepare migration path for unified hierarchy Unified hierarchy implementation would require re-migrating tasks onto the same cgroup on the default hierarchy to reflect updated effective csses. Update cgroup_migrate_prepare_dst() so that it accepts NULL as the destination cgrp. When NULL is specified, the destination is considered to be the cgroup on the default hierarchy associated with each css_set. After this change, the identity check in cgroup_migrate_add_src() isn't sufficient for noop detection as the associated csses may change without any cgroup association changing. The only way to tell whether a migration is noop or not is testing whether the source and destination csets are identical. The noop check in cgroup_migrate_add_src() is removed and a cset identity test is added to cgroup_migrate_prepare_dst(). 
If it's detected that source and destination csets are identical, the cset is removed from @preloaded_csets and all the migration nodes are cleared which makes cgroup_migrate() ignore the cset. Also, make the function append the destination css_sets to @preloaded_csets so that destination css_sets always come after source css_sets. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15eb2273d80b..8c2835a9e192 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1902,10 +1902,6 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - /* nothing to do if this cset already belongs to the cgroup */ - if (src_cgrp == dst_cgrp) - return; - if (!list_empty(&src_cset->mg_preload_node)) return; @@ -1920,13 +1916,14 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup + * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * * Tasks are about to be moved to @dst_cgrp and all the source css_sets * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and put them on - * @preloaded_csets. + * pins all destination css_sets, links each to its source, and append them + * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each + * source css_set is assumed to be its cgroup on the default hierarchy. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set.
After migration is performed @@ -1937,19 +1934,34 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, struct list_head *preloaded_csets) { LIST_HEAD(csets); - struct css_set *src_cset; + struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); /* look up the dst cset for each src cset and link it to src */ - list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { + list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, dst_cgrp); + dst_cset = find_css_set(src_cset, + dst_cgrp ?: src_cset->dfl_cgrp); if (!dst_cset) goto err; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + + /* + * If src cset equals dst, it's noop. Drop the src. + * cgroup_migrate() will skip the cset too. Note that we + * can't handle src == dst as some nodes are used by both. + */ + if (src_cset == dst_cset) { + src_cset->mg_src_cgrp = NULL; + list_del_init(&src_cset->mg_preload_node); + put_css_set(src_cset, false); + put_css_set(dst_cset, false); + continue; + } + src_cset->mg_dst_cset = dst_cset; if (list_empty(&dst_cset->mg_preload_node)) @@ -1958,7 +1970,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, put_css_set(dst_cset, false); } - list_splice(&csets, preloaded_csets); + list_splice_tail(&csets, preloaded_csets); return 0; err: cgroup_migrate_finish(&csets); -- cgit v1.2.3 From f8f22e53a262ebee37fc98004f16b066cf5bc125 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 Apr 2014 11:13:16 -0400 Subject: cgroup: implement dynamic subtree controller enable/disable on the default hierarchy cgroup is switching away from multiple hierarchies and will use one unified default hierarchy where controllers can be dynamically enabled and disabled per subtree. 
The default hierarchy will serve as the unified hierarchy to which all controllers are attached and a css on the default hierarchy would need to also serve the tasks of descendant cgroups which don't have the controller enabled - ie. the tree may be collapsed from leaf towards root when viewed from specific controllers. This has been implemented through effective css in the previous patches. This patch finally implements dynamic subtree controller enable/disable on the default hierarchy via a new knob - "cgroup.subtree_control" which controls which controllers are enabled on the child cgroups. Let's assume a hierarchy like the following. root - A - B - C \ D root's "cgroup.subtree_control" determines which controllers are enabled on A. A's on B. B's on C and D. This coincides with the fact that controllers on the immediate sub-level are used to distribute the resources of the parent. In fact, it's natural to assume that resource control knobs of a child belong to its parent. Enabling a controller in "cgroup.subtree_control" declares that distribution of the respective resources of the cgroup will be controlled. Note that this means that controller enable states are shared among siblings. The default hierarchy has an extra restriction - only cgroups which don't contain any task may have controllers enabled in "cgroup.subtree_control". Combined with the other properties of the default hierarchy, this guarantees that, from the view point of controllers, tasks are only on the leaf cgroups. In other words, only leaf csses may contain tasks. This rules out situations where child cgroups compete against internal tasks of the parent, which is a competition between two different types of entities without any clear way to determine resource distribution between the two. Different controllers handle it differently and all the implemented behaviors are ambiguous, ad-hoc, cumbersome and/or just wrong. 
Having these structural constraints imposed from cgroup core removes the burden from controller implementations and enables showing one consistent behavior across all controllers. When a controller is enabled or disabled, css associations for the controller in the subtrees of each child should be updated. After enabling, the whole subtree of a child should point to the new css of the child. After disabling, the whole subtree of a child should point to the cgroup's css. This is implemented by first updating cgroup states such that cgroup_e_css() result points to the appropriate css and then invoking cgroup_update_dfl_csses() which migrates all tasks in the affected subtrees to the self cgroup on the default hierarchy. * When read, "cgroup.subtree_control" lists all the currently enabled controllers on the children of the cgroup. * White-space separated list of controller names prefixed with either '+' or '-' can be written to "cgroup.subtree_control". The ones prefixed with '+' are enabled and the ones prefixed with '-' are disabled. * A controller can be enabled iff the parent's "cgroup.subtree_control" enables it and disabled iff no child's "cgroup.subtree_control" has it enabled. * If a cgroup has tasks, no controller can be enabled via "cgroup.subtree_control". Likewise, if "cgroup.subtree_control" has some controllers enabled, tasks can't be migrated into the cgroup. * All controllers which aren't bound on other hierarchies are automatically associated with the root cgroup of the default hierarchy. All the controllers which are bound to the default hierarchy are listed in the read-only file "cgroup.controllers" in the root directory. * "cgroup.controllers" in all non-root cgroups is a read-only file whose content is equal to that of "cgroup.subtree_control" of the parent. This indicates which controllers can be used in the cgroup's "cgroup.subtree_control".
This is still experimental and there are some holes, one of which is that ->can_attach() failure during cgroup_update_dfl_csses() may leave the cgroups in an undefined state. The issues will be addressed by future patches. v2: Non-root cgroups now also have "cgroup.controllers". Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 367 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 365 insertions(+), 2 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c2835a9e192..809dd903ceb8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -182,6 +182,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); @@ -338,6 +340,14 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->children, sibling) \ + if (({ lockdep_assert_held(&cgroup_tree_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 
* @cgrp: the cgroup to be checked for liveness @@ -1450,6 +1460,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); + + init_waitqueue_head(&cgrp->offline_waitq); } static void init_cgroup_root(struct cgroup_root *root, @@ -1938,6 +1950,14 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, lockdep_assert_held(&cgroup_mutex); + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && dst_cgrp->parent && + dst_cgrp->child_subsys_mask) + return -EBUSY; + /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; @@ -2303,6 +2323,326 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } +static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask) +{ + struct cgroup_subsys *ss; + bool printed = false; + int ssid; + + for_each_subsys(ss, ssid) { + if (ss_mask & (1 << ssid)) { + if (printed) + seq_putc(seq, ' '); + seq_printf(seq, "%s", ss->name); + printed = true; + } + } + if (printed) + seq_putc(seq, '\n'); +} + +/* show controllers which are currently attached to the default hierarchy */ +static int cgroup_root_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->root->subsys_mask); + return 0; +} + +/* show controllers which are enabled from the parent */ +static int cgroup_controllers_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->parent->child_subsys_mask); + return 0; +} + +/* show controllers which are enabled for a given cgroup's children */ +static int cgroup_subtree_control_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = 
seq_css(seq)->cgroup; + + cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); + return 0; +} + +/** + * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy + * @cgrp: root of the subtree to update csses for + * + * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) + * css associations need to be updated accordingly. This function looks up + * all css_sets which are attached to the subtree, creates the matching + * updated css_sets and migrates the tasks to the new ones. + */ +static int cgroup_update_dfl_csses(struct cgroup *cgrp) +{ + LIST_HEAD(preloaded_csets); + struct cgroup_subsys_state *css; + struct css_set *src_cset; + int ret; + + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); + + /* look up all csses currently attached to @cgrp's subtree */ + down_read(&css_set_rwsem); + css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + struct cgrp_cset_link *link; + + /* self is not affected by child_subsys_mask change */ + if (css->cgroup == cgrp) + continue; + + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, cgrp, + &preloaded_csets); + } + up_read(&css_set_rwsem); + + /* NULL dst indicates self on default hierarchy */ + ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + if (ret) + goto out_finish; + + list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { + struct task_struct *last_task = NULL, *task; + + /* src_csets precede dst_csets, break on the first dst_cset */ + if (!src_cset->mg_src_cgrp) + break; + + /* + * All tasks in src_cset need to be migrated to the + * matching dst_cset. Empty it process by process. We + * walk tasks but migrate processes. The leader might even + * belong to a different cset but such src_cset would also + * be among the target src_csets because the default + * hierarchy enforces per-process membership. 
+ */ + while (true) { + down_read(&css_set_rwsem); + task = list_first_entry_or_null(&src_cset->tasks, + struct task_struct, cg_list); + if (task) { + task = task->group_leader; + WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp); + get_task_struct(task); + } + up_read(&css_set_rwsem); + + if (!task) + break; + + /* guard against possible infinite loop */ + if (WARN(last_task == task, + "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n")) + goto out_finish; + last_task = task; + + threadgroup_lock(task); + /* raced against de_thread() from another thread? */ + if (!thread_group_leader(task)) { + threadgroup_unlock(task); + put_task_struct(task); + continue; + } + + ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); + + threadgroup_unlock(task); + put_task_struct(task); + + if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) + goto out_finish; + } + } + +out_finish: + cgroup_migrate_finish(&preloaded_csets); + return ret; +} + +/* change the enabled child controllers for a cgroup in the default hierarchy */ +static int cgroup_subtree_control_write(struct cgroup_subsys_state *dummy_css, + struct cftype *cft, char *buffer) +{ + unsigned long enable_req = 0, disable_req = 0, enable, disable; + struct cgroup *cgrp = dummy_css->cgroup, *child; + struct cgroup_subsys *ss; + char *tok, *p; + int ssid, ret; + + /* + * Parse input - white space separated list of subsystem names + * prefixed with either + or -. 
+ */ + p = buffer; + while ((tok = strsep(&p, " \t\n"))) { + for_each_subsys(ss, ssid) { + if (ss->disabled || strcmp(tok + 1, ss->name)) + continue; + + if (*tok == '+') { + enable_req |= 1 << ssid; + disable_req &= ~(1 << ssid); + } else if (*tok == '-') { + disable_req |= 1 << ssid; + enable_req &= ~(1 << ssid); + } else { + return -EINVAL; + } + break; + } + if (ssid == CGROUP_SUBSYS_COUNT) + return -EINVAL; + } + + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. cgroup_lock_live_group() already provides enough + * protection. Ensure @cgrp stays accessible and break the + * active_ref protection. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(cgrp->control_kn); +retry: + enable = enable_req; + disable = disable_req; + + mutex_lock(&cgroup_tree_mutex); + + for_each_subsys(ss, ssid) { + if (enable & (1 << ssid)) { + if (cgrp->child_subsys_mask & (1 << ssid)) { + enable &= ~(1 << ssid); + continue; + } + + /* + * Because css offlining is asynchronous, userland + * might try to re-enable the same controller while + * the previous instance is still around. In such + * cases, wait till it's gone using offline_waitq. + */ + cgroup_for_each_live_child(child, cgrp) { + wait_queue_t wait; + + if (!cgroup_css(child, ss)) + continue; + + prepare_to_wait(&child->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&cgroup_tree_mutex); + schedule(); + finish_wait(&child->offline_waitq, &wait); + goto retry; + } + + /* unavailable or not enabled on the parent? */ + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || + (cgrp->parent && + !(cgrp->parent->child_subsys_mask & (1 << ssid)))) { + ret = -ENOENT; + goto out_unlock_tree; + } + } else if (disable & (1 << ssid)) { + if (!(cgrp->child_subsys_mask & (1 << ssid))) { + disable &= ~(1 << ssid); + continue; + } + + /* a child has it enabled? 
*/ + cgroup_for_each_live_child(child, cgrp) { + if (child->child_subsys_mask & (1 << ssid)) { + ret = -EBUSY; + goto out_unlock_tree; + } + } + } + } + + if (!enable && !disable) { + ret = 0; + goto out_unlock_tree; + } + + if (!cgroup_lock_live_group(cgrp)) { + ret = -ENODEV; + goto out_unlock_tree; + } + + /* + * Except for the root, child_subsys_mask must be zero for a cgroup + * with tasks so that child cgroups don't compete against tasks. + */ + if (enable && cgrp->parent && !list_empty(&cgrp->cset_links)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Create csses for enables and update child_subsys_mask. This + * changes cgroup_e_css() results which in turn makes the + * subsequent cgroup_update_dfl_csses() associate all tasks in the + * subtree to the updated csses. + */ + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + ret = create_css(child, ss); + if (ret) + goto err_undo_css; + } + } + + cgrp->child_subsys_mask |= enable; + cgrp->child_subsys_mask &= ~disable; + + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + goto err_undo_css; + + /* all tasks are now migrated away from the old csses, kill them */ + for_each_subsys(ss, ssid) { + if (!(disable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) + kill_css(cgroup_css(child, ss)); + } + + kernfs_activate(cgrp->kn); + ret = 0; +out_unlock: + mutex_unlock(&cgroup_mutex); +out_unlock_tree: + mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(cgrp->control_kn); + cgroup_put(cgrp); + return ret; + +err_undo_css: + cgrp->child_subsys_mask &= ~enable; + cgrp->child_subsys_mask |= disable; + + for_each_subsys(ss, ssid) { + if (!(enable & (1 << ssid))) + continue; + + cgroup_for_each_live_child(child, cgrp) { + struct cgroup_subsys_state *css = cgroup_css(child, ss); + if (css) + kill_css(css); + } + } + goto out_unlock; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 
size_t nbytes, loff_t off) { @@ -2462,9 +2802,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) return PTR_ERR(kn); ret = cgroup_kn_set_ugid(kn); - if (ret) + if (ret) { kernfs_remove(kn); - return ret; + return ret; + } + + if (cft->seq_show == cgroup_subtree_control_show) + cgrp->control_kn = kn; + return 0; } /** @@ -3557,6 +3902,22 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, + .seq_show = cgroup_root_controllers_show, + }, + { + .name = "cgroup.controllers", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_controllers_show, + }, + { + .name = "cgroup.subtree_control", + .flags = CFTYPE_ONLY_ON_DFL, + .seq_show = cgroup_subtree_control_show, + .write_string = cgroup_subtree_control_write, + }, /* * Historical crazy stuff. These don't have "cgroup." prefix and @@ -3725,6 +4086,8 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); + + wake_up_all(&css->cgroup->offline_waitq); } /** -- cgit v1.2.3 From 842b597ee0a7e1aa5a3148164ffdba00ec17f614 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 25 Apr 2014 18:28:02 -0400 Subject: cgroup: implement cgroup.populated for the default hierarchy cgroup users often need a way to determine when a cgroup's subhierarchy becomes empty so that it can be cleaned up. cgroup currently provides release_agent for it; unfortunately, this mechanism is riddled with issues. * It delivers events by forking and execing a userland binary specified as the release_agent. This is a long deprecated method of notification delivery. It's extremely heavy, slow and cumbersome to integrate with larger infrastructure. * There is single monitoring point at the root. There's no way to delegate management of a subtree. 
* The event isn't recursive. It triggers when a cgroup doesn't have any tasks or child cgroups. Events for internal nodes trigger only after all children are removed. This again makes it impossible to delegate management of a subtree. * Events are filtered from the kernel side. "notify_on_release" file is used to subscribe to or suppress release event. This is unnecessarily complicated and probably done this way because event delivery itself was expensive. This patch implements interface file "cgroup.populated" which can be used to monitor whether the cgroup's subhierarchy has tasks in it or not. Its value is 0 if there is no task in the cgroup and its descendants; otherwise, 1, and kernfs_notify() notification is triggered when the value changes, which can be monitored through poll and [di]notify. This is a lot lighter and simpler and trivially allows delegating management of subhierarchy - subhierarchy monitoring can block further propagation simply by putting itself or another process in the root of the subhierarchy and monitoring events that it's interested in from there without interfering with monitoring higher in the tree. v2: Patch description updated as per Serge. v3: "cgroup.subtree_populated" renamed to "cgroup.populated". The subtree_ prefix was a bit confusing because "cgroup.subtree_control" uses it to denote the tree rooted at the cgroup sans the cgroup itself while the populated state includes the cgroup itself.
Signed-off-by: Tejun Heo Acked-by: Serge Hallyn Acked-by: Li Zefan Cc: Lennart Poettering --- kernel/cgroup.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 4 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 809dd903ceb8..0f986f7afee4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -411,6 +411,43 @@ static struct css_set init_css_set = { static int css_set_count = 1; /* 1 for init_css_set */ +/** + * cgroup_update_populated - updated populated count of a cgroup + * @cgrp: the target cgroup + * @populated: inc or dec populated count + * + * @cgrp is either getting the first task (css_set) or losing the last. + * Update @cgrp->populated_cnt accordingly. The count is propagated + * towards root so that a given cgroup's populated_cnt is zero iff the + * cgroup and all its descendants are empty. + * + * @cgrp's interface file "cgroup.populated" is zero if + * @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt + * changes from or to zero, userland is notified that the content of the + * interface file has changed. This can be used to detect when @cgrp and + * its descendants become populated or empty. + */ +static void cgroup_update_populated(struct cgroup *cgrp, bool populated) +{ + lockdep_assert_held(&css_set_rwsem); + + do { + bool trigger; + + if (populated) + trigger = !cgrp->populated_cnt++; + else + trigger = !--cgrp->populated_cnt; + + if (!trigger) + break; + + if (cgrp->populated_kn) + kernfs_notify(cgrp->populated_kn); + cgrp = cgrp->parent; + } while (cgrp); +} + /* * hash table for cgroup groups. This improves the performance to find * an existing css_set. 
This hash doesn't (currently) take into @@ -456,10 +493,13 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit) list_del(&link->cgrp_link); /* @cgrp can't go away while we're holding css_set_rwsem */ - if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); + if (list_empty(&cgrp->cset_links)) { + cgroup_update_populated(cgrp, false); + if (notify_on_release(cgrp)) { + if (taskexit) + set_bit(CGRP_RELEASABLE, &cgrp->flags); + check_for_release(cgrp); + } } kfree(link); @@ -668,7 +708,11 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); link->cset = cset; link->cgrp = cgrp; + + if (list_empty(&cgrp->cset_links)) + cgroup_update_populated(cgrp, true); list_move(&link->cset_link, &cgrp->cset_links); + /* * Always add links to the tail of the list so that the list * is sorted by order of hierarchy creation @@ -2643,6 +2687,12 @@ err_undo_css: goto out_unlock; } +static int cgroup_populated_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt); + return 0; +} + static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -2809,6 +2859,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) if (cft->seq_show == cgroup_subtree_control_show) cgrp->control_kn = kn; + else if (cft->seq_show == cgroup_populated_show) + cgrp->populated_kn = kn; return 0; } @@ -3918,6 +3970,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_subtree_control_show, .write_string = cgroup_subtree_control_write, }, + { + .name = "cgroup.populated", + .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_populated_show, + }, /* * Historical crazy stuff. These don't have "cgroup." 
prefix and -- cgit v1.2.3 From 2f0edc04e702fc07d29621f9e361b9120a7594d0 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: clean up obsolete comment for parse_cgroupfs_options() 1d5be6b287c8efc87 ("cgroup: move module ref handling into rebind_subsystems()") makes parse_cgroupfs_options() no longer take refcounts on subsystems. And unified hierarchy means parse_cgroupfs_options() no longer needs to be called with cgroup_mutex held to protect the cgroup_subsys[]. So this patch removes BUG_ON() and the comment. As the comment doesn't contain useful information afterwards, the whole comment is removed. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0f986f7afee4..fb848be0ea7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1221,12 +1221,6 @@ struct cgroup_sb_opts { bool none; }; -/* - * Convert a hierarchy specifier into a bitmask of subsystems and - * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] - * array. This function takes refcounts on subsystems to be used, unless it - * returns error, in which case no refcounts are taken. - */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; @@ -1235,8 +1229,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) struct cgroup_subsys *ss; int i; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - #ifdef CONFIG_CPUSETS mask = ~(1UL << cpuset_cgrp_id); #endif -- cgit v1.2.3 From f8719ccf7bc0858384c7e93d8c57fe69ae8c9eac Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: remove orphaned cgroup_pidlist_seq_operations 6612f05b88fa309c9 ("cgroup: unify pidlist and other file handling") has removed the only user of cgroup_pidlist_seq_operations: cgroup_pidlist_open(). This patch removes it.
Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fb848be0ea7b..3849d3d2dfe1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3880,17 +3880,6 @@ static int cgroup_pidlist_show(struct seq_file *s, void *v) return seq_printf(s, "%d\n", *(int *)v); } -/* - * seq_operations functions for iterating on pidlists through seq_file - - * independent of whether it's tasks or procs - */ -static const struct seq_operations cgroup_pidlist_seq_operations = { - .start = cgroup_pidlist_start, - .stop = cgroup_pidlist_stop, - .next = cgroup_pidlist_next, - .show = cgroup_pidlist_show, -}; - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { -- cgit v1.2.3 From a2a1f9eaf945c46b5b2bc0e439cba68888e3d540 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Fri, 25 Apr 2014 18:28:03 -0400 Subject: cgroup: replace pr_warning with preferred pr_warn As suggested by scripts/checkpatch.pl, substitude all pr_warning() with pr_warn(). No functional change. Signed-off-by: Jianyu Zhan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel/cgroup.c') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3849d3d2dfe1..cb453e9954c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1126,9 +1126,9 @@ static int rebind_subsystems(struct cgroup_root *dst_root, * Just warn about it and continue. */ if (cgrp_dfl_root_visible) { - pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", - ret,