summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroup-v1/00-INDEX2
-rw-r--r--Documentation/cgroup-v2.txt8
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--include/linux/cgroup-defs.h46
-rw-r--r--init/Kconfig4
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/cgroup.c1168
-rw-r--r--kernel/cpuset.c2
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/cpuacct.c2
10 files changed, 738 insertions, 504 deletions
diff --git a/Documentation/cgroup-v1/00-INDEX b/Documentation/cgroup-v1/00-INDEX
index 6ad425f7cf56..106885ad670d 100644
--- a/Documentation/cgroup-v1/00-INDEX
+++ b/Documentation/cgroup-v1/00-INDEX
@@ -24,5 +24,3 @@ net_prio.txt
- Network priority cgroups details and usages.
pids.txt
- Process number cgroups details and usages.
-unified-hierarchy.txt
- - Description the new/next cgroup interface.
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 8f1329a5f700..bdc6773277be 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -132,6 +132,12 @@ strongly discouraged for production use. It is recommended to decide
the hierarchies and controller associations before starting using the
controllers after system boot.
+During transition to v2, system management software might still
+automount the v1 cgroup filesystem and so hijack all controllers
+during boot, before manual intervention is possible. To make testing
+and experimenting easier, the kernel parameter cgroup_no_v1= allows
+disabling controllers in v1 and make them always available in v2.
+
2-2. Organizing Processes
@@ -915,7 +921,7 @@ PAGE_SIZE multiple when read back.
limit, anonymous meomry of the cgroup will not be swapped out.
-5-2-2. General Usage
+5-2-2. Usage Guidelines
"memory.high" is the main mechanism to control memory usage.
Over-committing on high limit (sum of high limits > available memory)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 0ee46a8f6401..eef242ee576b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -614,6 +614,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
cut the overhead, others just disable the usage. So
only cgroup_disable=memory is actually worthy}
+ cgroup_no_v1= [KNL] Disable one, multiple, all cgroup controllers in v1
+ Format: { controller[,controller...] | "all" }
+ Like cgroup_disable, but only applies to cgroup v1;
+ the blacklisted controllers remain available in cgroup2.
+
cgroup.memory= [KNL] Pass options to the cgroup memory controller.
Format: <string>
nosocket -- Disable socket memory accounting.
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 789471dba6fb..3e39ae5bc799 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -45,6 +45,7 @@ enum {
CSS_NO_REF = (1 << 0), /* no reference counting for this css */
CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */
+ CSS_VISIBLE = (1 << 3), /* css is visible to userland */
};
/* bits in struct cgroup flags field */
@@ -190,12 +191,13 @@ struct css_set {
/*
* If this cset is acting as the source of migration the following
- * two fields are set. mg_src_cgrp is the source cgroup of the
- * on-going migration and mg_dst_cset is the destination cset the
- * target tasks on this cset should be migrated to. Protected by
- * cgroup_mutex.
+ * two fields are set. mg_src_cgrp and mg_dst_cgrp are
+ * respectively the source and destination cgroups of the on-going
+ * migration. mg_dst_cset is the destination cset the target tasks
+ * on this cset should be migrated to. Protected by cgroup_mutex.
*/
struct cgroup *mg_src_cgrp;
+ struct cgroup *mg_dst_cgrp;
struct css_set *mg_dst_cset;
/*
@@ -210,6 +212,9 @@ struct css_set {
/* all css_task_iters currently walking this cset */
struct list_head task_iters;
+ /* dead and being drained, ignore for migration */
+ bool dead;
+
/* For RCU-protected deletion */
struct rcu_head rcu_head;
};
@@ -253,13 +258,14 @@ struct cgroup {
/*
* The bitmask of subsystems enabled on the child cgroups.
* ->subtree_control is the one configured through
- * "cgroup.subtree_control" while ->child_subsys_mask is the
- * effective one which may have more subsystems enabled.
- * Controller knobs are made available iff it's enabled in
- * ->subtree_control.
+ * "cgroup.subtree_control" while ->child_ss_mask is the effective
+ * one which may have more subsystems enabled. Controller knobs
+ * are made available iff it's enabled in ->subtree_control.
*/
- unsigned int subtree_control;
- unsigned int child_subsys_mask;
+ u16 subtree_control;
+ u16 subtree_ss_mask;
+ u16 old_subtree_control;
+ u16 old_subtree_ss_mask;
/* Private pointers for each registered subsystem */
struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
@@ -434,7 +440,6 @@ struct cgroup_subsys {
void (*css_released)(struct cgroup_subsys_state *css);
void (*css_free)(struct cgroup_subsys_state *css);
void (*css_reset)(struct cgroup_subsys_state *css);
- void (*css_e_css_changed)(struct cgroup_subsys_state *css);
int (*can_attach)(struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup_taskset *tset);
@@ -446,7 +451,20 @@ struct cgroup_subsys {
void (*free)(struct task_struct *task);
void (*bind)(struct cgroup_subsys_state *root_css);
- int early_init;
+ bool early_init:1;
+
+ /*
+ * If %true, the controller, on the default hierarchy, doesn't show
+ * up in "cgroup.controllers" or "cgroup.subtree_control", is
+ * implicitly enabled on all cgroups on the default hierarchy, and
+ * bypasses the "no internal process" constraint. This is for
+ * utility type controllers which is transparent to userland.
+ *
+ * An implicit controller can be stolen from the default hierarchy
+ * anytime and thus must be okay with offline csses from previous
+ * hierarchies coexisting with csses for the current one.
+ */
+ bool implicit_on_dfl:1;
/*
* If %false, this subsystem is properly hierarchical -
@@ -460,8 +478,8 @@ struct cgroup_subsys {
* cases. Eventually, all subsystems will be made properly
* hierarchical and this will go away.
*/
- bool broken_hierarchy;
- bool warned_broken_hierarchy;
+ bool broken_hierarchy:1;
+ bool warned_broken_hierarchy:1;
/* the following two fields are initialized automtically during boot */
int id;
diff --git a/init/Kconfig b/init/Kconfig
index 2d70c8c4b1d8..e0d26162432e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1047,10 +1047,10 @@ config CGROUP_PIDS
is fairly trivial to reach PID exhaustion before you reach even a
conservative kmemcg limit. As a result, it is possible to grind a
system to halt without being limited by other cgroup policies. The
- PIDs cgroup subsystem is designed to stop this from happening.
+ PIDs controller is designed to stop this from happening.
It should be noted that organisational operations (such as attaching
- to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
+ to a cgroup hierarchy will *not* be blocked by the PIDs controller),
since the PIDs limit only affects a process's ability to fork, not to
attach to a cgroup.
diff --git a/kernel/Makefile b/kernel/Makefile
index 53abf008ecb3..baa55e50a315 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -14,8 +14,7 @@ obj-y = fork.o exec_domain.o panic.o \
obj-$(CONFIG_MULTIUSER) += groups.o
ifdef CONFIG_FUNCTION_TRACER
-# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE)
+# Do not trace internal ftrace files
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d27904c193da..3fe02c152799 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -178,10 +178,16 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
-static bool cgrp_dfl_root_visible;
+static bool cgrp_dfl_visible;
+
+/* Controllers blocked by the commandline in v1 */
+static u16 cgroup_no_v1_mask;
/* some controllers are not supported in the default hierarchy */
-static unsigned long cgrp_dfl_root_inhibit_ss_mask;
+static u16 cgrp_dfl_inhibit_ss_mask;
+
+/* some controllers are implicitly enabled on the default hierarchy */
+static unsigned long cgrp_dfl_implicit_ss_mask;
/* The list of hierarchy roots */
@@ -205,23 +211,25 @@ static u64 css_serial_nr_next = 1;
* fork/exit handlers to call. This avoids us having to do extra work in the
* fork/exit path to check which subsystems have fork/exit callbacks.
*/
-static unsigned long have_fork_callback __read_mostly;
-static unsigned long have_exit_callback __read_mostly;
-static unsigned long have_free_callback __read_mostly;
+static u16 have_fork_callback __read_mostly;
+static u16 have_exit_callback __read_mostly;
+static u16 have_free_callback __read_mostly;
/* Ditto for the can_fork callback. */
-static unsigned long have_canfork_callback __read_mostly;
+static u16 have_canfork_callback __read_mostly;
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
-static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned long ss_mask);
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
+static void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
+static int cgroup_apply_control(struct cgroup *cgrp);
+static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
- bool visible);
+static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
+ struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
@@ -238,9 +246,17 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
*/
static bool cgroup_ssid_enabled(int ssid)
{
+ if (CGROUP_SUBSYS_COUNT == 0)
+ return false;
+
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
+static bool cgroup_ssid_no_v1(int ssid)
+{
+ return cgroup_no_v1_mask & (1 << ssid);
+}
+
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
@@ -339,6 +355,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp)
return NULL;
}
+/* subsystems visibly enabled on a cgroup */
+static u16 cgroup_control(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ u16 root_ss_mask = cgrp->root->subsys_mask;
+
+ if (parent)
+ return parent->subtree_control;
+
+ if (cgroup_on_dfl(cgrp))
+ root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
+ cgrp_dfl_implicit_ss_mask);
+ return root_ss_mask;
+}
+
+/* subsystems enabled on a cgroup */
+static u16 cgroup_ss_mask(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+
+ if (parent)
+ return parent->subtree_ss_mask;
+
+ return cgrp->root->subsys_mask;
+}
+
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
@@ -378,16 +420,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
if (!ss)
return &cgrp->self;
- if (!(cgrp->root->subsys_mask & (1 << ss->id)))
- return NULL;
-
/*
* This function is used while updating css associations and thus
- * can't test the csses directly. Use ->child_subsys_mask.
+ * can't test the csses directly. Test ss_mask.
*/
- while (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
+ while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
cgrp = cgroup_parent(cgrp);
+ if (!cgrp)
+ return NULL;
+ }
return cgroup_css(cgrp, ss);
}
@@ -506,22 +547,28 @@ static int notify_on_release(const struct cgroup *cgrp)
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
/**
- * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
- * @ss_maskp: a pointer to the bitmask
+ * @ss_mask: the bitmask
*
* The block will only run for cases where the ssid-th bit (1 << ssid) of
- * mask is set to 1.
+ * @ss_mask is set.
*/
-#define for_each_subsys_which(ss, ssid, ss_maskp) \
- if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \
+#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
+ unsigned long __ss_mask = (ss_mask); \
+ if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
(ssid) = 0; \
- else \
- for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \
- if (((ss) = cgroup_subsys[ssid]) && false) \
- break; \
- else
+ break; \
+ } \
+ for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
+ (ss) = cgroup_subsys[ssid]; \
+ {
+
+#define while_each_subsys_mask() \
+ } \
+ } \
+} while (false)
/* iterate across the hierarchies */
#define for_each_root(root) \
@@ -535,6 +582,24 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
+/* walk live descendants in preorder */
+#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
+ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
+/* walk live descendants in postorder */
+#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
+ css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
+ if (({ lockdep_assert_held(&cgroup_mutex); \
+ (dsct) = (d_css)->cgroup; \
+ cgroup_is_dead(dsct); })) \
+ ; \
+ else
+
static void cgroup_release_agent(struct work_struct *work);
static void check_for_release(struct cgroup *cgrp);
@@ -665,6 +730,9 @@ static void css_set_move_task(struct task_struct *task,
{
lockdep_assert_held(&css_set_lock);
+ if (to_cset && !css_set_populated(to_cset))
+ css_set_update_populated(to_cset, true);
+
if (from_cset) {
struct css_task_iter *it, *pos;
@@ -698,8 +766,6 @@ static void css_set_move_task(struct task_struct *task,
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
- if (!css_set_populated(to_cset))
- css_set_update_populated(to_cset, true);
rcu_assign_pointer(task->cgroups, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
@@ -1102,13 +1168,13 @@ static void cgroup_destroy_root(struct cgroup_root *root)
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
BUG_ON(atomic_read(&root->nr_cgrps));
BUG_ON(!list_empty(&cgrp->self.children));
/* Rebind all subsystems back to the default hierarchy */
- rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
/*
* Release all the links from cset_links to this hierarchy's
@@ -1248,46 +1314,40 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
}
/**
- * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
- * @cgrp: the target cgroup
+ * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
* @subtree_control: the new subtree_control mask to consider
+ * @this_ss_mask: available subsystems
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
* This function calculates which subsystems need to be enabled if
- * @subtree_control is to be applied to @cgrp. The returned mask is always
- * a superset of @subtree_control and follows the usual hierarchy rules.
+ * @subtree_control is to be applied while restricted to @this_ss_mask.
*/
-static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
- unsigned long subtree_control)
+static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
{
- struct cgroup *parent = cgroup_parent(cgrp);
- unsigned long cur_ss_mask = subtree_control;
+ u16 cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
- if (!cgroup_on_dfl(cgrp))
- return cur_ss_mask;
+ cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
while (true) {
- unsigned long new_ss_mask = cur_ss_mask;
+ u16 new_ss_mask = cur_ss_mask;
- for_each_subsys_which(ss, ssid, &cur_ss_mask)
+ do_each_subsys_mask(ss, ssid, cur_ss_mask) {
new_ss_mask |= ss->depends_on;
+ } while_each_subsys_mask();
/*
* Mask out subsystems which aren't available. This can
* happen only if some depended-upon subsystems were bound
* to non-default hierarchies.
*/
- if (parent)
- new_ss_mask &= parent->child_subsys_mask;
- else
- new_ss_mask &= cgrp->root->subsys_mask;
+ new_ss_mask &= this_ss_mask;
if (new_ss_mask == cur_ss_mask)
break;
@@ -1298,19 +1358,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
}
/**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
- * @cgrp: the target cgroup
- *
- * Update @cgrp->child_subsys_mask according to the current
- * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
- */
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
-{
- cgrp->child_subsys_mask =
- cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
-}
-
-/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
@@ -1338,19 +1385,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn)
/**
* cgroup_kn_lock_live - locking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
+ * @drain_offline: perform offline draining on the cgroup
*
* This helper is to be used by a cgroup kernfs method currently servicing
* @kn. It breaks the active protection, performs cgroup locking and
* verifies that the associated cgroup is alive. Returns the cgroup if
* alive; otherwise, %NULL. A successful return should be undone by a
- * matching cgroup_kn_unlock() invocation.
+ * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
+ * cgroup is drained of offlining csses before return.
*
* Any cgroup kernfs method implementation which requires locking the
* associated cgroup should use this helper. It avoids nesting cgroup
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
-static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
+static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn,
+ bool drain_offline)
{
struct cgroup *cgrp;
@@ -1369,7 +1419,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
return NULL;
kernfs_break_active_protection(kn);
- mutex_lock(&cgroup_mutex);
+ if (drain_offline)
+ cgroup_lock_and_drain_offline(cgrp);
+ else
+ mutex_lock(&cgroup_mutex);
if (!cgroup_is_dead(cgrp))
return cgrp;
@@ -1399,14 +1452,17 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
/**
* css_clear_dir - remove subsys files in a cgroup directory
* @css: taget css
- * @cgrp_override: specify if target cgroup is different from css->cgroup
*/
-static void css_clear_dir(struct cgroup_subsys_state *css,
- struct cgroup *cgrp_override)
+static void css_clear_dir(struct cgroup_subsys_state *css)
{
- struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cgroup *cgrp = css->cgroup;
struct cftype *cfts;
+ if (!(css->flags & CSS_VISIBLE))
+ return;
+
+ css->flags &= ~CSS_VISIBLE;
+
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
}
@@ -1414,17 +1470,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css,
/**
* css_populate_dir - create subsys files in a cgroup directory
* @css: target css
- * @cgrp_overried: specify if target cgroup is different from css->cgroup
*
* On failure, no file is added.
*/
-static int css_populate_dir(struct cgroup_subsys_state *css,
- struct cgroup *cgrp_override)
+static int css_populate_dir(struct cgroup_subsys_state *css)
{
- struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cgroup *cgrp = css->cgroup;
struct cftype *cfts, *failed_cfts;
int ret;
+ if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
+ return 0;
+
if (!css->ss) {
if (cgroup_on_dfl(cgrp))
cfts = cgroup_dfl_base_files;
@@ -1441,6 +1498,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css,
goto err;
}
}
+
+ css->flags |= CSS_VISIBLE;
+
return 0;
err:
list_for_each_entry(cfts, &css->ss->cfts, node) {
@@ -1451,67 +1511,30 @@ err:
return ret;
}
-static int rebind_subsystems(struct cgroup_root *dst_root,
- unsigned long ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
- unsigned long tmp_ss_mask;
int ssid, i, ret;
lockdep_assert_held(&cgroup_mutex);
- for_each_subsys_which(ss, ssid, &ss_mask) {
- /* if @ss has non-root csses attached to it, can't move */
- if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
+ do_each_subsys_mask(ss, ssid, ss_mask) {
+ /*
+ * If @ss has non-root csses attached to it, can't move.
+ * If @ss is an implicit controller, it is exempt from this
+ * rule and can be stolen.
+ */
+ if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
+ !ss->implicit_on_dfl)
return -EBUSY;
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
- }
-
- /* skip creating root files on dfl_root for inhibited subsystems */
- tmp_ss_mask = ss_mask;
- if (dst_root == &cgrp_dfl_root)
- tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
-
- for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
- struct cgroup *scgrp = &ss->root->cgrp;
- int tssid;
-
- ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
- if (!ret)
- continue;
-
- /*
- * Rebinding back to the default root is not allowed to
- * fail. Using both default and non-default roots should
- * be rare. Moving subsystems back and forth even more so.
- * Just warn about it and continue.
- */
- if (dst_root == &cgrp_dfl_root) {
- if (cgrp_dfl_root_visible) {
- pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
- ret, ss_mask);
- pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
- }
- continue;
- }
-
- for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
- if (tssid == ssid)
- break;
- css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
- }
- return ret;
- }
+ } while_each_subsys_mask();
- /*
- * Nothing can fail from this point on. Remove files for the
- * removed subsystems and rebind each subsystem.
- */
- for_each_subsys_which(ss, ssid, &ss_mask) {
+ do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
@@ -1519,8 +1542,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
WARN_ON(!css || cgroup_css(dcgrp, ss));
- css_clear_dir(css, NULL);
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ /* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
@@ -1532,23 +1559,23 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
&dcgrp->e_csets[ss->id]);
spin_unlock_bh(&css_set_lock);
- src_root->subsys_mask &= ~(1 << ssid);
- scgrp->subtree_control &= ~(1 << ssid);
- cgroup_refresh_child_subsys_mask(scgrp);
-
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
} else {
dcgrp->subtree_control |= 1 << ssid;
- cgroup_refresh_child_subsys_mask(dcgrp);
static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
}
+ ret = cgroup_apply_control(dcgrp);
+ if (ret)
+ pr_warn("partial failure to rebind %s controller (err=%d)\n",
+ ss->name, ret);
+
if (ss->bind)
ss->bind(css);
- }
+ } while_each_subsys_mask();
kernfs_activate(dcgrp->kn);
return 0;
@@ -1584,7 +1611,7 @@ static int cgroup_show_options(struct seq_file *seq,
}
struct cgroup_sb_opts {
- unsigned long subsys_mask;
+ u16 subsys_mask;
unsigned int flags;
char *release_agent;
bool cpuset_clone_children;
@@ -1597,13 +1624,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
- unsigned long mask = -1UL;
+ u16 mask = U16_MAX;
struct cgroup_subsys *ss;
int nr_opts = 0;
int i;
#ifdef CONFIG_CPUSETS
- mask = ~(1U << cpuset_cgrp_id);
+ mask = ~((u16)1 << cpuset_cgrp_id);
#endif
memset(opts, 0, sizeof(*opts));
@@ -1678,6 +1705,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
if (!cgroup_ssid_enabled(i))
continue;
+ if (cgroup_ssid_no_v1(i))
+ continue;
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
@@ -1698,7 +1727,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
*/
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
- if (cgroup_ssid_enabled(i))
+ if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i))
opts->subsys_mask |= (1 << i);
/*
@@ -1728,14 +1757,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
int ret = 0;
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
- unsigned long added_mask, removed_mask;
+ u16 added_mask, removed_mask;
if (root == &cgrp_dfl_root) {
pr_err("remount is not allowed\n");
return -EINVAL;
}
- mutex_lock(&cgroup_mutex);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
@@ -1768,7 +1797,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
if (ret)
goto out_unlock;
- rebind_subsystems(&cgrp_dfl_root, removed_mask);
+ WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
if (opts.release_agent) {
spin_lock(&release_agent_path_lock);
@@ -1876,7 +1905,7 @@ static void init_cgroup_root(struct cgroup_root *root,
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
}
-static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
@@ -1899,10 +1928,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
/*
* We're accessing css_set_count without locking css_set_lock here,
* but that's OK - it can only be increased by someone holding
- * cgroup_lock, and that's us. The worst that can happen is that we
- * have some link structures left over
+ * cgroup_lock, and that's us. Later rebinding may disable
+ * controllers on the default hierarchy and thus create new csets,
+ * which can't be more than the existing ones. Allocate 2x.
*/
- ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
+ ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
if (ret)
goto cancel_ref;
@@ -1919,7 +1949,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
}
root_cgrp->kn = root->kf_root->kn;
- ret = css_populate_dir(&root_cgrp->self, NULL);
+ ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
@@ -1992,13 +2022,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
return ERR_PTR(-EINVAL);
}
- cgrp_dfl_root_visible = true;
+ cgrp_dfl_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
goto out_mount;
}
- mutex_lock(&cgroup_mutex);
+ cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
@@ -2338,38 +2368,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}
/**
- * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * cgroup_taskset_migrate - migrate a taskset
* @tset: taget taskset
- * @dst_cgrp: destination cgroup
+ * @root: cgroup root the migration is taking place on
*
- * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
- * ->can_attach callbacks fails and guarantees that either all or none of
- * the tasks in @tset are migrated. @tset is consumed regardless of
- * success.
+ * Migrate tasks in @tset as setup by migration preparation functions.
+ * This function fails iff one of the ->can_attach callbacks fails and
+ * guarantees that either all or none of the tasks in @tset are migrated.
+ * @tset is consumed regardless of success.
*/
static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
- struct cgroup *dst_cgrp)
+ struct cgroup_root *root)
{
- struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct cgroup_subsys *ss;
struct task_struct *task, *tmp_task;
struct css_set *cset, *tmp_cset;
- int i, ret;
+ int ssid, failed_ssid, ret;
/* methods shouldn't be called if no task is actually migrating */
if (list_empty(&tset->src_csets))
return 0;
/* check that we can legitimately attach to the cgroup */
- for_each_e_css(css, i, dst_cgrp) {
- if (css->ss->can_attach) {
- tset->ssid = i;
- ret = css->ss->can_attach(tset);
+ do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
if (ret) {
- failed_css = css;
+ failed_ssid = ssid;
goto out_cancel_attach;
}
}
- }
+ } while_each_subsys_mask();
/*
* Now that we're guaranteed success, proceed to move all tasks to
@@ -2396,25 +2426,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
*/
tset->csets = &tset->dst_csets;
- for_each_e_css(css, i, dst_cgrp) {
- if (css->ss->attach) {
- tset->ssid = i;
- css->ss->attach(tset);
+ do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
}
- }
+ } while_each_subsys_mask();
ret = 0;
goto out_release_tset;
out_cancel_attach:
- for_each_e_css(css, i, dst_cgrp) {
- if (css == failed_css)
+ do_each_subsys_mask(ss, ssid, root->subsys_mask) {
+ if (ssid == failed_ssid)
break;
- if (css->ss->cancel_attach) {
- tset->ssid = i;
- css->ss->cancel_attach(tset);
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
}
- }
+ } while_each_subsys_mask();
out_release_tset:
spin_lock_bh(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2427,6 +2457,20 @@ out_release_tset:
}
/**
+ * cgroup_may_migrate_to - verify whether a cgroup can be migration destination
+ * @dst_cgrp: destination cgroup to test
+ *
+ * On the default hierarchy, except for the root, subtree_control must be
+ * zero for migration destination cgroups with tasks so that child cgroups
+ * don't compete against tasks.
+ */
+static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp)
+{
+ return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) ||
+ !dst_cgrp->subtree_control;
+}
+
+/**
* cgroup_migrate_finish - cleanup after attach
* @preloaded_csets: list of preloaded css_sets
*
@@ -2442,6 +2486,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
spin_lock_bh(&css_set_lock);
list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
@@ -2474,58 +2519,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
+ /*
+ * If ->dead, @src_set is associated with one or more dead cgroups
+ * and doesn't contain any migratable tasks. Ignore it early so
+ * that the rest of migration path doesn't get confused by it.
+ */
+ if (src_cset->dead)
+ return;
+
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
if (!list_empty(&src_cset->mg_preload_node))
return;
WARN_ON(src_cset->mg_src_cgrp);
+ WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
WARN_ON(!list_empty(&src_cset->mg_node));
src_cset->mg_src_cgrp = src_cgrp;
+ src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
list_add(&src_cset->mg_preload_node, preloaded_csets);
}
/**
* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
- * @dst_cgrp: the destination cgroup (may be %NULL)
* @preloaded_csets: list of preloaded source css_sets
*
- * Tasks are about to be moved to @dst_cgrp and all the source css_sets
- * have been preloaded to @preloaded_csets. This function looks up and
- * pins all destination css_sets, links each to its source, and append them
- * to @preloaded_csets. If @dst_cgrp is %N