summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/cgroups.txt14
-rw-r--r--Documentation/cgroups/unified-hierarchy.txt35
-rw-r--r--block/blk-cgroup.c13
-rw-r--r--block/blk-throttle.c6
-rw-r--r--include/linux/cgroup.h165
-rw-r--r--kernel/cgroup.c453
-rw-r--r--kernel/cgroup_freezer.c2
-rw-r--r--kernel/cpuset.c500
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/cpuacct.c2
-rw-r--r--mm/hugetlb_cgroup.c5
-rw-r--r--mm/memcontrol.c37
-rw-r--r--net/core/netclassid_cgroup.c2
-rw-r--r--net/core/netprio_cgroup.c2
-rw-r--r--net/ipv4/tcp_memcontrol.c2
-rw-r--r--security/device_cgroup.c2
16 files changed, 806 insertions, 436 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 821de56d1580..10c949b293e4 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -599,6 +599,20 @@ fork. If this method returns 0 (success) then this should remain valid
while the caller holds cgroup_mutex and it is ensured that either
attach() or cancel_attach() will be called in future.
+void css_reset(struct cgroup_subsys_state *css)
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state. This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it. cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
(cgroup_mutex held by caller)
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 324b182e6000..4f4563277864 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -94,12 +94,35 @@ change soon.
mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT
-All controllers which are not bound to other hierarchies are
-automatically bound to unified hierarchy and show up at the root of
-it. Controllers which are enabled only in the root of unified
-hierarchy can be bound to other hierarchies at any time. This allows
-mixing unified hierarchy with the traditional multiple hierarchies in
-a fully backward compatible way.
+All controllers which support the unified hierarchy and are not bound
+to other hierarchies are automatically bound to unified hierarchy and
+show up at the root of it. Controllers which are enabled only in the
+root of unified hierarchy can be bound to other hierarchies. This
+allows mixing unified hierarchy with the traditional multiple
+hierarchies in a fully backward compatible way.
+
+For development purposes, the following boot parameter makes all
+controllers to appear on the unified hierarchy whether supported or
+not.
+
+ cgroup__DEVEL__legacy_files_on_dfl
+
+A controller can be moved across hierarchies only after the controller
+is no longer referenced in its current hierarchy. Because per-cgroup
+controller states are destroyed asynchronously and controllers may
+have lingering references, a controller may not show up immediately on
+the unified hierarchy after the final umount of the previous
+hierarchy. Similarly, a controller should be fully disabled to be
+moved out of the unified hierarchy and it may take some time for the
+disabled controller to become available for other hierarchies;
+furthermore, due to dependencies among controllers, other controllers
+may need to be disabled too.
+
+While useful for development and manual configurations, dynamically
+moving controllers between the unified and other hierarchies is
+strongly discouraged for production use. It is recommended to decide
+the hierarchies and controller associations before starting using the
+controllers.
2-2. cgroup.subtree_control
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 28d227c5ca77..e17da947f6bd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -928,7 +928,15 @@ struct cgroup_subsys blkio_cgrp_subsys = {
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
.can_attach = blkcg_can_attach,
- .base_cftypes = blkcg_files,
+ .legacy_cftypes = blkcg_files,
+#ifdef CONFIG_MEMCG
+ /*
+ * This ensures that, if available, memcg is automatically enabled
+ * together on the default hierarchy so that the owner cgroup can
+ * be retrieved from writeback pages.
+ */
+ .depends_on = 1 << memory_cgrp_id,
+#endif
};
EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
@@ -1120,7 +1128,8 @@ int blkcg_policy_register(struct blkcg_policy *pol)
/* everything is in place, add intf files for the new policy */
if (pol->cftypes)
- WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes));
+ WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
+ pol->cftypes));
ret = 0;
out_unlock:
mutex_unlock(&blkcg_pol_mutex);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 3fdb21a390c1..9273d0969ebd 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -412,13 +412,13 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
int rw;
/*
- * If sane_hierarchy is enabled, we switch to properly hierarchical
+ * If on the default hierarchy, we switch to properly hierarchical
* behavior where limits on a given throtl_grp are applied to the
* whole subtree rather than just the group itself. e.g. If 16M
* read_bps limit is set on the root group, the whole system can't
* exceed 16M for the device.
*
- * If sane_hierarchy is not enabled, the broken flat hierarchy
+ * If not on the default hierarchy, the broken flat hierarchy
* behavior is retained where all throtl_grps are treated as if
* they're all separate root groups right below throtl_data.
* Limits of a group don't interact with limits of other groups
@@ -426,7 +426,7 @@ static void throtl_pd_init(struct blkcg_gq *blkg)
*/
parent_sq = &td->service_queue;
- if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
+ if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
throtl_service_queue_init(&tg->service_queue, parent_sq);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8a111dd42d7a..b5223c570eba 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -203,7 +203,15 @@ struct cgroup {
struct kernfs_node *kn; /* cgroup kernfs entry */
struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
- /* the bitmask of subsystems enabled on the child cgroups */
+ /*
+ * The bitmask of subsystems enabled on the child cgroups.
+ * ->subtree_control is the one configured through
+ * "cgroup.subtree_control" while ->child_subsys_mask is the
+ * effective one which may have more subsystems enabled.
+ * Controller knobs are made available iff it's enabled in
+ * ->subtree_control.
+ */
+ unsigned int subtree_control;
unsigned int child_subsys_mask;
/* Private pointers for each registered subsystem */
@@ -248,73 +256,9 @@ struct cgroup {
/* cgroup_root->flags */
enum {
- /*
- * Unfortunately, cgroup core and various controllers are riddled
- * with idiosyncrasies and pointless options. The following flag,
- * when set, will force sane behavior - some options are forced on,
- * others are disallowed, and some controllers will change their
- * hierarchical or other behaviors.
- *
- * The set of behaviors affected by this flag are still being
- * determined and developed and the mount option for this flag is
- * prefixed with __DEVEL__. The prefix will be dropped once we
- * reach the point where all behaviors are compatible with the
- * planned unified hierarchy, which will automatically turn on this
- * flag.
- *
- * The followings are the behaviors currently affected this flag.
- *
- * - Mount options "noprefix", "xattr", "clone_children",
- * "release_agent" and "name" are disallowed.
- *
- * - When mounting an existing superblock, mount options should
- * match.
- *
- * - Remount is disallowed.
- *
- * - rename(2) is disallowed.
- *
- * - "tasks" is removed. Everything should be at process
- * granularity. Use "cgroup.procs" instead.
- *
- * - "cgroup.procs" is not sorted. pids will be unique unless they
- * got recycled inbetween reads.
- *
- * - "release_agent" and "notify_on_release" are removed.
- * Replacement notification mechanism will be implemented.
- *
- * - "cgroup.clone_children" is removed.
- *
- * - "cgroup.subtree_populated" is available. Its value is 0 if
- * the cgroup and its descendants contain no task; otherwise, 1.
- * The file also generates kernfs notification which can be
- * monitored through poll and [di]notify when the value of the
- * file changes.
- *
- * - If mount is requested with sane_behavior but without any
- * subsystem, the default unified hierarchy is mounted.
- *
- * - cpuset: tasks will be kept in empty cpusets when hotplug happens
- * and take masks of ancestors with non-empty cpus/mems, instead of
- * being moved to an ancestor.
- *
- * - cpuset: a task can be moved into an empty cpuset, and again it
- * takes masks of ancestors.
- *
- * - memcg: use_hierarchy is on by default and the cgroup file for
- * the flag is not created.
- *
- * - blkcg: blk-throttle becomes properly hierarchical.
- *
- * - debug: disallowed on the default hierarchy.
- */
- CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
-
+ CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
-
- /* mount options live below bit 16 */
- CGRP_ROOT_OPTION_MASK = (1 << 16) - 1,
};
/*
@@ -440,9 +384,11 @@ struct css_set {
enum {
CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
- CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */
CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
- CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */
+
+ /* internal flags, do not use outside cgroup core proper */
+ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
+ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */
};
#define MAX_CFTYPE_NAME 64
@@ -526,20 +472,64 @@ struct cftype {
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differnetly depending on the
+ * interface version.
+ *
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ * and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - Remount is disallowed.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed. Everything should be at process granularity. Use
+ * "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted. pids will be unique unless they got
+ * recycled inbetween reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed. Replacement
+ * notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
+ * and its descendants contain no task; otherwise, 1. The file also
+ * generates kernfs notification which can be monitored through poll and
+ * [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ * take masks of ancestors with non-empty cpus/mems, instead of being
+ * moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ * masks of ancestors.
+ *
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ * is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
+ */
static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
{
return cgrp->root == &cgrp_dfl_root;
}
-/*
- * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
- * function can be called as long as @cgrp is accessible.
- */
-static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
-{
- return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
-}
-
/* no synchronization, the result can only be used as a hint */
static inline bool cgroup_has_tasks(struct cgroup *cgrp)
{
@@ -602,7 +592,8 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
-int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
@@ -634,6 +625,7 @@ struct cgroup_subsys {
int (*css_online)(struct cgroup_subsys_state *css);
void (*css_offline)(struct cgroup_subsys_state *css);
void (*css_free)(struct cgroup_subsys_state *css);
+ void (*css_reset)(struct cgroup_subsys_state *css);
int (*can_attach)(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset);
@@ -682,8 +674,21 @@ struct cgroup_subsys {
*/
struct list_head cfts;
- /* base cftypes, automatically registered with subsys itself */
- struct cftype *base_cftypes;
+ /*
+ * Base cftypes which are automatically registered. The two can
+ * point to the same array.
+ */
+ struct cftype *dfl_cftypes; /* for the default hierarchy */
+ struct cftype *legacy_cftypes; /* for the legacy hierarchies */
+
+ /*
+ * A subsystem may depend on other subsystems. When such subsystem
+ * is enabled on a cgroup, the depended-upon subsystems are enabled
+ * together if available. Subsystems enabled due to dependency are
+ * not visible to userland until explicitly enabled. The following
+ * specifies the mask of subsystems that this one depends on.
+ */
+ unsigned int depends_on;
};
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index aad41f06901b..7dc8788cfd52 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root;
*/
static bool cgrp_dfl_root_visible;
+/*
+ * Set by the boot param of the same name and makes subsystems with NULL
+ * ->dfl_files to use ->legacy_files on the default hierarchy.
+ */
+static bool cgroup_legacy_files_on_dfl;
+
/* some controllers are not supported in the default hierarchy */
-static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
-#ifdef CONFIG_CGROUP_DEBUG
- | (1 << debug_cgrp_id)
-#endif
- ;
+static unsigned int cgrp_dfl_root_inhibit_ss_mask;
/* The list of hierarchy roots */
@@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1;
*/
static int need_forkexit_callback __read_mostly;
-static struct cftype cgroup_base_files[];
+static struct cftype cgroup_dfl_base_files[];
+static struct cftype cgroup_legacy_base_files[];
static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
@@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp)
}
/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * On the default hierarchy, a subsystem may request other subsystems to be
+ * enabled together through its ->depends_on mask. In such cases, more
+ * subsystems than specified in "cgroup.subtree_control" may be enabled.
+ *
+ * This function determines which subsystems need to be enabled given the
+ * current @cgrp->subtree_control and records it in
+ * @cgrp->child_subsys_mask. The resulting mask is always a superset of
+ * @cgrp->subtree_control and follows the usual hierarchy rules.
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ unsigned int cur_ss_mask = cgrp->subtree_control;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->child_subsys_mask = cur_ss_mask;
+ return;
+ }
+
+ while (true) {
+ unsigned int new_ss_mask = cur_ss_mask;
+
+ for_each_subsys(ss, ssid)
+ if (cur_ss_mask & (1 << ssid))
+ new_ss_mask |= ss->depends_on;
+
+ /*
+ * Mask out subsystems which aren't available. This can
+ * happen only if some depended-upon subsystems were bound
+ * to non-default hierarchies.
+ */
+ if (parent)
+ new_ss_mask &= parent->child_subsys_mask;
+ else
+ new_ss_mask &= cgrp->root->subsys_mask;
+
+ if (new_ss_mask == cur_ss_mask)
+ break;
+ cur_ss_mask = new_ss_mask;
+ }
+
+ cgrp->child_subsys_mask = cur_ss_mask;
+}
+
+/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
@@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
up_write(&css_set_rwsem);
src_root->subsys_mask &= ~(1 << ssid);
- src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.subtree_control &= ~(1 << ssid);
+ cgroup_refresh_child_subsys_mask(&src_root->cgrp);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
- if (dst_root != &cgrp_dfl_root)
- dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root) {
+ dst_root->cgrp.subtree_control |= 1 << ssid;
+ cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+ }
if (ss->bind)
ss->bind(css);
@@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq,
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
- seq_puts(seq, ",sane_behavior");
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
bool all_ss = false, one_ss = false;
unsigned int mask = -1U;
struct cgroup_subsys *ss;
+ int nr_opts = 0;
int i;
#ifdef CONFIG_CPUSETS
@@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
if (!*token)
return -EINVAL;
if (!strcmp(token, "none")) {
@@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- /* Consistency checks */
-
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-
- if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
- opts->cpuset_clone_children || opts->release_agent ||
- opts->name) {
- pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ if (nr_opts != 1) {
+ pr_err("sane_behavior: no other mount options allowed\n");
return -EINVAL;
}
- } else {
- /*
- * If the 'all' option was specified select all the
- * subsystems, otherwise if 'none', 'name=' and a subsystem
- * name options were not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (!ss->disabled)
- opts->subsys_mask |= (1 << i);
-
- /*
- * We either have to specify by name or by subsystems. (So
- * all empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
+ return 0;
}
/*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise if 'none', 'name=' and a subsystem name options were
+ * not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+
+ /*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
@@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
-
/* Can't specify "none" and some subsystems */
if (opts->subsys_mask && opts->none)
return -EINVAL;
@@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct cgroup_sb_opts opts;
unsigned int added_mask, removed_mask;
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: remount is not allowed\n");
+ if (root == &cgrp_dfl_root) {
+ pr_err("remount is not allowed\n");
return -EINVAL;
}
@@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
- if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+ if ((opts.flags ^ root->flags) ||
(opts.name && strcmp(opts.name, root->name))) {
pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
- root->flags & CGRP_ROOT_OPTION_MASK, root->name);
+ opts.flags, opts.name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
@@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
+ struct cftype *base_files;
struct css_set *cset;
int i, ret;
@@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
}
root_cgrp->kn = root->kf_root->kn;
- ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (root == &cgrp_dfl_root)
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(root_cgrp, base_files, true);
if (ret)
goto destroy_root;
@@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
/* look for a matching existing root */
- if (!opts.subsys_mask && !opts.none && !opts.name) {
+ if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
cgrp_dfl_root_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
@@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
- if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
- if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: new mount options should match the existing superblock\n");
- ret = -EINVAL;
- goto out_unlock;
- } else {
- pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- }
- }
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
/*
* We want to reuse @root whose lifetime is governed by its
@@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v)
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ seq_puts(seq, "0\n");
return 0;
}
@@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
return 0;
}
@@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgrp->subtree_control);
return 0;
}
@@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
loff_t off)
{
unsigned int enable = 0, disable = 0;
+ unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
for_each_subsys(ss, ssid) {
if (enable & (1 << ssid)) {
- if (cgrp->child_subsys_mask & (1 << ssid)) {
+ if (cgrp->subtree_control & (1 << ssid)) {
enable &= ~(1 << ssid);
continue;
}
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ /*
+ * @ss is already enabled through dependency and
+ * we'll just make it visible. Skip draining.
+ */
+ if (cgrp->child_subsys_mask & (1 << ssid))
+ continue;
+
/*
* Because css offlining is asynchronous, userland
* might try to re-enable the same controller while
@@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
return restart_syscall();
}
-
- /* unavailable or not enabled on the parent? */
- if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
- (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
- ret = -ENOENT;
- goto out_unlock;
- }
} else if (disable & (1 << ssid)) {
- if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
continue;
}
/* a child has it enabled? */
cgroup_for_each_live_child(child, cgrp) {
- if (child->child_subsys_mask & (1 << ssid)) {
+ if (child->subtree_control & (1 << ssid)) {
ret = -EBUSY;
goto out_unlock;
}
@@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
/*
- * Except for the root, child_subsys_mask must be zero for a cgroup
+ * Except for the root, subtree_control must be zero for a cgroup
* with tasks so that child cgroups don't compete against tasks.
*/
if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
@@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
/*
- * Create csses for enables and update child_subsys_mask. This
- * changes cgroup_e_css() results which in turn makes the
- * subsequent cgroup_update_dfl_csses() associate all tasks in the
- * subtree to the updated csses.
+ * Update subsys masks and calculate what needs to be done. More
+ * subsystems than specified may need to be enabled or disabled
+ * depending on subsystem dependencies.
+ */
+ cgrp->subtree_control |= enable;
+ cgrp->subtree_control &= ~disable;
+
+ old_ctrl = cgrp->child_subsys_mask;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ new_ctrl = cgrp->child_subsys_mask;
+
+ css_enable = ~old_ctrl & new_ctrl;
+ css_disable = old_ctrl & ~new_ctrl;
+ enable |= css_enable;
+ disable |= css_disable;
+
+ /*
+ * Create new csses or make the existing ones visible. A css is
+ * created invisible if it's being implicitly enabled through
+ * dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
*/
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
continue;
cgroup_for_each_live_child(child, cgrp) {
- ret = create_css(child, ss);
+ if (css_enable & (1 << ssid))
+ ret = create_css(child, ss,
+ cgrp->subtree_control & (1 << ssid));
+ else
+ ret = cgroup_populate_dir(child, 1 << ssid);
if (ret)
goto err_undo_css;
}
}
- cgrp->child_subsys_mask |= enable;
- cgrp->child_subsys_mask &= ~disable;
-
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
ret = cgroup_update_dfl_csses(cgrp);
if (ret)
goto err_undo_css;
- /* all tasks are now migrated away from the old csses, kill them */
+ /*
+ * All tasks are migrated out of disabled csses. Kill or hide
+ * them. A css is hidden when the userland requests it to be
+ * disabled while other subsystems are still depending on it. The
+ * css must not actively control resources and be in the vanilla
+ * state if it's made visible again later. Controllers which may
+ * be depended upon should provide ->css_reset() for this purpose.
+ */
for_each_subsys(ss, ssid) {
if (!(disable & (1 << ssid)))
continue;
- cgroup_for_each_live_child(child, cgrp)
- kill_css(cgroup_css(child, ss));
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+
+ if (css_disable & (1 << ssid)) {
+ kill_css(css);
+ } else {
+ cgroup_clear_dir(child, 1 << ssid);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
}
kernfs_activate(cgrp->kn);
@@ -2755,8 +2853,9 @@ out_unlock:
return ret ?: nbytes;
err_undo_css:
- cgrp->child_subsys_mask &= ~enable;
- cgrp->child_subsys_mask |= disable;
+ cgrp->subtree_control &= ~enable;
+ cgrp->subtree_control |= disable;
+ cgroup_refresh_child_subsys_mask(cgrp);
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
@@ -2764,8 +2863,14 @@ err_undo_css:
cgroup_for_each_live_child(child, cgrp) {
struct cgroup_subsys_state *css = cgroup_css(child, ss);
- if (css)
+
+ if (!css)
+ continue;
+
+ if (css_enable & (1 << ssid))
kill_css(css);
+ else
+ cgroup_clear_dir(child, 1 << ssid);
}
}
goto out_unlock;
@@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
/*
* This isn't a proper migration and its usefulness is very
- * limited. Disallow if sane_behavior.
+ * limited. Disallow on the default hierarchy.
*/
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return -EPERM;
/*
@@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
- if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
continue;
@@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
kfree(cft->kf_ops);
cft->kf_ops = NULL;
cft->ss = NULL;
+
+ /* revert flags set by cgroup core while adding @cfts */
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
}
}
@@