summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cgroups/00-INDEX8
-rw-r--r--Documentation/cgroups/cgroups.txt61
-rw-r--r--Documentation/cgroups/freezer-subsystem.txt63
-rw-r--r--Documentation/cgroups/net_prio.txt2
-rw-r--r--block/blk-cgroup.c15
-rw-r--r--include/linux/cgroup.h167
-rw-r--r--include/linux/freezer.h57
-rw-r--r--include/net/netprio_cgroup.h11
-rw-r--r--kernel/cgroup.c754
-rw-r--r--kernel/cgroup_freezer.c514
-rw-r--r--kernel/cpuset.c90
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/freezer.c11
-rw-r--r--kernel/power/process.c13
-rw-r--r--kernel/sched/core.c16
-rw-r--r--kernel/signal.c20
-rw-r--r--mm/hugetlb_cgroup.c23
-rw-r--r--mm/memcontrol.c191
-rw-r--r--net/core/netprio_cgroup.c260
-rw-r--r--net/sched/cls_cgroup.c28
-rw-r--r--security/device_cgroup.c20
22 files changed, 1255 insertions, 1086 deletions
diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
index 3f58fa3d6d00..f78b90a35ad0 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroups/00-INDEX
@@ -1,7 +1,11 @@
00-INDEX
- this file
+blkio-controller.txt
+ - Description for Block IO Controller, implementation and usage details.
cgroups.txt
- Control Groups definition, implementation details, examples and API.
+cgroup_event_listener.c
+ - A user program for cgroup listener.
cpuacct.txt
- CPU Accounting Controller; account CPU usage for groups of tasks.
cpusets.txt
@@ -10,9 +14,13 @@ devices.txt
- Device Whitelist Controller; description, interface and security.
freezer-subsystem.txt
- checkpointing; rationale to not use signals, interface.
+hugetlb.txt
+ - HugeTLB Controller implementation and usage details.
memcg_test.txt
- Memory Resource Controller; implementation details.
memory.txt
- Memory Resource Controller; design, accounting, interface, testing.
+net_prio.txt
+ - Network priority cgroups details and usages.
resource_counter.txt
- Resource Counter API.
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 9e04196c4d78..bcf1a00b06a1 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -299,11 +299,9 @@ a cgroup hierarchy's release_agent path is empty.
1.5 What does clone_children do ?
---------------------------------
-If the clone_children flag is enabled (1) in a cgroup, then all
-cgroups created beneath will call the post_clone callbacks for each
-subsystem of the newly created cgroup. Usually when this callback is
-implemented for a subsystem, it copies the values of the parent
-subsystem, this is the case for the cpuset.
+This flag only affects the cpuset controller. If the clone_children
+flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
+configuration from the parent during initialization.
1.6 How do I use cgroups ?
--------------------------
@@ -553,16 +551,16 @@ call to cgroup_unload_subsys(). It should also set its_subsys.module =
THIS_MODULE in its .c file.
Each subsystem may export the following methods. The only mandatory
-methods are create/destroy. Any others that are null are presumed to
+methods are css_alloc/free. Any others that are null are presumed to
be successful no-ops.
-struct cgroup_subsys_state *create(struct cgroup *cgrp)
+struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
(cgroup_mutex held by caller)
-Called to create a subsystem state object for a cgroup. The
+Called to allocate a subsystem state object for a cgroup. The
subsystem should allocate its subsystem state object for the passed
cgroup, returning a pointer to the new object on success or a
-negative error code. On success, the subsystem pointer should point to
+ERR_PTR() value. On success, the subsystem pointer should point to
a structure of type cgroup_subsys_state (typically embedded in a
larger subsystem-specific object), which will be initialized by the
cgroup system. Note that this will be called at initialization to
@@ -571,24 +569,33 @@ identified by the passed cgroup object having a NULL parent (since
it's the root of the hierarchy) and may be an appropriate place for
initialization code.
-void destroy(struct cgroup *cgrp)
+int css_online(struct cgroup *cgrp)
(cgroup_mutex held by caller)
-The cgroup system is about to destroy the passed cgroup; the subsystem
-should do any necessary cleanup and free its subsystem state
-object. By the time this method is called, the cgroup has already been
-unlinked from the file system and from the child list of its parent;
-cgroup->parent is still valid. (Note - can also be called for a
-newly-created cgroup if an error occurs after this subsystem's
-create() method has been called for the new cgroup).
+Called after @cgrp successfully completed all allocations and made
+visible to cgroup_for_each_child/descendant_*() iterators. The
+subsystem may choose to fail creation by returning -errno. This
+callback can be used to implement reliable state sharing and
+propagation along the hierarchy. See the comment on
+cgroup_for_each_descendant_pre() for details.
-int pre_destroy(struct cgroup *cgrp);
+void css_offline(struct cgroup *cgrp);
-Called before checking the reference count on each subsystem. This may
-be useful for subsystems which have some extra references even if
-there are not tasks in the cgroup. If pre_destroy() returns error code,
-rmdir() will fail with it. From this behavior, pre_destroy() can be
-called multiple times against a cgroup.
+This is the counterpart of css_online() and called iff css_online()
+has succeeded on @cgrp. This signifies the beginning of the end of
+@cgrp. @cgrp is being removed and the subsystem should start dropping
+all references it's holding on @cgrp. When all references are dropped,
+cgroup removal will proceed to the next step - css_free(). After this
+callback, @cgrp should be considered dead to the subsystem.
+
+void css_free(struct cgroup *cgrp)
+(cgroup_mutex held by caller)
+
+The cgroup system is about to free @cgrp; the subsystem should free
+its subsystem state object. By the time this method is called, @cgrp
+is completely unused; @cgrp->parent is still valid. (Note - can also
+be called for a newly-created cgroup if an error occurs after this
+subsystem's create() method has been called for the new cgroup).
int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
(cgroup_mutex held by caller)
@@ -635,14 +642,6 @@ void exit(struct task_struct *task)
Called during task exit.
-void post_clone(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called during cgroup_create() to do any parameter
-initialization which might be required before a task could attach. For
-example, in cpusets, no task may attach before 'cpus' and 'mems' are set
-up.
-
void bind(struct cgroup *root)
(cgroup_mutex held by caller)
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt
index 7e62de1e59ff..c96a72cbb30a 100644
--- a/Documentation/cgroups/freezer-subsystem.txt
+++ b/Documentation/cgroups/freezer-subsystem.txt
@@ -49,13 +49,49 @@ prevent the freeze/unfreeze cycle from becoming visible to the tasks
being frozen. This allows the bash example above and gdb to run as
expected.
-The freezer subsystem in the container filesystem defines a file named
-freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the
-cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup.
-Reading will return the current state.
+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
+tasks beloning to the cgroup and all its descendant cgroups. Each
+cgroup has its own state (self-state) and the state inherited from the
+parent (parent-state). Iff both states are THAWED, the cgroup is
+THAWED.
-Note freezer.state doesn't exist in root cgroup, which means root cgroup
-is non-freezable.
+The following cgroupfs files are created by cgroup freezer.
+
+* freezer.state: Read-write.
+
+ When read, returns the effective state of the cgroup - "THAWED",
+ "FREEZING" or "FROZEN". This is the combined self and parent-states.
+ If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
+
+ FREEZING cgroup transitions into FROZEN state when all tasks
+ belonging to the cgroup and its descendants become frozen. Note that
+ a cgroup reverts to FREEZING from FROZEN after a new task is added
+ to the cgroup or one of its descendant cgroups until the new task is
+ frozen.
+
+ When written, sets the self-state of the cgroup. Two values are
+ allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
+ if not already freezing, enters FREEZING state along with all its
+ descendant cgroups.
+
+ If THAWED is written, the self-state of the cgroup is changed to
+ THAWED. Note that the effective state may not change to THAWED if
+ the parent-state is still freezing. If a cgroup's effective state
+ becomes THAWED, all its descendants which are freezing because of
+ the cgroup also leave the freezing state.
+
+* freezer.self_freezing: Read only.
+
+ Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
+ This value is 1 iff the last write to freezer.state was "FROZEN".
+
+* freezer.parent_freezing: Read only.
+
+ Shows the parent-state. 0 if none of the cgroup's ancestors is
+ frozen; otherwise, 1.
+
+The root cgroup is non-freezable and the above interface files don't
+exist.
* Examples of usage :
@@ -85,18 +121,3 @@ to unfreeze all tasks in the container :
This is the basic mechanism which should do the right thing for user space task
in a simple scenario.
-
-It's important to note that freezing can be incomplete. In that case we return
-EBUSY. This means that some tasks in the cgroup are busy doing something that
-prevents us from completely freezing the cgroup at this time. After EBUSY,
-the cgroup will remain partially frozen -- reflected by freezer.state reporting
-"FREEZING" when read. The state will remain "FREEZING" until one of these
-things happens:
-
- 1) Userspace cancels the freezing operation by writing "THAWED" to
- the freezer.state file
- 2) Userspace retries the freezing operation by writing "FROZEN" to
- the freezer.state file (writing "FREEZING" is not legal
- and returns EINVAL)
- 3) The tasks that blocked the cgroup from entering the "FROZEN"
- state disappear from the cgroup's set of tasks.
diff --git a/Documentation/cgroups/net_prio.txt b/Documentation/cgroups/net_prio.txt
index 01b322635591..a82cbd28ea8a 100644
--- a/Documentation/cgroups/net_prio.txt
+++ b/Documentation/cgroups/net_prio.txt
@@ -51,3 +51,5 @@ One usage for the net_prio cgroup is with mqprio qdisc allowing application
traffic to be steered to hardware/driver based traffic classes. These mappings
can then be managed by administrators or other networking protocols such as
DCBX.
+
+A new net_prio cgroup inherits the parent's configuration.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d0b770391ad4..3f6d39d23bb6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -600,7 +600,7 @@ struct cftype blkcg_files[] = {
};
/**
- * blkcg_pre_destroy - cgroup pre_destroy callback
+ * blkcg_css_offline - cgroup css_offline callback
* @cgroup: cgroup of interest
*
* This function is called when @cgroup is about to go away and responsible
@@ -610,7 +610,7 @@ struct cftype blkcg_files[] = {
*
* This is the blkcg counterpart of ioc_release_fn().
*/
-static int blkcg_pre_destroy(struct cgroup *cgroup)
+static void blkcg_css_offline(struct cgroup *cgroup)
{
struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
@@ -632,10 +632,9 @@ static int blkcg_pre_destroy(struct cgroup *cgroup)
}
spin_unlock_irq(&blkcg->lock);
- return 0;
}
-static void blkcg_destroy(struct cgroup *cgroup)
+static void blkcg_css_free(struct cgroup *cgroup)
{
struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
@@ -643,7 +642,7 @@ static void blkcg_destroy(struct cgroup *cgroup)
kfree(blkcg);
}
-static struct cgroup_subsys_state *blkcg_create(struct cgroup *cgroup)
+static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
{
static atomic64_t id_seq = ATOMIC64_INIT(0);
struct blkcg *blkcg;
@@ -740,10 +739,10 @@ static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
struct cgroup_subsys blkio_subsys = {
.name = "blkio",
- .create = blkcg_create,
+ .css_alloc = blkcg_css_alloc,
+ .css_offline = blkcg_css_offline,
+ .css_free = blkcg_css_free,
.can_attach = blkcg_can_attach,
- .pre_destroy = blkcg_pre_destroy,
- .destroy = blkcg_destroy,
.subsys_id = blkio_subsys_id,
.base_cftypes = blkcg_files,
.module = THIS_MODULE,
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f8a030ced0c7..7d73905dcba2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -12,6 +12,7 @@
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/rcupdate.h>
+#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/prio_heap.h>
#include <linux/rwsem.h>
@@ -34,7 +35,6 @@ extern int cgroup_lock_is_held(void);
extern bool cgroup_lock_live_group(struct cgroup *cgrp);
extern void cgroup_unlock(void);
extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_fork_callbacks(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
extern int cgroupstats_build(struct cgroupstats *stats,
@@ -66,7 +66,7 @@ struct cgroup_subsys_state {
/*
* State maintained by the cgroup system to allow subsystems
* to be "busy". Should be accessed via css_get(),
- * css_tryget() and and css_put().
+ * css_tryget() and css_put().
*/
atomic_t refcnt;
@@ -81,9 +81,8 @@ struct cgroup_subsys_state {
/* bits in struct cgroup_subsys_state flags field */
enum {
- CSS_ROOT, /* This CSS is the root of the subsystem */
- CSS_REMOVED, /* This CSS is dead */
- CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */
+ CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */
+ CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
};
/* Caller must verify that the css is not for root cgroup */
@@ -102,15 +101,10 @@ static inline void __css_get(struct cgroup_subsys_state *css, int count)
static inline void css_get(struct cgroup_subsys_state *css)
{
/* We don't need to reference count the root state */
- if (!test_bit(CSS_ROOT, &css->flags))
+ if (!(css->flags & CSS_ROOT))
__css_get(css, 1);
}
-static inline bool css_is_removed(struct cgroup_subsys_state *css)
-{
- return test_bit(CSS_REMOVED, &css->flags);
-}
-
/*
* Call css_tryget() to take a reference on a css if your existing
* (known-valid) reference isn't already ref-counted. Returns false if
@@ -120,7 +114,7 @@ static inline bool css_is_removed(struct cgroup_subsys_state *css)
extern bool __css_tryget(struct cgroup_subsys_state *css);
static inline bool css_tryget(struct cgroup_subsys_state *css)
{
- if (test_bit(CSS_ROOT, &css->flags))
+ if (css->flags & CSS_ROOT)
return true;
return __css_tryget(css);
}
@@ -133,7 +127,7 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
extern void __css_put(struct cgroup_subsys_state *css);
static inline void css_put(struct cgroup_subsys_state *css)
{
- if (!test_bit(CSS_ROOT, &css->flags))
+ if (!(css->flags & CSS_ROOT))
__css_put(css);
}
@@ -149,13 +143,11 @@ enum {
/* Control Group requires release notifications to userspace */
CGRP_NOTIFY_ON_RELEASE,
/*
- * A thread in rmdir() is wating for this cgroup.
- */
- CGRP_WAIT_ON_RMDIR,
- /*
- * Clone cgroup values when creating a new child cgroup
+ * Clone the parent's configuration when creating a new child
+ * cpuset cgroup. For historical reasons, this option can be
+ * specified at mount time and thus is implemented here.
*/
- CGRP_CLONE_CHILDREN,
+ CGRP_CPUSET_CLONE_CHILDREN,
};
struct cgroup {
@@ -167,6 +159,8 @@ struct cgroup {
*/
atomic_t count;
+ int id; /* ida allocated in-hierarchy ID */
+
/*
* We link our 'sibling' struct into our parent's 'children'.
* Our children link their 'sibling' into our 'children'.
@@ -176,7 +170,7 @@ struct cgroup {
struct list_head files; /* my files */
struct cgroup *parent; /* my parent */
- struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */
+ struct dentry *dentry; /* cgroup fs entry, RCU protected */
/* Private pointers for each registered subsystem */
struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
@@ -282,7 +276,7 @@ struct cgroup_map_cb {
/* cftype->flags */
#define CFTYPE_ONLY_ON_ROOT (1U << 0) /* only create on root cg */
-#define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create onp root cg */
+#define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create on root cg */
#define MAX_CFTYPE_NAME 64
@@ -422,23 +416,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
/*
- * When the subsys has to access css and may add permanent refcnt to css,
- * it should take care of racy conditions with rmdir(). Following set of
- * functions, is for stop/restart rmdir if necessary.
- * Because these will call css_get/put, "css" should be alive css.
- *
- * cgroup_exclude_rmdir();
- * ...do some jobs which may access arbitrary empty cgroup
- * cgroup_release_and_wakeup_rmdir();
- *
- * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
- * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
- */
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
-
-/*
* Control Group taskset, used to pass around set of tasks to cgroup_subsys
* methods.
*/
@@ -466,16 +443,17 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
*/
struct cgroup_subsys {
- struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
- int (*pre_destroy)(struct cgroup *cgrp);
- void (*destroy)(struct cgroup *cgrp);
+ struct cgroup_subsys_state *(*css_alloc)(struct cgroup *cgrp);
+ int (*css_online)(struct cgroup *cgrp);
+ void (*css_offline)(struct cgroup *cgrp);
+ void (*css_free)(struct cgroup *cgrp);
+
int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
void (*fork)(struct task_struct *task);
void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
struct task_struct *task);
- void (*post_clone)(struct cgroup *cgrp);
void (*bind)(struct cgroup *root);
int subsys_id;
@@ -489,17 +467,6 @@ struct cgroup_subsys {
bool use_id;
/*
- * If %true, cgroup removal will try to clear css refs by retrying
- * ss->pre_destroy() until there's no css ref left. This behavior
- * is strictly for backward compatibility and will be removed as
- * soon as the current user (memcg) is updated.
- *
- * If %false, ss->pre_destroy() can't fail and cgroup removal won't
- * wait for css refs to drop to zero before proceeding.
- */
- bool __DEPRECATED_clear_css_refs;
-
- /*
* If %false, this subsystem is properly hierarchical -
* configuration, resource accounting and restriction on a parent
* cgroup cover those of its children. If %true, hierarchy support
@@ -572,6 +539,100 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
return task_subsys_state(task, subsys_id)->cgroup;
}
+/**
+ * cgroup_for_each_child - iterate through children of a cgroup
+ * @pos: the cgroup * to use as the loop cursor
+ * @cgroup: cgroup whose children to walk
+ *
+ * Walk @cgroup's children. Must be called under rcu_read_lock(). A child
+ * cgroup which hasn't finished ->css_online() or already has finished
+ * ->css_offline() may show up during traversal and it's each subsystem's
+ * responsibility to verify that each @pos is alive.
+ *
+ * If a subsystem synchronizes against the parent in its ->css_online() and
+ * before starting iterating, a cgroup which finished ->css_online() is
+ * guaranteed to be visible in the future iterations.
+ */
+#define cgroup_for_each_child(pos, cgroup) \
+ list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
+
+struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
+ struct cgroup *cgroup);
+
+/**
+ * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants
+ * @pos: the cgroup * to use as the loop cursor
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * Walk @cgroup's descendants. Must be called under rcu_read_lock(). A
+ * descendant cgroup which hasn't finished ->css_online() or already has
+ * finished ->css_offline() may show up during traversal and it's each
+ * subsystem's responsibility to verify that each @pos is alive.
+ *
+ * If a subsystem synchronizes against the parent in its ->css_online() and
+ * before starting iterating, and synchronizes against @pos on each
+ * iteration, any descendant cgroup which finished ->css_offline() is
+ * guaranteed to be visible in the future iterations.
+ *
+ * In other words, the following guarantees that a descendant can't escape
+ * state updates of its ancestors.
+ *
+ * my_online(@cgrp)
+ * {
+ * Lock @cgrp->parent and @cgrp;
+ * Inherit state from @cgrp->parent;
+ * Unlock both.
+ * }
+ *
+ * my_update_state(@cgrp)
+ * {
+ * Lock @cgrp;
+ * Update @cgrp's state;
+ * Unlock @cgrp;
+ *
+ * cgroup_for_each_descendant_pre(@pos, @cgrp) {
+ * Lock @pos;
+ * Verify @pos is alive and inherit state from @pos->parent;
+ * Unlock @pos;
+ * }
+ * }
+ *
+ * As long as the inheriting step, including checking the parent state, is
+ * enclosed inside @pos locking, double-locking the parent isn't necessary
+ * while inheriting. The state update to the parent is guaranteed to be
+ * visible by walking order and, as long as inheriting operations to the
+ * same @pos are atomic to each other, multiple updates racing each other
+ * still result in the correct state. It's guaranateed that at least one
+ * inheritance happens for any cgroup after the latest update to its
+ * parent.
+ *
+ * If checking parent's state requires locking the parent, each inheriting
+ * iteration should lock and unlock both @pos->parent and @pos.
+ *
+ * Alternatively, a subsystem may choose to use a single global lock to
+ * synchronize ->css_online() and ->css_offline() against tree-walking
+ * operations.
+ */
+#define cgroup_for_each_descendant_pre(pos, cgroup) \
+ for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \
+ pos = cgroup_next_descendant_pre((pos), (cgroup)))
+
+struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
+ struct cgroup *cgroup);
+
+/**
+ * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants
+ * @pos: the cgroup * to use as the loop cursor
+ * @cgroup: cgroup whose descendants to walk
+ *
+ * Similar to cgroup_for_each_descendant_pre() but performs post-order
+ * traversal instead. Note that the walk visibility guarantee described in
+ * pre-order walk doesn't apply the same to post-order walks.
+ */
+#define cgroup_for_each_descendant_post(pos, cgroup) \
+ for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos); \
+ pos = cgroup_next_descendant_post((pos), (cgroup)))
+
/* A cgroup_iter should be treated as an opaque object */
struct cgroup_iter {
struct list_head *cg_link;
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index b90091af5798..e4238ceaa4d6 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -75,35 +75,68 @@ static inline bool cgroup_freezing(struct task_struct *task)
*/
-/* Tell the freezer not to count the current task as freezable. */
+/**
+ * freezer_do_not_count - tell freezer to ignore %current
+ *
+ * Tell freezers to ignore the current task when determining whether the
+ * target frozen state is reached. IOW, the current task will be
+ * considered frozen enough by freezers.
+ *
+ * The caller shouldn't do anything which isn't allowed for a frozen task
+ * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair
+ * wrap a scheduling operation and nothing much else.
+ */
static inline void freezer_do_not_count(void)
{
current->flags |= PF_FREEZER_SKIP;
}
-/*
- * Tell the freezer to count the current task as freezable again and try to
- * freeze it.
+/**
+ * freezer_count - tell freezer to stop ignoring %current
+ *
+ * Undo freezer_do_not_count(). It tells freezers that %current should be
+ * considered again and tries to freeze if freezing condition is already in
+ * effect.
*/
static inline void freezer_count(void)
{
current->flags &= ~PF_FREEZER_SKIP;
+ /*
+ * If freezing is in progress, the following paired with smp_mb()
+ * in freezer_should_skip() ensures that either we see %true
+ * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
+ */
+ smp_mb();
try_to_freeze();
}
-/*
- * Check if the task should be counted as freezable by the freezer
+/**
+ * freezer_should_skip - whether to skip a task when determining frozen
+ * state is reached
+ * @p: task in quesion
+ *
+ * This function is used by freezers after establishing %true freezing() to
+ * test whether a task should be skipped when determining the target frozen
+ * state is reached. IOW, if this function returns %true, @p is considered
+ * frozen enough.
*/
-static inline int freezer_should_skip(struct task_struct *p)
+static inline bool freezer_should_skip(struct task_struct *p)
{
- return !!(p->flags & PF_FREEZER_SKIP);
+ /*
+ * The following smp_mb() paired with the one in freezer_count()
+ * ensures that either freezer_count() sees %true freezing() or we
+ * see cleared %PF_FREEZER_SKIP and return %false. This makes it
+ * impossible for a task to slip frozen state testing after
+ * clearing %PF_FREEZER_SKIP.
+ */
+ smp_mb();
+ return p->flags & PF_FREEZER_SKIP;
}
/*
- * These macros are intended to be used whenever you want allow a task that's
- * sleeping in TASK_UNINTERRUPTIBLE or TASK_KILLABLE state to be frozen. Note
- * that neither return any clear indication of whether a freeze event happened
- * while in this function.
+ * These macros are intended to be used whenever you want allow a sleeping
+ * task to be frozen. Note that neither return any clear indication of
+ * whether a freeze event happened while in this function.
*/
/* Like schedule(), but should not block the freezer. */
diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h
index 2760f4f4ae9b..1d04b6f0fbd4 100644
--- a/include/net/netprio_cgroup.h
+++ b/include/net/netprio_cgroup.h
@@ -27,7 +27,6 @@ struct netprio_map {
struct cgroup_netprio_state {
struct cgroup_subsys_state css;
- u32 prioidx;
};
extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task);
@@ -36,13 +35,12 @@ extern void sock_update_netprioidx(struct sock *sk, struct task_struct *task);
static inline u32 task_netprioidx(struct task_struct *p)
{
- struct cgroup_netprio_state *state;
+ struct cgroup_subsys_state *css;
u32 idx;
rcu_read_lock();
- state = container_of(task_subsys_state(p, net_prio_subsys_id),
- struct cgroup_netprio_state, css);
- idx = state->prioidx;
+ css = task_subsys_state(p, net_prio_subsys_id);
+ idx = css->cgroup->id;
rcu_read_unlock();
return idx;
}
@@ -57,8 +55,7 @@ static inline u32 task_netprioidx(struct task_struct *p)
rcu_read_lock();
css = task_subsys_state(p, net_prio_subsys_id);
if (css)
- idx = container_of(css,
- struct cgroup_netprio_state, css)->prioidx;
+ idx = css->cgroup->id;
rcu_read_unlock();
return idx;
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..f34c41bfaa37 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,6 +138,9 @@ struct cgroupfs_root {
/* Hierarchy-specific flags */
unsigned long flags;
+ /* IDs for cgroups in this hierarchy */
+ struct ida cgroup_ida;
+
/* The path to use for release notifications. */
char release_agent_path[PATH_MAX];
@@ -171,8 +174,8 @@ struct css_id {
* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
+ * is called after synchronize_rcu(). But for safe use, css_tryget()
+ * should be used for avoiding race.
*/
struct cgroup_subsys_state __rcu *css;
/*
@@ -242,6 +245,10 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
*/
static int need_forkexit_callback __read_mostly;
+static int cgroup_destroy_locked(struct cgroup *cgrp);
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+ struct cftype cfts[], bool is_add);
+
#ifdef CONFIG_PROVE_LOCKING
int cgroup_lock_is_held(void)
{
@@ -294,11 +301,6 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
-static int clone_children(const struct cgroup *cgrp)
-{
- return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
-}
-
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -782,12 +784,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* The task_lock() exception
*
* The need for this exception arises from the action of
- * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
* task->cgroup without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroup pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
@@ -854,30 +856,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
return inode;
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- int ret = 0;
-
- for_each_subsys(cgrp->root, ss) {
- if (!ss->pre_destroy)
- continue;
-
- ret = ss->pre_destroy(cgrp);
- if (ret) {
- /* ->pre_destroy() failure is being deprecated */
- WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
- break;
- }
- }
-
- return ret;
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -898,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
* Release the subsystem state objects.
*/
for_each_subsys(cgrp->root, ss)
- ss->destroy(cgrp);
+ ss->css_free(cgrp);
cgrp->root->number_of_cgroups--;
mutex_unlock(&cgroup_mutex);
@@ -917,6 +895,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
simple_xattrs_free(&cgrp->xattrs);
+ ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
kfree_rcu(cgrp, rcu_head);
} else {
struct cfent *cfe = __d_cfe(dentry);
@@ -987,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
if (!test_bit(ss->subsys_id, &subsys_mask))
continue;
list_for_each_entry(set, &ss->cftsets, node)
- cgroup_rm_file(cgrp, set->cfts);
+ cgroup_addrm_files(cgrp, NULL, set->cfts, false);
}
if (base_files) {
while (!list_empty(&cgrp->files))
@@ -1015,33 +994,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
}
/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
-/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
* returns an error, no reference counts are to