diff options
-rw-r--r-- | Documentation/admin-guide/cgroup-v2.rst | 27 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 33 | ||||
-rw-r--r-- | include/linux/cgroup.h | 43 | ||||
-rw-r--r-- | include/linux/sched.h | 2 | ||||
-rw-r--r-- | include/linux/sched/jobctl.h | 2 | ||||
-rw-r--r-- | include/trace/events/cgroup.h | 55 | ||||
-rw-r--r-- | kernel/cgroup/Makefile | 4 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-internal.h | 8 | ||||
-rw-r--r-- | kernel/cgroup/cgroup-v1.c | 16 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 152 | ||||
-rw-r--r-- | kernel/cgroup/debug.c | 8 | ||||
-rw-r--r-- | kernel/cgroup/freezer.c | 639 | ||||
-rw-r--r-- | kernel/cgroup/legacy_freezer.c | 481 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | kernel/signal.c | 66 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/.gitignore | 1 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/Makefile | 2 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/cgroup_util.c | 58 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/cgroup_util.h | 5 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/test_freezer.c | 851 |
20 files changed, 2012 insertions, 443 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 20f92c16ffbf..88e746074252 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -864,6 +864,8 @@ All cgroup core files are prefixed with "cgroup." populated 1 if the cgroup or its descendants contains any live processes; otherwise, 0. + frozen + 1 if the cgroup is frozen; otherwise, 0. cgroup.max.descendants A read-write single value files. The default is "max". @@ -897,6 +899,31 @@ All cgroup core files are prefixed with "cgroup." A dying cgroup can consume system resources not exceeding limits, which were active at the moment of cgroup deletion. + cgroup.freeze + A read-write single value file which exists on non-root cgroups. + Allowed values are "0" and "1". The default is "0". + + Writing "1" to the file causes freezing of the cgroup and all + descendant cgroups. This means that all belonging processes will + be stopped and will not run until the cgroup will be explicitly + unfrozen. Freezing of the cgroup may take some time; when this action + is completed, the "frozen" value in the cgroup.events control file + will be updated to "1" and the corresponding notification will be + issued. + + A cgroup can be frozen either by its own settings, or by settings + of any ancestor cgroups. If any of ancestor cgroups is frozen, the + cgroup will remain frozen. + + Processes in the frozen cgroup can be killed by a fatal signal. + They also can enter and leave a frozen cgroup: either by an explicit + move by a user, or if freezing of the cgroup races with fork(). + If a process is moved to a frozen cgroup, it stops. If a process is + moved out of a frozen cgroup, it becomes running. + + Frozen status of a cgroup doesn't affect any cgroup tree operations: + it's possible to delete a frozen (and empty) cgroup, as well as + create new sub-cgroups. 
Controllers =========== diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1c70803e9f77..77258d276f93 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -65,6 +65,12 @@ enum { * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, + + /* Control group has to be frozen. */ + CGRP_FREEZE, + + /* Cgroup is frozen. */ + CGRP_FROZEN, }; /* cgroup_root->flags */ @@ -317,6 +323,25 @@ struct cgroup_rstat_cpu { struct cgroup *updated_next; /* NULL iff not on the list */ }; +struct cgroup_freezer_state { + /* Should the cgroup and its descendants be frozen. */ + bool freeze; + + /* Should the cgroup actually be frozen? */ + int e_freeze; + + /* Fields below are protected by css_set_lock */ + + /* Number of frozen descendant cgroups */ + int nr_frozen_descendants; + + /* + * Number of tasks, which are counted as frozen: + * frozen, SIGSTOPped, and PTRACEd. + */ + int nr_frozen_tasks; +}; + struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; @@ -349,6 +374,11 @@ struct cgroup { * Dying cgroups are cgroups which were deleted by a user, * but are still existing because someone else is holding a reference. * max_descendants is a maximum allowed number of descent cgroups. + * + * nr_descendants and nr_dying_descendants are protected + * by cgroup_mutex and css_set_lock. It's fine to read them holding + * any of cgroup_mutex and css_set_lock; for writing both locks + * should be held. */ int nr_descendants; int nr_dying_descendants; @@ -448,6 +478,9 @@ struct cgroup { /* If there is block congestion on this cgroup. 
*/ atomic_t congestion_count; + /* Used to store internal freezer state */ + struct cgroup_freezer_state freezer; + /* ids of the ancestors at each level including self */ int ancestor_ids[]; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 81f58b4a5418..c0077adeea83 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -881,4 +881,47 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns) free_cgroup_ns(ns); } +#ifdef CONFIG_CGROUPS + +void cgroup_enter_frozen(void); +void cgroup_leave_frozen(bool always_leave); +void cgroup_update_frozen(struct cgroup *cgrp); +void cgroup_freeze(struct cgroup *cgrp, bool freeze); +void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, + struct cgroup *dst); + +static inline bool cgroup_task_freeze(struct task_struct *task) +{ + bool ret; + + if (task->flags & PF_KTHREAD) + return false; + + rcu_read_lock(); + ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags); + rcu_read_unlock(); + + return ret; +} + +static inline bool cgroup_task_frozen(struct task_struct *task) +{ + return task->frozen; +} + +#else /* !CONFIG_CGROUPS */ + +static inline void cgroup_enter_frozen(void) { } +static inline void cgroup_leave_frozen(bool always_leave) { } +static inline bool cgroup_task_freeze(struct task_struct *task) +{ + return false; +} +static inline bool cgroup_task_frozen(struct task_struct *task) +{ + return false; +} + +#endif /* !CONFIG_CGROUPS */ + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 50606a6e73d6..a2cd15855bad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -726,6 +726,8 @@ struct task_struct { #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; + /* task is frozen/stopped (used by the cgroup freezer) */ + unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP /* to be used once the psi infrastructure lands upstream. 
*/ diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index 98228bd48aee..fa067de9f1a9 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -18,6 +18,7 @@ struct task_struct; #define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */ #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ +#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -26,6 +27,7 @@ struct task_struct; #define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) +#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index a401ff5e7847..a566cc521476 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -103,6 +103,20 @@ DEFINE_EVENT(cgroup, cgroup_rename, TP_ARGS(cgrp, path) ); +DEFINE_EVENT(cgroup, cgroup_freeze, + + TP_PROTO(struct cgroup *cgrp, const char *path), + + TP_ARGS(cgrp, path) +); + +DEFINE_EVENT(cgroup, cgroup_unfreeze, + + TP_PROTO(struct cgroup *cgrp, const char *path), + + TP_ARGS(cgrp, path) +); + DECLARE_EVENT_CLASS(cgroup_migrate, TP_PROTO(struct cgroup *dst_cgrp, const char *path, @@ -149,6 +163,47 @@ DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks, TP_ARGS(dst_cgrp, path, task, threadgroup) ); +DECLARE_EVENT_CLASS(cgroup_event, + + TP_PROTO(struct cgroup *cgrp, const char *path, int val), + + TP_ARGS(cgrp, path, val), + + TP_STRUCT__entry( + __field( int, root ) + __field( int, id ) + __field( int, level ) + __string( path, path ) + __field( int, val ) + ), + + TP_fast_assign( 
+ __entry->root = cgrp->root->hierarchy_id; + __entry->id = cgrp->id; + __entry->level = cgrp->level; + __assign_str(path, path); + __entry->val = val; + ), + + TP_printk("root=%d id=%d level=%d path=%s val=%d", + __entry->root, __entry->id, __entry->level, __get_str(path), + __entry->val) +); + +DEFINE_EVENT(cgroup_event, cgroup_notify_populated, + + TP_PROTO(struct cgroup *cgrp, const char *path, int val), + + TP_ARGS(cgrp, path, val) +); + +DEFINE_EVENT(cgroup_event, cgroup_notify_frozen, + + TP_PROTO(struct cgroup *cgrp, const char *path, int val), + + TP_ARGS(cgrp, path, val) +); + #endif /* _TRACE_CGROUP_H */ /* This part must be outside protection */ diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index bfcdae896122..5d7a76bfbbb7 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o -obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 30e39f3932ad..809e34a3c017 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -28,12 +28,15 @@ extern void __init enable_debug_cgroup(void); #define TRACE_CGROUP_PATH(type, cgrp, ...) 
\ do { \ if (trace_cgroup_##type##_enabled()) { \ - spin_lock(&trace_cgroup_path_lock); \ + unsigned long flags; \ + spin_lock_irqsave(&trace_cgroup_path_lock, \ + flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ - spin_unlock(&trace_cgroup_path_lock); \ + spin_unlock_irqrestore(&trace_cgroup_path_lock, \ + flags); \ } \ } while (0) @@ -240,6 +243,7 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c126b34fd4ff..68ca5de7ec27 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -342,22 +342,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, return l; } -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += link->cset->nr_tasks; - spin_unlock_irq(&css_set_lock); - return count; -} - /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3f2b4bde0f9c..327f37c9fdfa 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -593,6 +593,39 @@ static void cgroup_get_live(struct cgroup *cgrp) css_get(&cgrp->self); } +/** + * __cgroup_task_count - count the number of tasks in a cgroup. The caller + * is responsible for taking the css_set_lock. 
+ * @cgrp: the cgroup in question + */ +int __cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += link->cset->nr_tasks; + + return count; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +int cgroup_task_count(const struct cgroup *cgrp) +{ + int count; + + spin_lock_irq(&css_set_lock); + count = __cgroup_task_count(cgrp); + spin_unlock_irq(&css_set_lock); + + return count; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -783,6 +816,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) break; cgroup1_check_for_release(cgrp); + TRACE_CGROUP_PATH(notify_populated, cgrp, + cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; @@ -2402,8 +2437,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); - put_css_set_locked(from_cset); from_cset->nr_tasks--; + /* + * If the source or destination cgroup is frozen, + * the task might require to change its state. 
+ */ + cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, + to_cset->dfl_cgrp); + put_css_set_locked(from_cset); + } } spin_unlock_irq(&css_set_lock); @@ -2602,7 +2644,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) - goto err; + return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); @@ -2634,9 +2676,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) } return 0; -err: - cgroup_migrate_finish(mgctx); - return -ENOMEM; } /** @@ -3447,8 +3486,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "populated %d\n", - cgroup_is_populated(seq_css(seq)->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); + seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); + return 0; } @@ -3510,6 +3552,40 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) } #endif +static int cgroup_freeze_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgrp->freezer.freeze); + + return 0; +} + +static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int freeze; + + ret = kstrtoint(strstrip(buf), 0, &freeze); + if (ret) + return ret; + + if (freeze < 0 || freeze > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgroup_freeze(cgrp, freeze); + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of->kn->priv; @@ -4654,6 +4730,12 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_stat_show, }, { + .name = "cgroup.freeze", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = 
cgroup_freeze_show, + .write = cgroup_freeze_write, + }, + { .name = "cpu.stat", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, @@ -4781,9 +4863,11 @@ static void css_release_work_fn(struct work_struct *work) if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; + spin_unlock_irq(&css_set_lock); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5001,12 +5085,31 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + /* + * New cgroup inherits effective freeze counter, and + * if the parent has to be frozen, the child has too. + */ + cgrp->freezer.e_freeze = parent->freezer.e_freeze; + if (cgrp->freezer.e_freeze) + set_bit(CGRP_FROZEN, &cgrp->flags); + + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - if (tcgrp != cgrp) + if (tcgrp != cgrp) { tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups + * get a new frozen descendant, but their state can't + * change because of this. + */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } } + spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5291,10 +5394,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; + /* + * If the dying cgroup is frozen, decrease frozen descendants + * counters of ancestor cgroups. 
+ */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + tcgrp->freezer.nr_frozen_descendants--; } + spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); @@ -5746,6 +5857,26 @@ void cgroup_post_fork(struct task_struct *child) cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } + + /* + * If the cgroup has to be frozen, the new task has too. + * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get + * the task into the frozen state. + */ + if (unlikely(cgroup_task_freeze(child))) { + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later + * from do_freezer_trap(). So we avoid cgroup's + * transient switch from the frozen state and back. + */ + } + spin_unlock_irq(&css_set_lock); } @@ -5794,6 +5925,11 @@ void cgroup_exit(struct task_struct *tsk) spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); cset->nr_tasks--; + + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f1b87330bee..80aa3f027ac3 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v) css = cset->subsys[ss->id]; if (!css) continue; - seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, - (unsigned long)css, css->id); + seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, + css, css->id); } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); @@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) if (css->parent) snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", css->parent->id); - seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, 
ss->name, - (unsigned long)css, css->id, + seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name, + css, css->id, atomic_read(&css->online_cnt), pbuf); } diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 08236798d173..8cf010680678 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -1,481 +1,314 @@ -/* - * cgroup_freezer.c - control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater <clg@fr.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include <linux/export.h> -#include <linux/slab.h> +//SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/uaccess.h> -#include <linux/freezer.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> - -/* - * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is - * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared - * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING - * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of - * its ancestors has FREEZING_SELF set. 
- */ -enum freezer_state_flags { - CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ - CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ - CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ - CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/signal.h> - /* mask for all FREEZING flags */ - CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, -}; +#include "cgroup-internal.h" -struct freezer { - struct cgroup_subsys_state css; - unsigned int state; -}; +#include <trace/events/cgroup.h> -static DEFINE_MUTEX(freezer_mutex); - -static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +/* + * Propagate the cgroup frozen state upwards by the cgroup tree. + */ +static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) { - return css ? container_of(css, struct freezer, css) : NULL; -} + int desc = 1; -static inline struct freezer *task_freezer(struct task_struct *task) -{ - return css_freezer(task_css(task, freezer_cgrp_id)); + /* + * If the new state is frozen, some freezing ancestor cgroups may change + * their state too, depending on if all their descendants are frozen. + * + * Otherwise, all ancestor cgroups are forced into the non-frozen state. 
+ */ + while ((cgrp = cgroup_parent(cgrp))) { + if (frozen) { + cgrp->freezer.nr_frozen_descendants += desc; + if (!test_bit(CGRP_FROZEN, &cgrp->flags) && + test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_descendants == + cgrp->nr_descendants) { + set_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); + desc++; + } + } else { + cgrp->freezer.nr_frozen_descendants -= desc; + if (test_bit(CGRP_FROZEN, &cgrp->flags)) { + clear_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); + desc++; + } + } + } } -static struct freezer *parent_freezer(struct freezer *freezer) +/* + * Revisit the cgroup frozen state. + * Checks if the cgroup is really frozen and perform all state transitions. + */ +void cgroup_update_frozen(struct cgroup *cgrp) { - return css_freezer(freezer->css.parent); -} + bool frozen; -bool cgroup_freezing(struct task_struct *task) -{ - bool ret; + lockdep_assert_held(&css_set_lock); - rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; - rcu_read_unlock(); + /* + * If the cgroup has to be frozen (CGRP_FREEZE bit set), + * and all tasks are frozen and/or stopped, let's consider + * the cgroup frozen. Otherwise it's not frozen. + */ + frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); - return ret; -} + if (frozen) { + /* Already there? */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + return; -static const char *freezer_state_strs(unsigned int state) -{ - if (state & CGROUP_FROZEN) - return "FROZEN"; - if (state & CGROUP_FREEZING) - return "FREEZING"; - return "THAWED"; -}; - -static struct cgroup_subsys_state * -freezer_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct freezer *freezer; + set_bit(CGRP_FROZEN, &cgrp->flags); + } else { + /* Already there? 
*/ + if (!test_bit(CGRP_FROZEN, &cgrp->flags)) + return; - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); - if (!freezer) - return ERR_PTR(-ENOMEM); + clear_bit(CGRP_FROZEN, &cgrp->flags); + } + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); - return &freezer->css; + /* Update the state of ancestor cgroups. */ + cgroup_propagate_frozen(cgrp, frozen); } -/** - * freezer_css_online - commit creation of a freezer css - * @css: css being created - * - * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. +/* + * Increment cgroup's nr_frozen_tasks. */ -static int freezer_css_online(struct cgroup_subsys_state *css) +static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) { - struct freezer *freezer = css_freezer(css); - struct freezer *parent = parent_freezer(freezer); - - mutex_lock(&freezer_mutex); - - freezer->state |= CGROUP_FREEZER_ONLINE; - - if (parent && (parent->state & CGROUP_FREEZING)) { - freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); - } - - mutex_unlock(&freezer_mutex); - return 0; + cgrp->freezer.nr_frozen_tasks++; } -/** - * freezer_css_offline - initiate destruction of a freezer css - * @css: css being destroyed - * - * @css is going away. Mark it dead and decrement system_freezing_count if - * it was holding one. +/* + * Decrement cgroup's nr_frozen_tasks. 
*/ -static void freezer_css_offline(struct cgroup_subsys_state *css) +static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) { - struct freezer *freezer = css_freezer(css); - - mutex_lock(&freezer_mutex); - - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - - freezer->state = 0; - - mutex_unlock(&freezer_mutex); + cgrp->freezer.nr_frozen_tasks--; + WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); } -static void freezer_css_free(struct cgroup_subsys_state *css) +/* + * Enter frozen/stopped state, if not yet there. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + */ +void cgroup_enter_frozen(void) { - kfree(css_freezer(css)); + struct cgroup *cgrp; + + if (current->frozen) + return; + + spin_lock_irq(&css_set_lock); + current->frozen = true; + cgrp = task_dfl_cgroup(current); + cgroup_inc_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); } /* - * Tasks can be migrated into a different freezer anytime regardless of its - * current state. freezer_attach() is responsible for making new tasks - * conform to the current state. + * Conditionally leave frozen/stopped state. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. * - * Freezer state changes and task migration are synchronized via - * @freezer->lock. freezer_attach() makes the new tasks conform to the - * current state and all following state changes can see the new tasks. + * If always_leave is not set, and the cgroup is freezing, + * we're racing with the cgroup freezing. In this case, we don't + * drop the frozen counter to avoid a transient switch to + * the unfrozen state. */ -static void freezer_attach(struct cgroup_taskset *tset) +void cgroup_leave_frozen(bool always_leave) { - struct task_struct *task; - struct cgroup_subsys_state *new_css; - - mutex_lock(&freezer_mutex); - - /* - * Make the new tasks conform to the current state of @new_css. 
- * For simplicity, when migrating any task to a FROZEN cgroup, we - * revert it to FREEZING and let update_if_frozen() determine the - * correct state later. - * - * Tasks in @tset are on @new_css but may not conform to its - * current state before executing the following - !frozen tasks may - * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. - */ - cgroup_taskset_for_each(task, new_css, tset) { - struct freezer *freezer = css_freezer(new_css); - - if (!(freezer->state & CGROUP_FREEZING)) { - __thaw_task(task); - } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ - while (freezer && (freezer->state & CGROUP_FROZEN)) { - freezer->state &= ~CGROUP_FROZEN; - freezer = parent_freezer(freezer); - } - } + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + cgrp = task_dfl_cgroup(current); + if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + WARN_ON_ONCE(!current->frozen); + current->frozen = false; + } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { + spin_lock(&current->sighand->siglock); + current->jobctl |= JOBCTL_TRAP_FREEZE; + set_thread_flag(TIF_SIGPENDING); + spin_unlock(&current->sighand->siglock); } - - mutex_unlock(&freezer_mutex); + spin_unlock_irq(&css_set_lock); } -/** - * freezer_fork - cgroup post fork callback - * @task: a task which has just been forked - * - * @task has just been created and should conform to the current state of - * the cgroup_freezer it belongs to. This function may race against - * freezer_attach(). Losing to freezer_attach() means that we don't have - * to do anything as freezer_attach() will put @task into the appropriate - * state. +/* + * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE + * jobctl bit. 
*/ -static void freezer_fork(struct task_struct *task) +static void cgroup_freeze_task(struct task_struct *task, bool freeze) { - struct freezer *freezer; + unsigned long flags; - /* - * The root cgroup is non-freezable, so we can skip locking the - * freezer. This is safe regardless of race with task migration. - * If we didn't race or won, s |