From 387ad9674b0013c8756ad20d854ff005b0c313ad Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 20 Feb 2017 12:19:00 +0200 Subject: kernel: convert cgroup_namespace.count from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- kernel/cgroup/namespace.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0125589c7428..8ee78688e36d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = { .counter = 2, }, + .count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 96d38dab6fb2..66129eb4371d 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) kfree(new_ns); return ERR_PTR(ret); } - atomic_set(&new_ns->count, 1); + refcount_set(&new_ns->count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } -- cgit v1.2.3 From 4b9502e63b5e2b1b5ef491919d3219b9440fe0b3 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Wed, 8 Mar 2017 10:00:40 +0200 Subject: kernel: convert css_set.refcount from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 5 +++-- kernel/cgroup/cgroup-v1.c | 4 ++-- kernel/cgroup/cgroup.c | 6 +++--- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 9203bfb05603..4567f12b02e9 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -5,6 +5,7 @@ #include #include #include +#include /* * A cgroup can be associated with multiple css_sets as different tasks may @@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset) * can see it. Similar to atomic_dec_and_lock(), but for an * rwlock */ - if (atomic_add_unless(&cset->refcount, -1, 1)) + if (refcount_dec_not_one(&cset->refcount)) return; spin_lock_irqsave(&css_set_lock, flags); @@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset) */ static inline void get_css_set(struct css_set *cset) { - atomic_inc(&cset->refcount); + refcount_inc(&cset->refcount); } bool cgroup_ssid_enabled(int ssid); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 56eba9caa632..c4a68c438fde 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp) spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += atomic_read(&link->cset->refcount); + count += refcount_read(&link->cset->refcount); spin_unlock_irq(&css_set_lock); return count; } @@ -1286,7 +1286,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, u64 count; rcu_read_lock(); - count = atomic_read(&task_css_set(current)->refcount); + count = refcount_read(&task_css_set(current)->refcount); rcu_read_unlock(); return count; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8ee78688e36d..b1cc1c306668 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -554,7 +554,7 @@ EXPORT_SYMBOL_GPL(of_css); * haven't been created. */ struct css_set init_css_set = { - .refcount = ATOMIC_INIT(1), + .refcount = REFCOUNT_INIT(1), .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), @@ -724,7 +724,7 @@ void put_css_set_locked(struct css_set *cset) lockdep_assert_held(&css_set_lock); - if (!atomic_dec_and_test(&cset->refcount)) + if (!refcount_dec_and_test(&cset->refcount)) return; /* This css_set is dead. unlink it and release cgroup and css refs */ @@ -977,7 +977,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, return NULL; } - atomic_set(&cset->refcount, 1); + refcount_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->task_iters); -- cgit v1.2.3 From 75fa8e5d3b1f3f7bdafcde12e973b0faaf13f1c4 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sun, 26 Mar 2017 18:24:06 +0200 Subject: cgroup: switch to BUG_ON() Use BUG_ON() rather than an explicit if followed by BUG() for improved readability and also consistency. Signed-off-by: Nicholas Mc Guire Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f41292be0fb..8b84db2d2d17 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2121,10 +2121,8 @@ int __init cpuset_init(void) { int err = 0; - if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) - BUG(); - if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) - BUG(); + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); @@ -2139,8 +2137,7 @@ int __init cpuset_init(void) if (err < 0) return err; - if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) - BUG(); + BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); return 0; } -- cgit v1.2.3 From 30e03acda5fd9c77ec9bf8b3c5def9540c6b0486 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sun, 9 Apr 2017 07:36:14 +0600 Subject: cpuset: Remove cpuset_update_active_cpus()'s parameter. In cpuset_update_active_cpus(), cpu_online isn't used anymore. Remove it. Signed-off-by: Rakib Mullick Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- kernel/sched/core.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 8b84db2d2d17..f6501f4f6040 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2351,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rebuild_sched_domains(); } -void cpuset_update_active_cpus(bool cpu_online) +void cpuset_update_active_cpus(void) { /* * We're inside cpu hotplug critical region which usually nests diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 956383844116..e62802c044e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5727,7 +5727,7 @@ static void cpuset_cpu_active(void) * cpuset configurations. */ } - cpuset_update_active_cpus(true); + cpuset_update_active_cpus(); } static int cpuset_cpu_inactive(unsigned int cpu) @@ -5750,7 +5750,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) if (overflow) return -EBUSY; - cpuset_update_active_cpus(false); + cpuset_update_active_cpus(); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); -- cgit v1.2.3 From a590b90d472f2c176c140576ee3ab44df7f67839 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Apr 2017 15:14:55 -0400 Subject: cgroup: fix spurious warnings on cgroup_is_dead() from cgroup_sk_alloc() cgroup_get() expected to be called only on live cgroups and triggers warning on a dead cgroup; however, cgroup_sk_alloc() may be called while cloning a socket which is left in an empty and removed cgroup and thus may legitimately duplicate its reference on a dead cgroup. This currently triggers the following warning spuriously. WARNING: CPU: 14 PID: 0 at kernel/cgroup.c:490 cgroup_get+0x55/0x60 ... [] __warn+0xd3/0xf0 [] warn_slowpath_null+0x1e/0x20 [] cgroup_get+0x55/0x60 [] cgroup_sk_alloc+0x51/0xe0 [] sk_clone_lock+0x2db/0x390 [] inet_csk_clone_lock+0x16/0xc0 [] tcp_create_openreq_child+0x23/0x4b0 [] tcp_v6_syn_recv_sock+0x91/0x670 [] tcp_check_req+0x3a6/0x4e0 [] tcp_v6_rcv+0x693/0xa00 [] ip6_input_finish+0x59/0x3e0 [] ip6_input+0x32/0xb0 [] ip6_rcv_finish+0x57/0xa0 [] ipv6_rcv+0x318/0x4d0 [] __netif_receive_skb_core+0x2d7/0x9a0 [] __netif_receive_skb+0x16/0x70 [] netif_receive_skb_internal+0x23/0x80 [] napi_gro_frags+0x208/0x270 [] mlx4_en_process_rx_cq+0x74c/0xf40 [] mlx4_en_poll_rx_cq+0x30/0x90 [] net_rx_action+0x210/0x350 [] __do_softirq+0x106/0x2c7 [] irq_exit+0x9d/0xa0 [] do_IRQ+0x54/0xd0 [] common_interrupt+0x7f/0x7f [] cpuidle_enter+0x17/0x20 [] cpu_startup_entry+0x2a9/0x2f0 [] start_secondary+0xf1/0x100 This patch renames the existing cgroup_get() with the dead cgroup warning to cgroup_get_live() after cgroup_kn_lock_live() and introduces the new cgroup_get() which doesn't check whether the cgroup is live or dead. All existing cgroup_get() users except for cgroup_sk_alloc() are converted to use cgroup_get_live(). Fixes: d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") Cc: stable@vger.kernel.org # v4.5+ Cc: Johannes Weiner Reported-by: Chris Mason Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b1cc1c306668..10951d5e35d2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -437,6 +437,11 @@ out_unlock: } static void cgroup_get(struct cgroup *cgrp) +{ + css_get(&cgrp->self); +} + +static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); @@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) - cgroup_get(cgrp); + cgroup_get_live(cgrp); } /** @@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return ERR_PTR(-EINVAL); } cgrp_dfl_visible = true; - cgroup_get(&cgrp_dfl_root.cgrp); + cgroup_get_live(&cgrp_dfl_root.cgrp); dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); @@ -2575,7 +2580,7 @@ restart: if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; - cgroup_get(dsct); + cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); @@ -3946,7 +3951,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, { lockdep_assert_held(&cgroup_mutex); - cgroup_get(cgrp); + cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; @@ -4122,7 +4127,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); - cgroup_get(parent); + cgroup_get_live(parent); /* * @cgrp is now fully operational. If something fails after this @@ -4946,7 +4951,7 @@ struct cgroup *cgroup_get_from_path(const char *path) if (kn) { if (kernfs_type(kn) == KERNFS_DIR) { cgrp = kn->priv; - cgroup_get(cgrp); + cgroup_get_live(cgrp); } else { cgrp = ERR_PTR(-ENOTDIR); } @@ -5026,6 +5031,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) /* Socket clone path */ if (skcd->val) { + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ cgroup_get(sock_cgroup_ptr(skcd)); return; } -- cgit v1.2.3 From 9732adc5d6520238223df16630f1f8cad2269317 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Wed, 19 Apr 2017 10:15:59 +0800 Subject: cgroup: avoid attaching a cgroup root to two different superblocks, take 2 Commit bfb0b80db5f9 ("cgroup: avoid attaching a cgroup root to two different superblocks") is broken. Now we try to fix the race by delaying the initialization of cgroup root refcnt until a superblock has been allocated. Reported-by: Dmitry Vyukov Reported-by: Andrei Vagin Tested-by: Andrei Vagin Signed-off-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 2 +- kernel/cgroup/cgroup-v1.c | 16 +++++++++++++++- kernel/cgroup/cgroup.c | 8 ++++---- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 4567f12b02e9..00f4d6bf048f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -164,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_root *root, unsigned long magic, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c4a68c438fde..a478995e7ed3 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, struct cgroup_subsys *ss; struct dentry *dentry; int i, ret; + bool new_root = false; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, ret = -ENOMEM; goto out_unlock; } + new_root = true; init_cgroup_root(root, &opts); - ret = cgroup_setup_root(root, opts.subsys_mask); + ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); if (ret) cgroup_free_root(root); @@ -1200,6 +1202,18 @@ out_free: dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, CGROUP_SUPER_MAGIC, ns); + /* + * There's a race window after we release cgroup_mutex and before + * allocating a superblock. Make sure a concurrent process won't + * be able to re-use the root during this window by delaying the + * initialization of root refcnt. + */ + if (new_root) { + mutex_lock(&cgroup_mutex); + percpu_ref_reinit(&root->cgrp.self.refcnt); + mutex_unlock(&cgroup_mutex); + } + /* * If @pinned_sb, we're reusing an existing root and holding an * extra ref on its sb. Mount is complete. Put the extra ref. diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 10951d5e35d2..38d9386f46e7 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1645,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1661,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root_cgrp->id = ret; root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, - GFP_KERNEL); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, + ref_flags, GFP_KERNEL); if (ret) goto out; @@ -4517,7 +4517,7 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); mutex_unlock(&cgroup_mutex); -- cgit v1.2.3 From 310b4816a5d8082416b4ab83e5a7b3cb92883a4d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 May 2017 15:24:14 -0400 Subject: cgroup: mark cgroup_get() with __maybe_unused a590b90d472f ("cgroup: fix spurious warnings on cgroup_is_dead() from cgroup_sk_alloc()") converted most cgroup_get() usages to cgroup_get_live() leaving cgroup_sk_alloc() the sole user of cgroup_get(). When !CONFIG_SOCK_CGROUP_DATA, this ends up triggering unused warning for cgroup_get(). Silence the warning by adding __maybe_unused to cgroup_get(). Reported-by: Stephen Rothwell Link: http://lkml.kernel.org/r/20170501145340.17e8ef86@canb.auug.org.au Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 38d9386f46e7..f2bcc11b85be 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -436,7 +436,7 @@ out_unlock: return css; } -static void cgroup_get(struct cgroup *cgrp) +static void __maybe_unused cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } -- cgit v1.2.3