summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorAlexander Graf <agraf@suse.de>2013-08-29 00:41:59 +0200
committerAlexander Graf <agraf@suse.de>2013-08-29 00:41:59 +0200
commitbf550fc93d9855872a95e69e4002256110d89858 (patch)
tree10876bb4304bffe54c4160a132e7b8de6577ac4e /mm
parent7e48c101e0c53e6095c5f4f5e63d14df50aae8fc (diff)
parentcc2df20c7c4ce594c3e17e9cc260c330646012c8 (diff)
Merge remote-tracking branch 'origin/next' into kvm-ppc-next
Conflicts: mm/Kconfig CMA DMA split and ZSWAP introduction were conflicting, fix up manually.
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig42
-rw-r--r--mm/Makefile2
-rw-r--r--mm/backing-dev.c5
-rw-r--r--mm/bootmem.c39
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/huge_memory.c30
-rw-r--r--mm/hugetlb.c4
-rw-r--r--mm/internal.h5
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memcontrol.c363
-rw-r--r--mm/memory-failure.c22
-rw-r--r--mm/memory.c15
-rw-r--r--mm/memory_hotplug.c139
-rw-r--r--mm/mm_init.c47
-rw-r--r--mm/mmap.c40
-rw-r--r--mm/mmu_notifier.c2
-rw-r--r--mm/mremap.c20
-rw-r--r--mm/nobootmem.c35
-rw-r--r--mm/nommu.c10
-rw-r--r--mm/page_alloc.c384
-rw-r--r--mm/page_io.c50
-rw-r--r--mm/pgtable-generic.c5
-rw-r--r--mm/rmap.c9
-rw-r--r--mm/shmem.c16
-rw-r--r--mm/slab.c51
-rw-r--r--mm/slab.h3
-rw-r--r--mm/slab_common.c18
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c38
-rw-r--r--mm/sparse.c8
-rw-r--r--mm/swap.c106
-rw-r--r--mm/swapfile.c55
-rw-r--r--mm/util.c1
-rw-r--r--mm/vmalloc.c164
-rw-r--r--mm/vmscan.c605
-rw-r--r--mm/zbud.c527
-rw-r--r--mm/zswap.c943
37 files changed, 2862 insertions, 953 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 81bcb4bd422d..6cdd27043303 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -501,3 +501,45 @@ config CMA_DEBUG
messages for every CMA call as well as various messages while
processing calls such as dma_alloc_from_contiguous().
This option does not affect warning and error messages.
+
+config ZBUD
+ tristate
+ default n
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to two compressed pages per physical
+ page. While this design limits storage density, it has simple and
+ deterministic reclaim properties that make it preferable to a higher
+ density approach when reclaim will be used.
+
+config ZSWAP
+ bool "Compressed cache for swap pages (EXPERIMENTAL)"
+ depends on FRONTSWAP && CRYPTO=y
+ select CRYPTO_LZO
+ select ZBUD
+ default n
+ help
+ A lightweight compressed cache for swap pages. It takes
+ pages that are in the process of being swapped out and attempts to
+ compress them into a dynamically allocated RAM-based memory pool.
+ This can result in a significant I/O reduction on swap device and,
+ in the case where decompressing from RAM is faster that swap device
+ reads, can also improve workload performance.
+
+ This is marked experimental because it is a new feature (as of
+ v3.11) that interacts heavily with memory reclaim. While these
+ interactions don't cause any known issues on simple memory setups,
+ they have not be fully explored on the large set of potential
+ configurations and workloads that exist.
+
+config MEM_SOFT_DIRTY
+ bool "Track memory changes"
+ depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+ select PROC_PAGE_MONITOR
+ help
+ This option enables memory changes tracking by introducing a
+ soft-dirty bit on pte-s. This bit it set when someone writes
+ into a page just as regular dirty bit, but unlike the latter
+ it can be cleared by hands.
+
+ See Documentation/vm/soft-dirty.txt for more details.
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb9345f..f00803386a67 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
obj-$(CONFIG_FRONTSWAP) += frontswap.o
+obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
@@ -58,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZBUD) += zbud.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..d014ee5fcbbd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy);
int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
unsigned int cap)
{
- char tmp[32];
int err;
bdi->name = name;
@@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
if (err)
return err;
- sprintf(tmp, "%.28s%s", name, "-%d");
- err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+ err = bdi_register(bdi, NULL, "%.28s-%ld", name,
+ atomic_long_inc_return(&bdi_seq));
if (err) {
bdi_destroy(bdi);
return err;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
-static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
{
struct zone *z;
- /*
- * In free_area_init_core(), highmem zone's managed_pages is set to
- * present_pages, and bootmem allocator doesn't allocate from highmem
- * zones. So there's no need to recalculate managed_pages because all
- * highmem pages will be managed by the buddy system. Here highmem
- * zone also includes highmem movable zone.
- */
+ if (reset_managed_pages_done)
+ return;
+
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- if (!is_highmem(z))
- z->managed_pages = 0;
+ z->managed_pages = 0;
}
-/**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+void __init reset_all_zones_managed_pages(void)
{
- register_page_bootmem_info_node(pgdat);
- reset_node_lowmem_managed_pages(pgdat);
- return free_all_bootmem_core(pgdat->bdata);
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ reset_node_managed_pages(pgdat);
+ reset_managed_pages_done = 1;
}
/**
@@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void)
{
unsigned long total_pages = 0;
bootmem_data_t *bdata;
- struct pglist_data *pgdat;
- for_each_online_pgdat(pgdat)
- reset_node_lowmem_managed_pages(pgdat);
+ reset_all_zones_managed_pages();
list_for_each_entry(bdata, &bdata_list, list)
total_pages += free_all_bootmem_core(bdata);
+ totalram_pages += total_pages;
+
return total_pages;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 7905fe721aa8..4b51ac1acae7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
- if (VM_RandomReadHint(vma))
+ if (vma->vm_flags & VM_RAND_READ)
return;
if (!ra->ra_pages)
return;
- if (VM_SequentialReadHint(vma)) {
+ if (vma->vm_flags & VM_SEQ_READ) {
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
return;
@@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
- if (VM_RandomReadHint(vma))
+ if (vma->vm_flags & VM_RAND_READ)
return;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..243e710c6039 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pmd_t entry;
entry = mk_huge_pmd(page, vma);
page_add_new_anon_rmap(page, vma, haddr);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- pgtable_trans_huge_deposit(mm, pgtable);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm->nr_ptes++;
spin_unlock(&mm->page_table_lock);
@@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- pgtable_trans_huge_deposit(mm, pgtable);
mm->nr_ptes++;
return true;
}
@@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- pgtable_trans_huge_deposit(dst_mm, pgtable);
dst_mm->nr_ptes++;
ret = 0;
@@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
* young bit, instead of the current set_pmd_at.
*/
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
- set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+ if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+ pmd, _pmd, 1))
+ update_mmu_cache_pmd(vma, addr, pmd);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
@@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
- pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pmdp_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pmdp related
+ * operations.
+ */
orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
tlb->mm->nr_ptes--;
spin_unlock(&tlb->mm->page_table_lock);
@@ -1429,7 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
if (ret == 1) {
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
- set_pmd_at(mm, new_addr, new_pmd, pmd);
+ set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
spin_unlock(&mm->page_table_lock);
}
out:
@@ -1691,7 +1699,7 @@ static int __split_huge_page_map(struct page *page,
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
haddr = address;
@@ -2359,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- pgtable_trans_huge_deposit(mm, pgtable);
spin_unlock(&mm->page_table_lock);
*hpage = NULL;
@@ -2667,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = pgtable_trans_huge_withdraw(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aed085ad11a8..83aff0a4d093 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
hstate = hstate_vma(vma);
- return 1UL << (hstate->order + PAGE_SHIFT);
+ return 1UL << huge_page_shift(hstate);
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
@@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void)
* side-effects, like CommitLimit going negative.
*/
if (h->order > (MAX_ORDER - 1))
- totalram_pages += 1 << h->order;
+ adjust_managed_page_count(page, 1 << h->order);
}
}
diff --git a/mm/internal.h b/mm/internal.h
index 8562de0a5197..4390ac6c106e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
-static inline void __put_page(struct page *page)
-{
- atomic_dec(&page->_count);
-}
-
static inline void __get_page_tail_foll(struct page *page,
bool get_page_head)
{
diff --git a/mm/memblock.c b/mm/memblock.c
index c5fad932fa51..a847bfe6f3ba 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
/**
* __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %MAX_NUMNODES for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..d12ca6f3c293 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -187,10 +187,6 @@ struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
-struct mem_cgroup_lru_info {
- struct mem_cgroup_per_node *nodeinfo[0];
-};
-
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
@@ -267,28 +263,10 @@ struct mem_cgroup {
/* vmpressure notifications */
struct vmpressure vmpressure;
- union {
- /*
- * the counter to account for mem+swap usage.
- */
- struct res_counter memsw;
-
- /*
- * rcu_freeing is used only when freeing struct mem_cgroup,
- * so put it into a union to avoid wasting more memory.
- * It must be disjoint from the css field. It could be
- * in a union with the res field, but res plays a much
- * larger part in mem_cgroup life than memsw, and might
- * be of interest, even at time of free, when debugging.
- * So share rcu_head with the less interesting memsw.
- */
- struct rcu_head rcu_freeing;
- /*
- * We also need some space for a worker in deferred freeing.
- * By the time we call it, rcu_freeing is no longer in use.
- */
- struct work_struct work_freeing;
- };
+ /*
+ * the counter to account for mem+swap usage.
+ */
+ struct res_counter memsw;
/*
* the counter to account for kernel memory usage.
@@ -303,8 +281,6 @@ struct mem_cgroup {
bool oom_lock;
atomic_t under_oom;
- atomic_t refcnt;
-
int swappiness;
/* OOM-Killer disable */
int oom_kill_disable;
@@ -366,14 +342,8 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
- /*
- * Per cgroup active and inactive list, similar to the
- * per zone LRU lists.
- *
- * WARNING: This has to be the last element of the struct. Don't
- * add new fields after this point.
- */
- struct mem_cgroup_lru_info info;
+ struct mem_cgroup_per_node *nodeinfo[0];
+ /* WARNING: nodeinfo must be the last member here */
};
static size_t memcg_size(void)
@@ -416,6 +386,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
+ /*
+ * Our caller must use css_get() first, because memcg_uncharge_kmem()
+ * will call css_put() if it sees the memcg is dead.
+ */
+ smp_wmb();
if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}
@@ -508,9 +483,6 @@ enum res_type {
*/
static DEFINE_MUTEX(memcg_create_mutex);
-static void mem_cgroup_get(struct mem_cgroup *memcg);
-static void mem_cgroup_put(struct mem_cgroup *memcg);
-
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
@@ -561,15 +533,15 @@ void sock_update_memcg(struct sock *sk)
*/
if (sk->sk_cgrp) {
BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
- mem_cgroup_get(sk->sk_cgrp->memcg);
+ css_get(&sk->sk_cgrp->memcg->css);
return;
}
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
- mem_cgroup_get(memcg);
+ if (!mem_cgroup_is_root(memcg) &&
+ memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
rcu_read_unlock();
@@ -583,7 +555,7 @@ void sock_release_memcg(struct sock *sk)
struct mem_cgroup *memcg;
WARN_ON(!sk->sk_cgrp->memcg);
memcg = sk->sk_cgrp->memcg;
- mem_cgroup_put(memcg);
+ css_put(&sk->sk_cgrp->memcg->css);
}
}
@@ -683,7 +655,7 @@ static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
VM_BUG_ON((unsigned)nid >= nr_node_ids);
- return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
+ return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
@@ -1148,6 +1120,58 @@ skip_node:
return NULL;
}
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+{
+ /*
+ * When a group in the hierarchy below root is destroyed, the
+ * hierarchy iterator can no longer be trusted since it might
+ * have pointed to the destroyed group. Invalidate it.
+ */
+ atomic_inc(&root->dead_count);
+}
+
+static struct mem_cgroup *
+mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
+ struct mem_cgroup *root,
+ int *sequence)
+{
+ struct mem_cgroup *position = NULL;
+ /*
+ * A cgroup destruction happens in two stages: offlining and
+ * release. They are separated by a RCU grace period.
+ *
+ * If the iterator is valid, we may still race with an
+ * offlining. The RCU lock ensures the object won't be
+ * released, tryget will fail if we lost the race.
+ */
+ *sequence = atomic_read(&root->dead_count);
+ if (iter->last_dead_count == *sequence) {
+ smp_rmb();
+ position = iter->last_visited;
+ if (position && !css_tryget(&position->css))
+ position = NULL;
+ }
+ return position;
+}
+
+static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
+ struct mem_cgroup *last_visited,
+ struct mem_cgroup *new_position,
+ int sequence)
+{
+ if (last_visited)
+ css_put(&last_visited->css);
+ /*
+ * We store the sequence count from the time @last_visited was
+ * loaded successfully instead of rereading it here so that we
+ * don't lose destruction events in between. We could have
+ * raced with the destruction of @new_position after all.
+ */
+ iter->last_visited = new_position;
+ smp_wmb();
+ iter->last_dead_count = sequence;
+}
+
/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
@@ -1171,7 +1195,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
{
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *last_visited = NULL;
- unsigned long uninitialized_var(dead_count);
if (mem_cgroup_disabled())
return NULL;
@@ -1191,6 +1214,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
rcu_read_lock();
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+ int uninitialized_var(seq);
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
@@ -1204,37 +1228,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
goto out_unlock;
}
- /*
- * If the dead_count mismatches, a destruction
- * has happened or is happening concurrently.
- * If the dead_count matches, a destruction
- * might still happen concurrently, but since
- * we checked under RCU, that destruction
- * won't free the object until we release the
- * RCU reader lock. Thus, the dead_count
- * check verifies the pointer is still valid,
- * css_tryget() verifies the cgroup pointed to
- * is alive.
- */
- dead_count = atomic_read(&root->dead_count);
- if (dead_count == iter->last_dead_count) {
- smp_rmb();
- last_visited = iter->last_visited;
- if (last_visited &&
- !css_tryget(&last_visited->css))
- last_visited = NULL;
- }
+ last_visited = mem_cgroup_iter_load(iter, root, &seq);
}
memcg = __mem_cgroup_iter_next(root, last_visited);
if (reclaim) {
- if (last_visited)
- css_put(&last_visited->css);
-
- iter->last_visited = memcg;
- smp_wmb();
- iter->last_dead_count = dead_count;
+ mem_cgroup_iter_update(iter, last_visited, memcg, seq);
if (!memcg)
iter->generation++;
@@ -1448,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return ret;
}
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
+bool task_in_mem_cgroup(struct task_struct *task,
+ const struct mem_cgroup *memcg)
{
- int ret;
struct mem_cgroup *curr = NULL;
struct task_struct *p;
+ bool ret;
p = find_lock_task_mm(task);
if (p) {
@@ -1464,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
* killer still needs to detect if they have already been oom
* killed to prevent needlessly killing additional tasks.
*/
- task_lock(task);
+ rcu_read_lock();
curr = mem_cgroup_from_task(task);
if (curr)
css_get(&curr->css);
- task_unlock(task);
+ rcu_read_unlock();
}
if (!curr)
- return 0;
+ return false;
/*
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
@@ -3031,8 +3032,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
if (res_counter_uncharge(&memcg->kmem, size))
return;
+ /*
+ * Releases a reference taken in kmem_cgroup_css_offline in case
+ * this last uncharge is racing with the offlining code or it is
+ * outliving the memcg existence.
+ *
+ * The memory barrier imposed by test&clear is paired with the
+ * explicit one in memcg_kmem_mark_dead().
+ */
if (memcg_kmem_test_and_clear_dead(memcg))
- mem_cgroup_put(memcg);
+ css_put(&memcg->css);
}
void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
@@ -3223,7 +3232,7 @@ void memcg_release_cache(struct kmem_cache *s)
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
- mem_cgroup_put(memcg);
+ css_put(&memcg->css);
out:
kfree(s->memcg_params);
}
@@ -3383,16 +3392,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
mutex_lock(&memcg_cache_mutex);
new_cachep = cachep->memcg_params->memcg_caches[idx];
- if (new_cachep)
+ if (new_cachep) {
+ css_put(&memcg->css);
goto out;
+ }
new_cachep = kmem_cache_dup(memcg, cachep);
if (new_cachep == NULL) {
new_cachep = cachep;
+ css_put(&memcg->css);
goto out;
}
- mem_cgroup_get(memcg);
atomic_set(&new_cachep->memcg_params->nr_pages , 0);
cachep->memcg_params->memcg_caches[idx] = new_cachep;
@@ -3480,8 +3491,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
cw = container_of(w, struct create_work, work);
memcg_create_kmem_cache(cw->memcg, cw->cachep);
- /* Drop the reference gotten when we enqueued. */
- css_put(&cw->memcg->css);
kfree(cw);
}
@@ -3618,6 +3627,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
int ret;
*_memcg = NULL;
+
+ /*
+ * Disabling accounting is only relevant for some specific memcg
+ * internal allocations. Therefore we would initially not have such
+ * check here, since direct calls to the page allocator that are marked
+ * with GFP_KMEMCG only happen outside memcg core. We are mostly
+ * concerned with cache allocations, and by having this test at
+ * memcg_kmem_get_cache, we are already able to relay the allocation to
+ * the root cache and bypass the memcg cache altogether.
+ *
+ * There is one exception, though: the SLUB allocator does not create
+ * large order caches, but rather service large kmallocs directly from
+ * the page allocator. Therefore, the following sequence when backed by
+ * the SLUB allocator:
+ *
+ * memcg_stop_kmem_account();
+ * kmalloc(<large_number>)
+ * memcg_resume_kmem_account();
+ *
+ * would effectively ignore the fact that we should skip accounting,
+ * since it will drive us directly to this function without passing
+ * through the cache selector memcg_kmem_get_cache. Such large
+ * allocations are extremely rare but can happen, for instance, for the
+ * cache arrays. We bring this test here.
+ */
+ if (!current->mm || current->memcg_kmem_skip_account)
+ return true;
+
memcg = try_get_mem_cgroup_from_mm(current->mm);
/*
@@ -4171,12 +4208,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
unlock_page_cgroup(pc);
/*
* even after unlock, we have memcg->res.usage here and this memcg
- * will never be freed.
+ * will never be freed, so it's safe to call css_get().
*/
memcg_check_events(memcg, page);
if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
mem_cgroup_swap_statistics(memcg, true);
- mem_cgroup_get(memcg);
+ css_get(&memcg->css);
}
/*
* Migration does not charge the res_counter for the
@@ -4288,7 +4325,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
/*
* record memcg information, if swapout && memcg != NULL,
- * mem_cgroup_get() was called in uncharge().
+ * css_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
swap_cgroup_record(ent, css_id(&memcg->css));
@@ -4319,7 +4356,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
if (!mem_cgroup_is_root(memcg))
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
- mem_cgroup_put(memcg);
+ css_put(&memcg->css);
}
rcu_read_unlock();
}
@@ -4353,11 +4390,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
* This function is only called from task migration context now.
* It postpones res_counter and refcount handling till the end
* of task migration(mem_cgroup_clear_mc()) for performance
- * improvement. But we cannot postpone mem_cgroup_get(to)
- * because if the process that has been moved to @to does
- * swap-in, the refcount of @to might be decreased to 0.
+ * improvement. But we cannot postpone css_get(to) because if
+ * the process that has been moved to @to does swap-in, the
+ * refcount of @to might be decreased to 0.
+ *
+ * We are in attach() phase, so the cgroup is guaranteed to be
+ * alive, so we can just call css_get().
*/
- mem_cgroup_get(to);
+ css_get(&to->css);
return 0;
}
return -EINVAL;