summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/memcontrol.c1846
-rw-r--r--mm/memory.c28
-rw-r--r--mm/migrate.c42
-rw-r--r--mm/oom_kill.c10
-rw-r--r--mm/page_alloc.c8
-rw-r--r--mm/page_cgroup.c207
-rw-r--r--mm/shmem.c20
-rw-r--r--mm/swap.c33
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/swapfile.c24
-rw-r--r--mm/vmscan.c197
12 files changed, 1914 insertions, 507 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 2f55a1e2baf7..ceba0bd03662 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
VM_BUG_ON(!PageLocked(page));
error = mem_cgroup_cache_charge(page, current->mm,
- gfp_mask & ~__GFP_HIGHMEM);
+ gfp_mask & GFP_RECLAIM_MASK);
if (error)
goto out;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 51ee96545579..e2996b80601f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,11 +21,13 @@
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
+#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
@@ -34,12 +36,23 @@
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
+#include "internal.h"
#include <asm/uaccess.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */
+int do_swap_account __read_mostly;
+static int really_do_swap_account __initdata = 1; /* for remember boot option*/
+#else
+#define do_swap_account (0)
+#endif
+
+static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
+
/*
* Statistics for memory cgroup.
*/
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu {
} ____cacheline_aligned_in_smp;
struct mem_cgroup_stat {
- struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+ struct mem_cgroup_stat_cpu cpustat[0];
};
/*
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone {
/*
* spin_lock to protect the per cgroup LRU
*/
- spinlock_t lru_lock;
struct list_head lists[NR_LRU_LISTS];
unsigned long count[NR_LRU_LISTS];
+
+ struct zone_reclaim_stat reclaim_stat;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -122,44 +136,73 @@ struct mem_cgroup {
*/
struct res_counter res;
/*
+ * the counter to account for mem+swap usage.
+ */
+ struct res_counter memsw;
+ /*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
*/
struct mem_cgroup_lru_info info;
+ /*
+ protect against reclaim related member.
+ */
+ spinlock_t reclaim_param_lock;
+
int prev_priority; /* for recording reclaim priority */
+
+ /*
+ * While reclaiming in a hiearchy, we cache the last child we
+ * reclaimed from. Protected by hierarchy_mutex
+ */
+ struct mem_cgroup *last_scanned_child;
/*
- * statistics.
+ * Should the accounting and control be hierarchical, per subtree?
+ */
+ bool use_hierarchy;
+ unsigned long last_oom_jiffies;
+ atomic_t refcnt;
+
+ unsigned int swappiness;
+
+ /*
+ * statistics. This must be placed at the end of memcg.
*/
struct mem_cgroup_stat stat;
};
-static struct mem_cgroup init_mem_cgroup;
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
MEM_CGROUP_CHARGE_TYPE_MAPPED,
MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
+ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
NR_CHARGE_TYPE,
};
/* only for here (for easy reading.) */
#define PCGF_CACHE (1UL << PCG_CACHE)
#define PCGF_USED (1UL << PCG_USED)
-#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
#define PCGF_LOCK (1UL << PCG_LOCK)
-#define PCGF_FILE (1UL << PCG_FILE)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
- PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
- PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
- PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+ PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
+ PCGF_USED | PCGF_LOCK, /* Anon */
+ PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
0, /* FORCE */
};
-/*
- * Always modified under lru lock. Then, not necessary to preempt_disable()
- */
+/* for encoding cft->private value on file */
+#define _MEM (0)
+#define _MEMSWAP (1)
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val) ((val) & 0xffff)
+
+static void mem_cgroup_get(struct mem_cgroup *mem);
+static void mem_cgroup_put(struct mem_cgroup *mem);
+
static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
struct page_cgroup *pc,
bool charge)
@@ -167,10 +210,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
int val = (charge)? 1 : -1;
struct mem_cgroup_stat *stat = &mem->stat;
struct mem_cgroup_stat_cpu *cpustat;
+ int cpu = get_cpu();
- VM_BUG_ON(!irqs_disabled());
-
- cpustat = &stat->cpustat[smp_processor_id()];
+ cpustat = &stat->cpustat[cpu];
if (PageCgroupCache(pc))
__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
else
@@ -182,6 +224,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
else
__mem_cgroup_stat_add_safe(cpustat,
MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+ put_cpu();
}
static struct mem_cgroup_per_zone *
@@ -197,6 +240,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
int nid = page_cgroup_nid(pc);
int zid = page_cgroup_zid(pc);
+ if (!mem)
+ return NULL;
+
return mem_cgroup_zoneinfo(mem, nid, zid);
}
@@ -236,77 +282,152 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
struct mem_cgroup, css);
}
-static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
- struct page_cgroup *pc)
+static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
- int lru = LRU_BASE;
+ struct mem_cgroup *mem = NULL;
+ /*
+ * Because we have no locks, mm->owner's may be being moved to other
+ * cgroup. We use css_tryget() here even if this looks
+ * pessimistic (rather than adding locks here).
+ */
+ rcu_read_lock();
+ do {
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem))
+ break;
+ } while (!css_tryget(&mem->css));
+ rcu_read_unlock();
+ return mem;
+}
- if (PageCgroupUnevictable(pc))
- lru = LRU_UNEVICTABLE;
- else {
- if (PageCgroupActive(pc))
- lru += LRU_ACTIVE;
- if (PageCgroupFile(pc))
- lru += LRU_FILE;
- }
+static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
+{
+ if (!mem)
+ return true;
+ return css_is_removed(&mem->css);
+}
- MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+/*
+ * Following LRU functions are allowed to be used without PCG_LOCK.
+ * Operations are called by routine of global LRU independently from memcg.
+ * What we have to take care of here is validness of pc->mem_cgroup.
+ *
+ * Changes to pc->mem_cgroup happens when
+ * 1. charge
+ * 2. moving account
+ * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
+ * It is added to LRU before charge.
+ * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
+ * When moving account, the page is not on LRU. It's isolated.
+ */
- mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
- list_del(&pc->lru);
+void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup *mem;
+ struct mem_cgroup_per_zone *mz;
+
+ if (mem_cgroup_disabled())
+ return;
+ pc = lookup_page_cgroup(page);
+ /* can happen while we handle swapcache. */
+ if (list_empty(&pc->lru) || !pc->mem_cgroup)
+ return;
+ /*
+ * We don't check PCG_USED bit. It's cleared when the "page" is finally
+ * removed from global LRU.
+ */
+ mz = page_cgroup_zoneinfo(pc);
+ mem = pc->mem_cgroup;
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ list_del_init(&pc->lru);
+ return;
}
-static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
- struct page_cgroup *pc)
+void mem_cgroup_del_lru(struct page *page)
{
- int lru = LRU_BASE;
+ mem_cgroup_del_lru_list(page, page_lru(page));
+}
- if (PageCgroupUnevictable(pc))
- lru = LRU_UNEVICTABLE;
- else {
- if (PageCgroupActive(pc))
- lru += LRU_ACTIVE;
- if (PageCgroupFile(pc))
- lru += LRU_FILE;
- }
+void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct page_cgroup *pc;
- MEM_CGROUP_ZSTAT(mz, lru) += 1;
- list_add(&pc->lru, &mz->lists[lru]);
+ if (mem_cgroup_disabled())
+ return;
- mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
+ pc = lookup_page_cgroup(page);
+ smp_rmb();
+ /* unused page is not rotated. */
+ if (!PageCgroupUsed(pc))
+ return;
+ mz = page_cgroup_zoneinfo(pc);
+ list_move(&pc->lru, &mz->lists[lru]);
}
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
+void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
- struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
- int active = PageCgroupActive(pc);
- int file = PageCgroupFile(pc);
- int unevictable = PageCgroupUnevictable(pc);
- enum lru_list from = unevictable ? LRU_UNEVICTABLE :
- (LRU_FILE * !!file + !!active);
+ struct page_cgroup *pc;
+ struct mem_cgroup_per_zone *mz;
- if (lru == from)
+ if (mem_cgroup_disabled())
+ return;
+ pc = lookup_page_cgroup(page);
+ /* barrier to sync with "charge" */
+ smp_rmb();
+ if (!PageCgroupUsed(pc))
return;
- MEM_CGROUP_ZSTAT(mz, from) -= 1;
+ mz = page_cgroup_zoneinfo(pc);
+ MEM_CGROUP_ZSTAT(mz, lru) += 1;
+ list_add(&pc->lru, &mz->lists[lru]);
+}
+
+/*
+ * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
+ * lru because the page may.be reused after it's fully uncharged (because of
+ * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
+ * it again. This function is only used to charge SwapCache. It's done under
+ * lock_page and expected that zone->lru_lock is never held.
+ */
+static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
+{
+ unsigned long flags;
+ struct zone *zone = page_zone(page);
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
/*
- * However this is done under mz->lru_lock, another flags, which
- * are not related to LRU, will be modified from out-of-lock.
- * We have to use atomic set/clear flags.
+ * Forget old LRU when this page_cgroup is *not* used. This Used bit
+ * is guarded by lock_page() because the page is SwapCache.
*/
- if (is_unevictable_lru(lru)) {
- ClearPageCgroupActive(pc);
- SetPageCgroupUnevictable(pc);
- } else {
- if (is_active_lru(lru))
- SetPageCgroupActive(pc);
- else
- ClearPageCgroupActive(pc);
- ClearPageCgroupUnevictable(pc);
- }
+ if (!PageCgroupUsed(pc))
+ mem_cgroup_del_lru_list(page, page_lru(page));
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
- MEM_CGROUP_ZSTAT(mz, lru) += 1;
- list_move(&pc->lru, &mz->lists[lru]);
+static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
+{
+ unsigned long flags;
+ struct zone *zone = page_zone(page);
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+
+ spin_lock_irqsave(&zone->lru_lock, flags);
+ /* link when the page is linked to LRU but page_cgroup isn't */
+ if (PageLRU(page) && list_empty(&pc->lru))
+ mem_cgroup_add_lru_list(page, page_lru(page));
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
+
+
+void mem_cgroup_move_lists(struct page *page,
+ enum lru_list from, enum lru_list to)
+{
+ if (mem_cgroup_disabled())
+ return;
+ mem_cgroup_del_lru_list(page, from);
+ mem_cgroup_add_lru_list(page, to);
}
int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -320,37 +441,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
}
/*
- * This routine assumes that the appropriate zone's lru lock is already held
- */
-void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
-{
- struct page_cgroup *pc;
- struct mem_cgroup_per_zone *mz;
- unsigned long flags;
-
- if (mem_cgroup_subsys.disabled)
- return;
-
- /*
- * We cannot lock_page_cgroup while holding zone's lru_lock,
- * because other holders of lock_page_cgroup can be interrupted
- * with an attempt to rotate_reclaimable_page. But we cannot
- * safely get to page_cgroup without it, so just try_lock it:
- * mem_cgroup_isolate_pages allows for page left on wrong list.
- */
- pc = lookup_page_cgroup(page);
- if (!trylock_page_cgroup(pc))
- return;
- if (pc && PageCgroupUsed(pc)) {
- mz = page_cgroup_zoneinfo(pc);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_move_lists(pc, lru);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
- }
- unlock_page_cgroup(pc);
-}
-
-/*
* Calculate mapped_ratio under memory controller. This will be used in
* vmscan.c for deteremining we have to reclaim mapped pages.
*/
@@ -372,39 +462,108 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
*/
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
- return mem->prev_priority;
+ int prev_priority;
+
+ spin_lock(&mem->reclaim_param_lock);
+ prev_priority = mem->prev_priority;
+ spin_unlock(&mem->reclaim_param_lock);
+
+ return prev_priority;
}
void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
+ spin_lock(&mem->reclaim_param_lock);
if (priority < mem->prev_priority)
mem->prev_priority = priority;
+ spin_unlock(&mem->reclaim_param_lock);
}
void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
+ spin_lock(&mem->reclaim_param_lock);
mem->prev_priority = priority;
+ spin_unlock(&mem->reclaim_param_lock);
}
-/*
- * Calculate # of pages to be scanned in this priority/zone.
- * See also vmscan.c
- *
- * priority starts from "DEF_PRIORITY" and decremented in each loop.
- * (see include/linux/mmzone.h)
- */
+static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
+{
+ unsigned long active;
+ unsigned long inactive;
+ unsigned long gb;
+ unsigned long inactive_ratio;
+
+ inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
+ active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
+
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+
+ if (present_pages) {
+ present_pages[0] = inactive;
+ present_pages[1] = active;
+ }
+
+ return inactive_ratio;
+}
+
+int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
+{
+ unsigned long active;
+ unsigned long inactive;
+ unsigned long present_pages[2];
+ unsigned long inactive_ratio;
-long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
- int priority, enum lru_list lru)
+ inactive_ratio = calc_inactive_ratio(memcg, present_pages);
+
+ inactive = present_pages[0];
+ active = present_pages[1];
+
+ if (inactive * inactive_ratio < active)
+ return 1;
+
+ return 0;
+}
+
+unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
+ struct zone *zone,
+ enum lru_list lru)
{
- long nr_pages;
int nid = zone->zone_pgdat->node_id;
int zid = zone_idx(zone);
- struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
- nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
+ return MEM_CGROUP_ZSTAT(mz, lru);
+}
- return (nr_pages >> priority);
+struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
+ struct zone *zone)
+{
+ int nid = zone->zone_pgdat->node_id;
+ int zid = zone_idx(zone);
+ struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+
+ return &mz->reclaim_stat;
+}
+
+struct zone_reclaim_stat *
+mem_cgroup_get_reclaim_stat_from_page(struct page *page)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup_per_zone *mz;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ pc = lookup_page_cgroup(page);
+ mz = page_cgroup_zoneinfo(pc);
+ if (!mz)
+ return NULL;
+
+ return &mz->reclaim_stat;
}
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -429,95 +588,281 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
src = &mz->lists[lru];
- spin_lock(&mz->lru_lock);
scan = 0;
list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
if (scan >= nr_to_scan)
break;
+
+ page = pc->page;
if (unlikely(!PageCgroupUsed(pc)))
continue;
- page = pc->page;
-
if (unlikely(!PageLRU(page)))
continue;
- /*
- * TODO: play better with lumpy reclaim, grabbing anything.
- */
- if (PageUnevictable(page) ||
- (PageActive(page) && !active) ||
- (!PageActive(page) && active)) {
- __mem_cgroup_move_lists(pc, page_lru(page));
- continue;
- }
-
scan++;
- list_move(&pc->lru, &pc_list);
-
if (__isolate_lru_page(page, mode, file) == 0) {
list_move(&page->lru, dst);
nr_taken++;
}
}
- list_splice(&pc_list, src);
- spin_unlock(&mz->lru_lock);
-
*scanned = scan;
return nr_taken;
}
+#define mem_cgroup_from_res_counter(counter, member) \
+ container_of(counter, struct mem_cgroup, member)
+
/*
- * Charge the memory controller for page usage.
- * Return
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
+ * This routine finds the DFS walk successor. This routine should be
+ * called with hierarchy_mutex held
*/
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype,
- struct mem_cgroup *memcg)
+static struct mem_cgroup *
+mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
{
+ struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
+
+ curr_cgroup = curr->css.cgroup;
+ root_cgroup = root_mem->css.cgroup;
+
+ if (!list_empty(&curr_cgroup->children)) {
+ /*
+ * Walk down to children
+ */
+ mem_cgroup_put(curr);
+ cgroup = list_entry(curr_cgroup->children.next,
+ struct cgroup, sibling);
+ curr = mem_cgroup_from_cont(cgroup);
+ mem_cgroup_get(curr);
+ goto done;
+ }
+
+visit_parent:
+ if (curr_cgroup == root_cgroup) {
+ mem_cgroup_put(curr);
+ curr = root_mem;
+ mem_cgroup_get(curr);
+ goto done;
+ }
+
+ /*
+ * Goto next sibling
+ */
+ if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
+ mem_cgroup_put(curr);
+ cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
+ sibling);
+ curr = mem_cgroup_from_cont(cgroup);
+ mem_cgroup_get(curr);
+ goto done;
+ }
+
+ /*
+ * Go up to next parent and next parent's sibling if need be
+ */
+ curr_cgroup = curr_cgroup->parent;
+ goto visit_parent;
+
+done:
+ root_mem->last_scanned_child = curr;
+ return curr;
+}
+
+/*
+ * Visit the first child (need not be the first child as per the ordering
+ * of the cgroup list, since we track last_scanned_child) of @mem and use
+ * that to reclaim free pages from.
+ */
+static struct mem_cgroup *
+mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
+{
+ struct cgroup *cgroup;
+ struct mem_cgroup *ret;
+ bool obsolete;
+
+ obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child);
+
+ /*
+ * Scan all children under the mem_cgroup mem
+ */
+ mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
+ if (list_empty(&root_mem->css.cgroup->children)) {
+ ret = root_mem;
+ goto done;
+ }
+
+ if (!root_mem->last_scanned_child || obsolete) {
+
+ if (obsolete && root_mem->last_scanned_child)
+ mem_cgroup_put(root_mem->last_scanned_child);
+
+ cgroup = list_first_entry(&root_mem->css.cgroup->children,
+ struct cgroup, sibling);
+ ret = mem_cgroup_from_cont(cgroup);
+ mem_cgroup_get(ret);
+ } else
+ ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
+ root_mem);
+
+done:
+ root_mem->last_scanned_child = ret;
+ mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
+ return ret;
+}
+
+static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
+{
+ if (do_swap_account) {
+ if (res_counter_check_under_limit(&mem->res) &&
+ res_counter_check_under_limit(&mem->memsw))
+ return true;
+ } else
+ if (res_counter_check_under_limit(&mem->res))
+ return true;
+ return false;
+}
+
+static unsigned int get_swappiness(struct mem_cgroup *memcg)
+{
+ struct cgroup *cgrp = memcg->css.cgroup;
+ unsigned int swappiness;
+
+ /* root ? */
+ if (cgrp->parent == NULL)
+ return vm_swappiness;
+
+ spin_lock(&memcg->reclaim_param_lock);
+ swappiness = memcg->swappiness;
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ return swappiness;
+}
+
+/*
+ * Dance down the hierarchy if needed to reclaim memory. We remember the
+ * last child we reclaimed from, so that we don't end up penalizing
+ * one child extensively based on its position in the children list.
+ *
+ * root_mem is the original ancestor that we've been reclaim from.
+ */
+static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+ gfp_t gfp_mask, bool noswap)
+{
+ struct mem_cgroup *next_mem;
+ int ret = 0;
+
+ /*
+ * Reclaim unconditionally and don't check for return value.
+ * We need to reclaim in the current group and down the tree.
+ * One might think about checking for children before reclaiming,
+ * but there might be left over accounting, even after children
+ * have left.
+ */
+ ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
+ get_swappiness(root_mem));
+ if (mem_cgroup_check_under_limit(root_mem))
+ return 0;
+ if (!root_mem->use_hierarchy)
+ return ret;
+
+ next_mem = mem_cgroup_get_first_node(root_mem);
+
+ while (next_mem != root_mem) {
+ if (mem_cgroup_is_obsolete(next_mem)) {
+ mem_cgroup_put(next_mem);
+ next_mem = mem_cgroup_get_first_node(root_mem);
+ continue;
+ }
+ ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
+ get_swappiness(next_mem));
+ if (mem_cgroup_check_under_limit(root_mem))
+ return 0;
+ mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
+ next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
+ mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
+ }
+ return ret;
+}
+
+bool mem_cgroup_oom_called(struct task_struct *task)
+{
+ bool ret = false;
struct mem_cgroup *mem;
- struct page_cgroup *pc;
- unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct mem_cgroup_per_zone *mz;
- unsigned long flags;
+ struct mm_struct *mm;
- pc = lookup_page_cgroup(page);
- /* can happen at boot */
- if (unlikely(!pc))
+ rcu_read_lock();
+ mm = task->mm;
+ if (!mm)
+ mm = &init_mm;
+ mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
+ ret = true;
+ rcu_read_unlock();
+ return ret;
+}
+/*
+ * Unlike exported interface, "oom" parameter is added. if oom==true,
+ * oom-killer can be invoked.
+ */
+static int __mem_cgroup_try_charge(struct mm_struct *mm,
+ gfp_t gfp_mask, struct mem_cgroup **memcg,
+ bool oom)
+{
+ struct mem_cgroup *mem, *mem_over_limit;
+ int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct res_counter *fail_res;
+
+ if (unlikely(test_thread_flag(TIF_MEMDIE))) {
+ /* Don't account this! */
+ *memcg = NULL;
return 0;
- prefetchw(pc);
+ }
+
/*
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
-
- if (likely(!memcg)) {
- rcu_read_lock();
- mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!mem)) {
- rcu_read_unlock();
- return 0;
- }
- /*
- * For every charge from the cgroup, increment reference count
- */
- css_get(&mem->css);
- rcu_read_unlock();
+ mem = *memcg;
+ if (likely(!mem)) {
+ mem = try_get_mem_cgroup_from_mm(mm);
+ *memcg = mem;
} else {
- mem = memcg;
- css_get(&memcg->css);
+ css_get(&mem->css);
}
+ if (unlikely(!mem))
+ return 0;
+
+ VM_BUG_ON(mem_cgroup_is_obsolete(mem));
+
+ while (1) {
+ int ret;
+ bool noswap = false;
+
+ ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+ if (likely(!ret)) {
+ if (!do_swap_account)
+ break;
+ ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
+ &fail_res);
+ if (likely(!ret))
+ break;
+ /* mem+swap counter fails */
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ noswap = true;
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+ memsw);
+ } else
+ /* mem counter fails */
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+ res);
- while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
if (!(gfp_mask & __GFP_WAIT))
- goto out;
+ goto nomem;
- if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
- continue;
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
+ noswap);
/*
* try_to_free_mem_cgroup_pages() might not give us a full
@@ -525,49 +870,214 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
* moved to swap cache or just unmapped from the cgroup.
* Check the limit again to see if the reclaim reduced the
* current usage of the cgroup before giving up
+ *
*/
- if (res_counter_check_under_limit(&mem->res))
+ if (mem_cgroup_check_under_limit(mem_over_limit))
continue;
if (!nr_retries--) {
- mem_cgroup_out_of_memory(mem, gfp_mask);
- goto out;
+ if (oom) {
+ mutex_lock(&memcg_tasklist);
+ mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
+ mutex_unlock(&memcg_tasklist);
+ mem_over_limit->last_oom_jiffies = jiffies;
+ }
+ goto nomem;
}
}
+ return 0;
+nomem:
+ css_put(&mem->css);
+ return -ENOMEM;
+}
+static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
+{
+ struct mem_cgroup *mem;
+ swp_entry_t ent;
+
+ if (!PageSwapCache(page))
+ return NULL;
+
+ ent.val = page_private(page);
+ mem = lookup_swap_cgroup(ent);
+ if (!mem)
+ return NULL;
+ if (!css_tryget(&mem->css))
+ return NULL;
+ return mem;
+}
+
+/*
+ * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
+ * USED state. If already USED, uncharge and return.
+ */
+
+static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+ struct page_cgroup *pc,
+ enum charge_type ctype)
+{
+ /* try_charge() can return NULL to *memcg, taking care of it. */
+ if (!mem)
+ return;
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
res_counter_uncharge(&mem->res, PAGE_SIZE);
+ if (do_swap_account)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
css_put(&mem->css);
-
- goto done;
+ return;
}
pc->mem_cgroup = mem;
- /*
- * If a page is accounted as a page cache, insert to inactive list.
- * If anon, insert to active list.
- */
+ smp_wmb();
pc->flags = pcg_default_flags[ctype];
- mz = page_cgroup_zoneinfo(pc);
+ mem_cgroup_charge_statistics(mem, pc, true);
- spin_lock_irqsave(&mz->lru_lock, flags);
- __mem_cgroup_add_list(mz, pc);
- spin_unlock_irqrestore(&mz->lru_lock, flags);
unlock_page_cgroup(pc);
+}
-done:
- return 0;
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @pc: page_cgroup of the page.
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm following.
+ * - page is not on LRU (isolate_page() is useful.)
+ *
+ * returns 0 at success,
+ * returns -EBUSY when lock is busy or "pc" is unstable.
+ *
+ * This function does "uncharge" from old cgroup but doesn't do "charge" to
+ * new cgroup. It should be done by a caller.
+ */
+
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+ struct mem_cgroup *from, struct mem_cgroup *to)
+{
+ struct mem_cgroup_per_zone *from_mz, *to_mz;
+ int nid, zid;
+ int ret = -EBUSY;
+
+ VM_BUG_ON(from == to);
+ VM_BUG_ON(PageLRU(pc->page));
+
+ nid = page_cgroup_nid(pc);
+ zid = page_cgroup_zid(pc);
+ from_mz = mem_cgroup_zoneinfo(from, nid, zid);
+ to_mz = mem_cgroup_zoneinfo(to, nid, zid);
+
+ if (!trylock_page_cgroup(pc))
+ return ret;
+
+ if (!PageCgroupUsed(pc))
+ goto out;
+
+ if (pc->mem_cgroup != from)
+ goto out;
+
+ css_put(&from->css);
+ res_counter_uncharge(&from->res, PAGE_SIZE);
+ mem_cgroup_charge_statistics(from, pc, false);
+ if (do_swap_account)
+ res_counter_uncharge(&from->memsw, PAGE_SIZE);
+ pc->mem_cgroup = to;
+ mem_cgroup_charge_statistics(to, pc, true);
+ css_get(&to->css);
+ ret = 0;
out:
- css_put(&mem->css);
- return -ENOMEM;
+ unlock_page_cgroup(pc);
+ return ret;
+}
+
+/*
+ * move charges to its parent.
+ */
+
+static int mem_cgroup_move_parent(struct page_cgroup *pc,
+ struct mem_cgroup *child,
+ gfp_t gfp_mask)
+{
+ struct page *page = pc->page;
+ struct cgroup *cg = child->css.cgroup;
+ struct cgroup *pcg = cg->parent;
+ struct mem_cgroup *parent;
+ int ret;
+
+ /* Is ROOT ? */
+ if (!pcg)
+ return -EINVAL;
+
+
+ parent = mem_cgroup_from_cont(pcg);
+
+
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+ if (ret || !parent)
+ return ret;
+
+ if (!get_page_unless_zero(page))
+ return -EBUSY;
+
+ ret = isolate_lru_page(page);
+
+ if (ret)
+ goto cancel;
+
+ ret = mem_cgroup_move_account(pc, child, parent);
+
+ /* drop extra refcnt by try_charge() (move_account increment one) */
+ css_put(&parent->css);
+ putback_lru_page(page);
+ if (!ret) {
+ put_page(page);
+ return 0;
+ }
+ /* uncharge if move fails */
+cancel:
+ res_counter_uncharge(&parent->res, PAGE_SIZE);
+ if (do_swap_account)
+ res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+ put_page(page);
+ return ret;
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, enum charge_type ctype,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *mem;
+ struct page_cgroup *pc;
+ int ret;
+
+ pc = lookup_page_cgroup(page);
+ /* can happen at boot */
<