Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c         |    2
-rw-r--r--  mm/hugetlb.c         |    6
-rw-r--r--  mm/init-mm.c         |    6
-rw-r--r--  mm/kmemleak.c        |  100
-rw-r--r--  mm/ksm.c             |   71
-rw-r--r--  mm/memblock.c        |    2
-rw-r--r--  mm/memcontrol.c      |   55
-rw-r--r--  mm/memory.c          |   12
-rw-r--r--  mm/mempolicy.c       |   82
-rw-r--r--  mm/migrate.c         |   10
-rw-r--r--  mm/mmap.c            |   44
-rw-r--r--  mm/oom_kill.c        |  683
-rw-r--r--  mm/page-writeback.c  |   70
-rw-r--r--  mm/page_alloc.c      |   33
-rw-r--r--  mm/rmap.c            |  127
-rw-r--r--  mm/shmem.c           |  133
-rw-r--r--  mm/slab.c            |    2
-rw-r--r--  mm/swapfile.c        |  100
-rw-r--r--  mm/truncate.c        |   38
-rw-r--r--  mm/util.c            |   11
-rw-r--r--  mm/vmalloc.c         |    7
-rw-r--r--  mm/vmscan.c          |  533
-rw-r--r--  mm/vmstat.c          |    8
23 files changed, 1283 insertions(+), 852 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9f..3d4df44e4221 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file,
do {
struct page *page;
- pgoff_t index; /* Pagecache index for current page */
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_CACHE_SIZE - 1));
- index = pos >> PAGE_CACHE_SHIFT;
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_count(i));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009dbe..b61d2db9f34e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2349,11 +2349,17 @@ retry_avoidcopy:
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
+ mmu_notifier_invalidate_range_start(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
/* Make the old page be freed below */
new_page = old_page;
+ mmu_notifier_invalidate_range_end(mm,
+ address & huge_page_mask(h),
+ (address & huge_page_mask(h)) + huge_page_size(h));
}
page_cache_release(new_page);
page_cache_release(old_page);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0da9668..1d29cdfe8ebb 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -7,6 +7,11 @@
#include <asm/atomic.h>
#include <asm/pgtable.h>
+#include <asm/mmu.h>
+
+#ifndef INIT_MM_CONTEXT
+#define INIT_MM_CONTEXT(name)
+#endif
struct mm_struct init_mm = {
.mm_rb = RB_ROOT,
@@ -17,4 +22,5 @@ struct mm_struct init_mm = {
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.cpu_vm_mask = CPU_MASK_ALL,
+ INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2c0d032ac898..bd9bc214091b 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -211,6 +211,9 @@ static signed long jiffies_scan_wait;
static int kmemleak_stack_scan = 1;
/* protects the memory scanning, parameters and debug/kmemleak file access */
static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on, will set this var, skipping the disable */
+static int kmemleak_skip_disable;
+
/*
* Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -398,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
object = prio_tree_entry(node, struct kmemleak_object,
tree_node);
if (!alias && object->pointer != ptr) {
- kmemleak_warn("Found object by alias");
+ pr_warning("Found object by alias at 0x%08lx\n", ptr);
+ dump_stack();
+ dump_object_info(object);
object = NULL;
}
} else
@@ -695,7 +700,7 @@ static void paint_ptr(unsigned long ptr, int color)
}
/*
- * Make a object permanently as gray-colored so that it can no longer be
+ * Mark an object permanently as gray-colored so that it can no longer be
* reported as a leak. This is used in general to mark a false positive.
*/
static void make_gray_object(unsigned long ptr)
@@ -838,10 +843,19 @@ out:
rcu_read_unlock();
}
-/*
- * Memory allocation function callback. This function is called from the
- * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
- * vmalloc etc.).
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr: pointer to beginning of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object. If during memory
+ * scanning a number of references less than @min_count is found,
+ * the object is reported as a memory leak. If @min_count is 0,
+ * the object is never reported as a leak. If @min_count is -1,
+ * the object is ignored (not scanned and not reported as a leak)
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
*/
void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
gfp_t gfp)
@@ -855,9 +869,12 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
-/*
- * Memory freeing function callback. This function is called from the kernel
- * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
*/
void __ref kmemleak_free(const void *ptr)
{
@@ -870,9 +887,14 @@ void __ref kmemleak_free(const void *ptr)
}
EXPORT_SYMBOL_GPL(kmemleak_free);
-/*
- * Partial memory freeing function callback. This function is usually called
- * from bootmem allocator when (part of) a memory block is freed.
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr: pointer to the beginning or inside the object. This also
+ * represents the start of the range to be freed
+ * @size: size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
*/
void __ref kmemleak_free_part(const void *ptr, size_t size)
{
@@ -885,9 +907,12 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
-/*
- * Mark an already allocated memory block as a false positive. This will cause
- * the block to no longer be reported as leak and always be scanned.
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to no longer
+ * be reported as leak and always be scanned.
*/
void __ref kmemleak_not_leak(const void *ptr)
{
@@ -900,10 +925,14 @@ void __ref kmemleak_not_leak(const void *ptr)
}
EXPORT_SYMBOL(kmemleak_not_leak);
-/*
- * Ignore a memory block. This is usually done when it is known that the
- * corresponding block is not a leak and does not contain any references to
- * other allocated memory blocks.
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to be
+ * ignored (not scanned and not reported as a leak). This is usually done when
+ * it is known that the corresponding block is not a leak and does not contain
+ * any references to other allocated memory blocks.
*/
void __ref kmemleak_ignore(const void *ptr)
{
@@ -916,8 +945,16 @@ void __ref kmemleak_ignore(const void *ptr)
}
EXPORT_SYMBOL(kmemleak_ignore);
-/*
- * Limit the range to be scanned in an allocated memory block.
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr: pointer to beginning or inside the object. This also
+ * represents the start of the scan area
+ * @size: size of the scan area
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is used when it is known that only certain parts of an object
+ * contain references to other objects. Kmemleak will only scan these areas
+ * reducing the number false negatives.
*/
void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
{
@@ -930,8 +967,14 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL(kmemleak_scan_area);
-/*
- * Inform kmemleak not to scan the given memory block.
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function notifies kmemleak not to scan the given memory block. Useful
+ * in situations where it is known that the given object does not contain any
+ * references to other objects. Kmemleak will not scan such objects reducing
+ * the number of false negatives.
*/
void __ref kmemleak_no_scan(const void *ptr)
{
@@ -1602,7 +1645,9 @@ static int kmemleak_boot_config(char *str)
return -EINVAL;
if (strcmp(str, "off") == 0)
kmemleak_disable();
- else if (strcmp(str, "on") != 0)
+ else if (strcmp(str, "on") == 0)
+ kmemleak_skip_disable = 1;
+ else
return -EINVAL;
return 0;
}
@@ -1616,6 +1661,13 @@ void __init kmemleak_init(void)
int i;
unsigned long flags;
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+ if (!kmemleak_skip_disable) {
+ kmemleak_disable();
+ return;
+ }
+#endif
+
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
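
The kmemleak=on/off handling above boils down to: "off" disables kmemleak outright, "on" sets kmemleak_skip_disable, and with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF kmemleak_init() disables itself unless that flag was set. A minimal standalone sketch of the same decision flow (userspace C with stand-in globals and helper names, not the kernel code):

#include <stdio.h>
#include <string.h>

static int default_off = 1;           /* stands in for CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF */
static int kmemleak_skip_disable;
static int kmemleak_enabled = 1;

/* Mirrors kmemleak_boot_config() in the hunk above. */
static int boot_config(const char *str)
{
	if (!str)
		return -1;
	if (strcmp(str, "off") == 0)
		kmemleak_enabled = 0;         /* kmemleak_disable() */
	else if (strcmp(str, "on") == 0)
		kmemleak_skip_disable = 1;    /* skip the DEFAULT_OFF auto-disable */
	else
		return -1;                    /* -EINVAL */
	return 0;
}

/* Mirrors the new check at the top of kmemleak_init(). */
static void init_check(void)
{
	if (default_off && !kmemleak_skip_disable)
		kmemleak_enabled = 0;         /* kmemleak_disable() */
}

int main(void)
{
	boot_config("on");                    /* try "off", or skip the call entirely */
	init_check();
	printf("kmemleak %s\n", kmemleak_enabled ? "enabled" : "disabled");
	return 0;
}
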
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c3e99b4ae7c..e2ae00458320 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,6 +33,7 @@
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
+#include <linux/hash.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -153,8 +154,9 @@ struct rmap_item {
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;
-#define MM_SLOTS_HASH_HEADS 1024
-static struct hlist_head *mm_slots_hash;
+#define MM_SLOTS_HASH_SHIFT 10
+#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
+static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
static struct mm_slot ksm_mm_head = {
.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
kmem_cache_free(mm_slot_cache, mm_slot);
}
-static int __init mm_slots_hash_init(void)
-{
- mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
- GFP_KERNEL);
- if (!mm_slots_hash)
- return -ENOMEM;
- return 0;
-}
-
-static void __init mm_slots_hash_free(void)
-{
- kfree(mm_slots_hash);
-}
-
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
struct hlist_head *bucket;
struct hlist_node *node;
- bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
- % MM_SLOTS_HASH_HEADS];
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
hlist_for_each_entry(mm_slot, node, bucket, link) {
if (mm == mm_slot->mm)
return mm_slot;
@@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
{
struct hlist_head *bucket;
- bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
- % MM_SLOTS_HASH_HEADS];
+ bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
mm_slot->mm = mm;
hlist_add_head(&mm_slot->link, bucket);
}
@@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
struct anon_vma *anon_vma)
{
rmap_item->anon_vma = anon_vma;
- atomic_inc(&anon_vma->external_refcount);
+ get_anon_vma(anon_vma);
}
-static void drop_anon_vma(struct rmap_item *rmap_item)
+static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
{
struct anon_vma *anon_vma = rmap_item->anon_vma;
- if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
- if (empty)
- anon_vma_free(anon_vma);
- }
+ drop_anon_vma(anon_vma);
}
/*
@@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item)
* It is not an accident that whenever we want to break COW
* to undo, we also need to drop a reference to the anon_vma.
*/
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
@@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
ksm_pages_sharing--;
else
ksm_pages_shared--;
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
@@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
else
ksm_pages_shared--;
- drop_anon_vma(rmap_item);
+ ksm_drop_anon_vma(rmap_item);
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -1566,7 +1547,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
@@ -1589,7 +1570,7 @@ again:
if (!search_new_forks || !mapcount)
break;
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
if (!mapcount)
goto out;
}
@@ -1619,7 +1600,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
@@ -1637,11 +1618,11 @@ again:
ret = try_to_unmap_one(page, vma,
rmap_item->address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page)) {
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
goto out;
}
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1671,7 +1652,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- spin_lock(&anon_vma->lock);
+ anon_vma_lock(anon_vma);
list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
@@ -1688,11 +1669,11 @@ again:
ret = rmap_one(page, vma, rmap_item->address, arg);
if (ret != SWAP_AGAIN) {
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
goto out;
}
}
- spin_unlock(&anon_vma->lock);
+ anon_vma_unlock(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1943,15 +1924,11 @@ static int __init ksm_init(void)
if (err)
goto out;
- err = mm_slots_hash_init();
- if (err)
- goto out_free1;
-
ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
if (IS_ERR(ksm_thread)) {
printk(KERN_ERR "ksm: creating kthread failed\n");
err = PTR_ERR(ksm_thread);
- goto out_free2;
+ goto out_free;
}
#ifdef CONFIG_SYSFS
@@ -1959,7 +1936,7 @@ static int __init ksm_init(void)
if (err) {
printk(KERN_ERR "ksm: register sysfs failed\n");
kthread_stop(ksm_thread);
- goto out_free2;
+ goto out_free;
}
#else
ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
@@ -1975,9 +1952,7 @@ static int __init ksm_init(void)
#endif
return 0;
-out_free2:
- mm_slots_hash_free();
-out_free1:
+out_free:
ksm_slab_free();
out:
return err;
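
The ksm.c changes above drop the runtime-allocated mm_slots_hash in favour of a static array indexed with hash_ptr(mm, MM_SLOTS_HASH_SHIFT), replacing the old "divide the pointer by sizeof(struct mm_struct), then modulo" bucket computation. A standalone sketch of the two bucket calculations, using a simplified multiplicative hash in place of the kernel's hash_ptr() and a made-up mm_struct size and address:

#include <stdio.h>
#include <stdint.h>

#define MM_SLOTS_HASH_SHIFT 10
#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)

/* Simplified multiplicative hash standing in for hash_ptr(): multiply by a
 * large odd constant and keep the top 'bits' bits of the 64-bit product. */
static unsigned int hash64_sketch(uint64_t val, unsigned int bits)
{
	return (unsigned int)((val * 0x9e3779b97f4a7c15ULL) >> (64 - bits));
}

int main(void)
{
	uint64_t assumed_mm_size = 896;          /* placeholder for sizeof(struct mm_struct) */
	uint64_t mm = 0xffff880012345600ULL;     /* made-up mm_struct address */

	unsigned int old_bucket = (unsigned int)((mm / assumed_mm_size) % MM_SLOTS_HASH_HEADS);
	unsigned int new_bucket = hash64_sketch(mm, MM_SLOTS_HASH_SHIFT);

	printf("old scheme bucket:     %u\n", old_bucket);
	printf("hash_ptr-style bucket: %u\n", new_bucket);
	return 0;
}
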
diff --git a/mm/memblock.c b/mm/memblock.c
index 3024eb30fc27..43840b305ecb 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -504,7 +504,7 @@ int __init memblock_is_reserved(u64 addr)
int memblock_is_region_reserved(u64 base, u64 size)
{
- return memblock_overlaps_region(&memblock.reserved, base, size);
+ return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
}
/*
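
The one-line mm/memblock.c fix above works because memblock_overlaps_region() returns the index of the overlapping region (or a negative value when nothing overlaps), so using its raw return value as a boolean mistakes index 0 for "no overlap". A standalone sketch of that pitfall, with a simplified find_overlap() helper rather than the real memblock API:

#include <stdio.h>

/* Simplified stand-in for memblock_overlaps_region(): return the index of the
 * region overlapping [base, base+size), or -1 if none does. */
static long find_overlap(const unsigned long bases[], const unsigned long sizes[],
                         int count, unsigned long base, unsigned long size)
{
	for (int i = 0; i < count; i++)
		if (base < bases[i] + sizes[i] && bases[i] < base + size)
			return i;
	return -1;
}

int main(void)
{
	unsigned long bases[] = { 0x1000, 0x8000 };
	unsigned long sizes[] = { 0x1000, 0x1000 };

	/* Overlaps region 0, so the returned index is 0 -- false as a boolean. */
	long idx = find_overlap(bases, sizes, 2, 0x1800, 0x100);

	printf("raw return as bool: %d\n", idx ? 1 : 0);   /* 0: overlap missed */
	printf("fixed check (>= 0): %d\n", idx >= 0);      /* 1: overlap found  */
	return 0;
}
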
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af8..0576e9e64586 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,8 @@
#include <asm/uaccess.h>
+#include <trace/events/vmscan.h>
+
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -211,8 +213,6 @@ struct mem_cgroup {
*/
spinlock_t reclaim_param_lock;
- int prev_priority; /* for recording reclaim priority */
-
/*
* While reclaiming in a hierarchy, we cache the last child we
* reclaimed from.
@@ -858,35 +858,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
return ret;
}
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
- int prev_priority;
-
- spin_lock(&mem->reclaim_param_lock);
- prev_priority = mem->prev_priority;
- spin_unlock(&mem->reclaim_param_lock);
-
- return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- if (priority < mem->prev_priority)
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
- spin_lock(&mem->reclaim_param_lock);
- mem->prev_priority = priority;
- spin_unlock(&mem->reclaim_param_lock);
-}
-
static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
{
unsigned long active;
@@ -1038,6 +1009,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
}
*scanned = scan;
+
+ trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
+ 0, 0, 0, mode);
+
return nr_taken;
}
@@ -1158,6 +1133,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
}
/*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+ u64 limit;
+ u64 memsw;
+
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
+ total_swap_pages;
+ memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+ /*
+ * If memsw is finite and limits the amount of swap space available
+ * to this memcg, return that limit.
+ */
+ return min(limit, memsw);
+}
+
+/*
* Visit the first child (need not be the first child as per the ordering
* of the cgroup list, since we track last_scanned_child) of @mem and use
* that to reclaim free pages from.
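
The new mem_cgroup_get_limit() above caps a memcg's worst-case footprint at min(memory limit + total swap, memsw limit). A quick standalone sketch of that arithmetic with illustrative numbers (not taken from the patch):

#include <stdio.h>
#include <stdint.h>

/* Same shape as mem_cgroup_get_limit(): the group can use its memory limit
 * plus whatever swap exists, unless memsw imposes a tighter combined cap. */
static uint64_t memcg_limit(uint64_t mem_limit, uint64_t total_swap, uint64_t memsw_limit)
{
	uint64_t limit = mem_limit + total_swap;
	return limit < memsw_limit ? limit : memsw_limit;
}

int main(void)
{
	uint64_t mib = 1024 * 1024;

	/* 512 MiB memory limit, 256 MiB of swap, 600 MiB memsw limit:
	 * memsw is the tighter bound, so the effective limit is 600 MiB. */
	printf("%llu MiB\n",
	       (unsigned long long)(memcg_limit(512 * mib, 256 * mib, 600 * mib) / mib));
	return 0;
}
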
diff --git a/mm/memory.c b/mm/memory.c
index bde42c6d3633..858829d06a92 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
{
pgd_t *pgd;
unsigned long next;
- unsigned long start;
/*
* The next few lines have given us lots of grief...
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
if (addr > end - 1)
return;
- start = addr;
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -2008,11 +2006,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
{
pgd_t *pgd;
unsigned long next;
- unsigned long start = addr, end = addr + size;
+ unsigned long end = addr + size;
int err;
BUG_ON(addr >= end);
- mmu_notifier_invalidate_range_start(mm, start, end);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -2020,7 +2017,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
if (err)
break;
} while (pgd++, addr = next, addr != end);
- mmu_notifier_invalidate_range_end(mm, start, end);
+
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -2630,6 +2627,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t pte;
struct mem_cgroup *ptr = NULL;
+ int exclusive = 0;
int ret = 0;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2724,10 +2722,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
+ ret |= VM_FAULT_WRITE;
+ exclusive = 1;
}
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
- page_add_anon_rmap(page, vma, address);
+ do_page_add_anon_rmap(page, vma, address, exclusive);
/* It's better to call commit-charge after rmap is established */
mem_cgroup_commit_charge_swapin(page, ptr);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb51..f969da5dd8a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1275,33 +1275,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, new_nodes)
{
const struct cred *cred = current_cred(), *tcred;
- struct mm_struct *mm;
+ struct mm_struct *mm = NULL;
struct task_struct *task;
- nodemask_t old;
- nodemask_t new;
nodemask_t task_nodes;
int err;
+ nodemask_t *old;
+ nodemask_t *new;
+ NODEMASK_SCRATCH(scratch);
+
+ if (!scratch)
+ return -ENOMEM;
+
+ old = &scratch->mask1;
+ new = &scratch->mask2;
- err = get_nodes(&old, old_nodes, maxnode);
+ err = get_nodes(old, old_nodes, maxnode);
if (err)
- return err;
+ goto out;
- err = get_nodes(&new, new_nodes, maxnode);
+ err = get_nodes(new, new_nodes, maxnode);
if (err)
- return err;
+ goto out;
/* Find the mm_struct */
read_lock(&tasklist_lock);
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
read_unlock(&tasklist_lock);
- return -ESRCH;
+ err = -ESRCH;
+ goto out;
}
mm = get_task_mm(task);
read_unlock(&tasklist_lock);
+ err = -EINVAL;
if (!mm)
- return -EINVAL;
+ goto out;
/*
* Check if this process has the right to modify the specified
@@ -1322,12 +1331,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
- if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
+ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
- if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
+ if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
err = -EINVAL;
goto out;
}
@@ -1336,10 +1345,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
if (err)
goto out;
- err = do_migrate_pages(mm, &old, &new,
+ err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
- mmput(mm);
+ if (mm)
+ mmput(mm);
+ NODEMASK_SCRATCH_FREE(scratch);
+
return err;
}
@@ -1712,6 +1724,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
}
#endif
+/*
+ * mempolicy_nodemask_intersects
+ *
+ * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
+ * policy. Otherwise, check for intersection between mask and the policy
+ * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
+ * policy, always return true since it may allocate elsewhere on fallback.
+ *
+ * Takes task_lock(tsk) to prevent freeing of its mempolicy.
+ */
+bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+ const nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ bool ret = true;
+
+ if (!mask)
+ return ret;
+ task_lock(tsk);
+ mempolicy = tsk->mempolicy;
+ if (!mempolicy)
+ goto out;
+
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ /*
+ * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
+ * allocate from, they may fallback to other nodes when oom.
+ * Thus, it's possible for tsk to have allocated memory from
+ * nodes in mask.
+ */
+ break;
+ case MPOL_BIND:
+ case MPOL_INTERLEAVE:
+ ret = nodes_intersects(mempolicy->v.nodes, *mask);
+ break;
+ default:
+ BUG();
+ }
+out:
+ task_unlock(tsk);
+ return ret;
+}
+
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
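
mempolicy_nodemask_intersects() above reduces to a small decision: no policy, or a preferred/local policy, may end up allocating on any node (return true); bind and interleave policies only touch their own nodes, so check for an intersection with the given mask. A standalone sketch of that logic with node masks as plain bit masks rather than the kernel's nodemask_t:

#include <stdbool.h>
#include <stdio.h>

enum mode { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE };

/* One bit per NUMA node; mirrors the switch in mempolicy_nodemask_intersects(). */
static bool nodemask_intersects_sketch(enum mode mode, unsigned long policy_nodes,
                                       unsigned long mask)
{
	switch (mode) {
	case MPOL_DEFAULT:
	case MPOL_PREFERRED:
		/* Default and preferred/local policies may fall back to other
		 * nodes, so an intersection has to be assumed. */
		return true;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		return (policy_nodes & mask) != 0;
	}
	return true;
}

int main(void)
{
	/* Task bound to nodes {0,1}; mask of interest is node 2 only. */
	printf("bind {0,1} vs {2}: %d\n",
	       (int)nodemask_intersects_sketch(MPOL_BIND, 0x3, 0x4));
	printf("preferred vs {2}:  %d\n",
	       (int)nodemask_intersects_sketch(MPOL_PREFERRED, 0x1, 0x4));
	return 0;
}
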
diff --git a/mm/migrate.c b/mm/migrate.c
index 4205b1d6049e..38e7cad782f4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,7 +639,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* exist when the page is remapped later
*/
anon_vma = page_anon_vma(page);
- atomic_inc(&anon_vma->external_refcount);
+ get_anon_vma(anon_vma);
}
}
@@ -682,12 +682,8 @@ skip_unmap:
rcu_unlock:
/* Drop an anon_vma reference if we took one */
- if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
- if (empty)
- anon_vma_free(anon_vma);
- }
+ if (anon_vma)
+ drop_anon_vma(anon_vma);
if (rcu_locked)
rcu_read_unlock();
diff --git a/mm/mmap.c b/mm/mmap.c
index e38e910cb756..31003338b978 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -452,12 +452,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mapping->i_mmap_lock);
vma->vm_truncate_count = mapping->truncate_count;
}
- anon_vma_lock(vma);
__vma_link(mm, vma, prev, rb_link, rb_parent);
__vma_link_file(vma);
- anon_vma_unlock(vma);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -506,6 +504,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
struct prio_tree_root *root = NULL;
+ struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
long adjust_next = 0;
int remove_next = 0;
@@ -578,6 +577,17 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ /*
+ * When changing only vma->vm_end, we don't really need anon_vma
+ * lock. This is a fairly rare case by itself, but the anon_vma
+ * lock may be shared between many sibling processes. Skipping
+ * the lock for brk adjustments makes a difference sometimes.
+ */
+ if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
+ anon_vma = vma->anon_vma;
+ anon_vma_lock(anon_vma);
+ }
+
if (root) {
flush_dcache_mmap_lock(mapping);
vma_prio_tree_remove(vma, root);
@@ -617,6 +627,8 @@ again: remove_next = 1 + (end > next->vm_end);
__insert_vm_struct(mm, insert);
}
+ if (anon_vma)
+ anon_vma_unlock(anon_vma);
if (mapping)
spin_unlock(&mapping->i_mmap_lock);
@@ -1710,7 +1722,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- anon_vma_lock(vma);
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1733,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else {
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return -ENOMEM;
}
error = 0;
@@ -1739,7 +1751,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
perf_event_mmap(vma);
}
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1764,7 +1776,7 @@ static int expand_downwards(struct vm_area_struct *vma,
if (error)
return error;
- anon_vma_lock(vma);
+ vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -1786,7 +1798,7 @@ static int expand_downwards(struct vm_area_struct *vma,
perf_event_mmap(vma);
}
}
- anon_vma_unlock(vma);
+ vma_unlock_anon_vma(vma);
return error;
}
@@ -2470,23 +2482,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
- if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
+ spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
/*
* We can safely modify head.next after taking the
- * anon_vma->lock. If some other vma in this mm shares
+ * anon_vma->root->lock. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->lock.
+ * anon_vma->root->lock.
*/
if (__test_and_set_bit(0, (unsigned long *)
- &anon_vma->head.next))
+ &anon_vma->root->head.next))
BUG();
}
}
@@ -2577,7 +2589,7 @@ out_unlock:
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
- if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+ if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
/*