summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-07-25 21:00:19 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2011-07-25 21:00:19 -0700
commit45b583b10a8b438b970e95a7d1d4db22c9e35004 (patch)
tree14fa481598289df0459580c582b48a9d95db51f6 /mm
parent154dd78d30b56ffb8b447f629bfcceb14150e5c4 (diff)
parentf19da2ce8ef5e49b8b8ea199c3601dd45d71b262 (diff)
Merge 'akpm' patch series
* Merge akpm patch series: (122 commits) drivers/connector/cn_proc.c: remove unused local Documentation/SubmitChecklist: add RCU debug config options reiserfs: use hweight_long() reiserfs: use proper little-endian bitops pnpacpi: register disabled resources drivers/rtc/rtc-tegra.c: properly initialize spinlock drivers/rtc/rtc-twl.c: check return value of twl_rtc_write_u8() in twl_rtc_set_time() drivers/rtc: add support for Qualcomm PMIC8xxx RTC drivers/rtc/rtc-s3c.c: support clock gating drivers/rtc/rtc-mpc5121.c: add support for RTC on MPC5200 init: skip calibration delay if previously done misc/eeprom: add eeprom access driver for digsy_mtc board misc/eeprom: add driver for microwire 93xx46 EEPROMs checkpatch.pl: update $logFunctions checkpatch: make utf-8 test --strict checkpatch.pl: add ability to ignore various messages checkpatch: add a "prefer __aligned" check checkpatch: validate signature styles and To: and Cc: lines checkpatch: add __rcu as a sparse modifier checkpatch: suggest using min_t or max_t ... Did this as a merge because of (trivial) conflicts in - Documentation/feature-removal-schedule.txt - arch/xtensa/include/asm/uaccess.h that were just easier to fix up in the merge than in the patch series.
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c1
-rw-r--r--mm/dmapool.c2
-rw-r--r--mm/filemap.c14
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/hugetlb.c41
-rw-r--r--mm/memblock.c8
-rw-r--r--mm/memory.c125
-rw-r--r--mm/memory_hotplug.c68
-rw-r--r--mm/mmap.c34
-rw-r--r--mm/nommu.c34
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page-writeback.c11
-rw-r--r--mm/page_alloc.c58
-rw-r--r--mm/page_cgroup.c10
-rw-r--r--mm/pagewalk.c49
-rw-r--r--mm/shmem.c552
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/thrash.c17
-rw-r--r--mm/truncate.c143
19 files changed, 695 insertions, 482 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 2ef0dc9e7f39..8290b1e88257 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -606,6 +606,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ bdi_set_min_ratio(bdi, 0);
trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
del_timer_sync(&bdi->wb.wakeup_timer);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519a..fbb58e346888 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool)
{
struct device *dev = pool->dev;
- dma_pool_destroy(pool);
WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
+ dma_pool_destroy(pool);
}
EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/filemap.c b/mm/filemap.c
index f820e600f1ad..10a171113273 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -128,6 +128,7 @@ void __delete_from_page_cache(struct page *page)
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
@@ -483,6 +484,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
spin_unlock_irq(&mapping->tree_lock);
} else {
page->mapping = NULL;
+ /* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
page_cache_release(page);
@@ -1792,7 +1794,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data,
gfp_t gfp)
{
@@ -1823,7 +1825,7 @@ repeat:
static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data,
gfp_t gfp)
@@ -1863,7 +1865,7 @@ out:
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
- * @data: destination for read data
+ * @data: first arg to filler(data, page) function, often left as NULL
*
* Same as read_cache_page, but don't wait for page to become unlocked
* after submitting it to the filler.
@@ -1875,7 +1877,7 @@ out:
*/
struct page *read_cache_page_async(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data)
{
return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -1923,7 +1925,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
- * @data: destination for read data
+ * @data: first arg to filler(data, page) function, often left as NULL
*
* Read into the page cache. If a page already exists, and PageUptodate() is
* not set, try to fill the page then wait for it to become unlocked.
@@ -1932,7 +1934,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
*/
struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
- int (*filler)(void *,struct page*),
+ int (*filler)(void *, struct page *),
void *data)
{
return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd2..e2d1587be269 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1596,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm)
list_del(&mm_slot->mm_node);
free = 1;
}
+ spin_unlock(&khugepaged_mm_lock);
if (free) {
- spin_unlock(&khugepaged_mm_lock);
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
free_mm_slot(mm_slot);
mmdrop(mm);
} else if (mm_slot) {
- spin_unlock(&khugepaged_mm_lock);
/*
* This is required to serialize against
* khugepaged_test_exit() (which is guaranteed to run
@@ -1614,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm)
*/
down_write(&mm->mmap_sem);
up_write(&mm->mmap_sem);
- } else
- spin_unlock(&khugepaged_mm_lock);
+ }
}
static void release_pte_page(struct page *page)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bfcf153bc829..dae27ba3be2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,7 +24,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <asm/io.h>
+#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/node.h>
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock);
* must either hold the mmap_sem for write, or the mmap_sem for read and
* the hugetlb_instantiation mutex:
*
- * down_write(&mm->mmap_sem);
+ * down_write(&mm->mmap_sem);
* or
- * down_read(&mm->mmap_sem);
- * mutex_lock(&hugetlb_instantiation_mutex);
+ * down_read(&mm->mmap_sem);
+ * mutex_lock(&hugetlb_instantiation_mutex);
*/
struct file_region {
struct list_head link;
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page)
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
- 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
- 1 << PG_private | 1<< PG_writeback);
+ page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ 1 << PG_referenced | 1 << PG_dirty |
+ 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1 << PG_writeback);
}
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
@@ -591,7 +592,6 @@ int PageHuge(struct page *page)
return dtor == free_huge_page;
}
-
EXPORT_SYMBOL_GPL(PageHuge);
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
@@ -1105,8 +1105,16 @@ static void __init gather_bootmem_prealloc(void)
struct huge_bootmem_page *m;
list_for_each_entry(m, &huge_boot_pages, list) {
- struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+ struct page *page;
+
+#ifdef CONFIG_HIGHMEM
+ page = pfn_to_page(m->phys >> PAGE_SHIFT);
+ free_bootmem_late((unsigned long)m,
+ sizeof(struct huge_bootmem_page));
+#else
+ page = virt_to_page(m);
+#endif
__ClearPageReserved(page);
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
@@ -2124,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
pte_t entry;
entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
- if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+ if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
update_mmu_cache(vma, address, ptep);
- }
}
@@ -2181,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp)) {
+ if (non_swap_entry(swp) && is_migration_entry(swp))
return 1;
- } else
+ else
return 0;
}
@@ -2194,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return 0;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp))
return 1;
- } else
+ else
return 0;
}
@@ -2559,7 +2566,7 @@ retry:
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON |
+ ret = VM_FAULT_HWPOISON |
VM_FAULT_SET_HINDEX(h - hstates);
goto backout_unlocked;
}
@@ -2627,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
migration_entry_wait(mm, (pmd_t *)ptep, address);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
+ return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(h - hstates);
}
diff --git a/mm/memblock.c b/mm/memblock.c
index a0562d1a6ad4..ccbf97339592 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -758,9 +758,9 @@ void __init memblock_analyze(void)
/* Check marker in the unused last array entry */
WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
- != (phys_addr_t)RED_INACTIVE);
+ != MEMBLOCK_INACTIVE);
WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
- != (phys_addr_t)RED_INACTIVE);
+ != MEMBLOCK_INACTIVE);
memblock.memory_size = 0;
@@ -786,8 +786,8 @@ void __init memblock_init(void)
memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
/* Write a marker in the unused last array entry */
- memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
- memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+ memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
+ memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
/* Create a dummy zero size MEMBLOCK which will get coalesced away later.
* This simplifies the memblock_add() code below...
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d941cb..a56e3ba816b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
return addr;
}
-#ifdef CONFIG_PREEMPT
-# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
-#else
-/* No preempt: go for improved straight-line efficiency */
-# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
-#endif
-
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
@@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
*
* Unmap all pages in the vma list.
*
- * We aim to not hold locks for too long (for scheduling latency reasons).
- * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
- * return the ending mmu_gather to the caller.
- *
* Only addresses between `start' and `end' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
@@ -1816,7 +1805,63 @@ next_page:
}
EXPORT_SYMBOL(__get_user_pages);
-/**
+/*
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @address: user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for locking reasons
+ * we try to access user memory in atomic context (within a pagefault_disable()
+ * section), this returns -EFAULT, and we want to resolve the user fault before
+ * trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * handle_mm_fault() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software. On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This should be called with the mm_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long address, unsigned int fault_flags)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ return -EFAULT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+ return -EHWPOISON;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+ return 0;
+}
+
+/*
* get_user_pages() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
@@ -3104,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *page_table;
spinlock_t *ptl;
struct page *page;
+ struct page *cow_page;
pte_t entry;
int anon = 0;
- int charged = 0;
struct page *dirty_page = NULL;
struct vm_fault vmf;
int ret;
int page_mkwrite = 0;
+ /*
+ * If we do COW later, allocate page befor taking lock_page()
+ * on the file cache page. This will reduce lock holding time.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
+ cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!cow_page)
+ return VM_FAULT_OOM;
+
+ if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
+ page_cache_release(cow_page);
+ return VM_FAULT_OOM;
+ }
+ } else
+ cow_page = NULL;
+
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
@@ -3120,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY)))
- return ret;
+ goto uncharge_out;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
- return VM_FAULT_HWPOISON;
+ ret = VM_FAULT_HWPOISON;
+ goto uncharge_out;
}
/*
@@ -3143,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
page = vmf.page;
if (flags & FAULT_FLAG_WRITE) {
if (!(vma->vm_flags & VM_SHARED)) {
+ page = cow_page;
anon = 1;
- if (unlikely(anon_vma_prepare(vma))) {
- ret = VM_FAULT_OOM;
- goto out;
- }
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
- vma, address);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
- }
- if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
- ret = VM_FAULT_OOM;
- page_cache_release(page);
- goto out;
- }
- charged = 1;
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
@@ -3228,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, page_table);
} else {
- if (charged)
- mem_cgroup_uncharge_page(page);
+ if (cow_page)
+ mem_cgroup_uncharge_page(cow_page);
if (anon)
page_cache_release(page);
else
@@ -3238,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(page_table, ptl);
-out:
if (dirty_page) {
struct address_space *mapping = page->mapping;
@@ -3268,6 +3318,13 @@ out:
unwritable_page:
page_cache_release(page);
return ret;
+uncharge_out:
+ /* fs's fault handler get error */
+ if (cow_page) {
+ mem_cgroup_uncharge_page(cow_page);
+ page_cache_release(cow_page);
+ }
+ return ret;
}
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11e..6e7d8b21dbfa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,17 @@
#include "internal.h"
+/*
+ * online_page_callback contains pointer to current page onlining function.
+ * Initially it is generic_online_page(). If it is required it could be
+ * changed by calling set_online_page_callback() for callback registration
+ * and restore_online_page_callback() for generic callback restore.
+ */
+
+static void generic_online_page(struct page *page);
+
+static online_page_callback_t online_page_callback = generic_online_page;
+
DEFINE_MUTEX(mem_hotplug_mutex);
void lock_memory_hotplug(void)
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
}
EXPORT_SYMBOL_GPL(__remove_pages);
-void online_page(struct page *page)
+int set_online_page_callback(online_page_callback_t callback)
+{
+ int rc = -EINVAL;
+
+ lock_memory_hotplug();
+
+ if (online_page_callback == generic_online_page) {
+ online_page_callback = callback;
+ rc = 0;
+ }
+
+ unlock_memory_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(set_online_page_callback);
+
+int restore_online_page_callback(online_page_callback_t callback)
+{
+ int rc = -EINVAL;
+
+ lock_memory_hotplug();
+
+ if (online_page_callback == callback) {
+ online_page_callback = generic_online_page;
+ rc = 0;
+ }
+
+ unlock_memory_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(restore_online_page_callback);
+
+void __online_page_set_limits(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
- totalram_pages++;
if (pfn >= num_physpages)
num_physpages = pfn + 1;
+}
+EXPORT_SYMBOL_GPL(__online_page_set_limits);
+
+void __online_page_increment_counters(struct page *page)
+{
+ totalram_pages++;
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
totalhigh_pages++;
#endif
+}
+EXPORT_SYMBOL_GPL(__online_page_increment_counters);
+void __online_page_free(struct page *page)
+{
ClearPageReserved(page);
init_page_count(page);
__free_page(page);
}
+EXPORT_SYMBOL_GPL(__online_page_free);
+
+static void generic_online_page(struct page *page)
+{
+ __online_page_set_limits(page);
+ __online_page_increment_counters(page);
+ __online_page_free(page);
+}
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
if (PageReserved(pfn_to_page(start_pfn)))
for (i = 0; i < nr_pages; i++) {
page = pfn_to_page(start_pfn + i);
- online_page(page);
+ (*online_page_callback)(page);
onlined_pages++;
}
*(unsigned long *)arg = onlined_pages;
diff --git a/mm/mmap.c b/mm/mmap.c
index d49736ff8a8d..a65efd4db3e1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
- free = global_page_state(NR_FILE_PAGES);
free += nr_swap_pages;
/*
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
- if (n <= totalreserve_pages)
+ if (free <= totalreserve_pages)
goto error;
else
- n -= totalreserve_pages;
+ free -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
- n -= n / 32;
- free += n;
+ free -= free / 32;
if (free > pages)
return 0;
diff --git a/mm/nommu.c b/mm/nommu.c
index 5c5c2d4b1807..4358032566e9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1884,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- unsigned long n;
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
- free = global_page_state(NR_FILE_PAGES);
free += nr_swap_pages;
/*
@@ -1898,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free += global_page_state(NR_SLAB_RECLAIMABLE);
/*
- * Leave the last 3% for root
- */
- if (!cap_sys_admin)
- free -= free / 32;
-
- if (free > pages)
- return 0;
-
- /*
- * nr_free_pages() is very expensive on large systems,
- * only call if we're about to fail.
- */
- n = nr_free_pages();
-
- /*
* Leave reserved pages. The pages are not for anonymous pages.
*/
- if (n <= totalreserve_pages)
+ if (free <= totalreserve_pages)
goto error;
else
- n -= totalreserve_pages;
+ free -= totalreserve_pages;
/*
* Leave the last 3% for root
*/
if (!cap_sys_admin)
- n -= n / 32;
- free += n;
+ free -= free / 32;
if (free > pages)
return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b0be989d4365..eafff89b3dd6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -487,7 +487,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/*
* If any of p's children has a different mm and is eligible for kill,
- * the one with the highest badness() score is sacrificed for its
+ * the one with the highest oom_badness() score is sacrificed for its
* parent. This attempts to lose the minimal amount of work done while
* still freeing memory.
*/
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f698862420..d8767b381b9c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1141,7 +1141,6 @@ EXPORT_SYMBOL(account_page_dirtied);
void account_page_writeback(struct page *page)
{
inc_zone_page_state(page, NR_WRITEBACK);
- inc_zone_page_state(page, NR_WRITTEN);
}
EXPORT_SYMBOL(account_page_writeback);
@@ -1358,8 +1357,10 @@ int test_clear_page_writeback(struct page *page)
} else {
ret = TestClearPageWriteback(page);
}
- if (ret)
+ if (ret) {
dec_zone_page_state(page, NR_WRITEBACK);
+ inc_zone_page_state(page, NR_WRITTEN);
+ }
return ret;
}
@@ -1405,10 +1406,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
- int ret;
- rcu_read_lock();
- ret = radix_tree_tagged(&mapping->page_tree, tag);
- rcu_read_unlock();
- return ret;
+ return radix_tree_tagged(&mapping->page_tree, tag);
}
EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9119faae6e6a..094472377d81 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1616,6 +1616,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
set_bit(i, zlc->fullzones);
}
+/*
+ * clear all zones full, called after direct reclaim makes progress so that
+ * a zone that was recently full is not skipped over for up to a second
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
#else /* CONFIG_NUMA */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1647,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
}
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+}
#endif /* CONFIG_NUMA */
/*
@@ -1664,7 +1683,7 @@ zonelist_scan:
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
- goto try_next_zone;
+ continue;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1695,36 @@ zonelist_scan:
classzone_idx, alloc_flags))
goto try_this_zone;
+ if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+ /*
+ * we do zlc_setup if there are multiple nodes
+ * and before considering the first zone allowed
+ * by the cpuset.
+ */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
+ }
+
if (zone_reclaim_mode == 0)
goto this_zone_full;
+ /*
+ * As we may have just activated ZLC, check if the first
+ * eligible zone has failed zone_reclaim recently.
+ */
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
+
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
- goto try_next_zone;
+ continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
- goto this_zone_full;
+ continue;
default:
/* did we reclaim enough */
if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1741,6 @@ try_this_zone:
this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
-try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
- /*
- * we do zlc_setup after the first zone is tried but only
- * if there are multiple nodes make it worthwhile
- */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
}
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1954,6 +1982,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
if (unlikely(!(*did_some_progress)))
return NULL;
+ /* After successful reclaim, reconsider all zones for allocation */
+ if (NUMA_BUILD)
+ zlc_clear_zones_full(zonelist);
+
retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 53bffc6c293e..39d216d535ea 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -225,8 +225,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
unsigned long start, end, pfn;
int fail = 0;
- start = start_pfn & ~(PAGES_PER_SECTION - 1);
- end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
if (nid == -1) {
/*
@@ -258,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn,
{
unsigned long start, end, pfn;
- start = start_pfn & ~(PAGES_PER_SECTION - 1);
- end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+ start = SECTION_ALIGN_DOWN(start_pfn);
+ end = SECTION_ALIGN_UP(start_pfn + nr_pages);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_cgroup(pfn);
@@ -537,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
nomem:
printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
printk(KERN_INFO
- "swap_cgroup can be disabled by noswapaccount boot option\n");
+ "swap_cgroup can be disabled by swapaccount=0 boot option\n");
return -ENOMEM;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d533611..2f5cf10ff660 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
return 0;
}
-#endif
+
+static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma;
+
+ /* We don't need vma lookup at all. */
+ if (!walk->hugetlb_entry)
+ return NULL;
+
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+ vma = find_vma(walk->mm, addr);
+ if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
+ return vma;
+
+ return NULL;
+}
+
+#else /* CONFIG_HUGETLB_PAGE */
+static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
+{
+ return NULL;