summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-06-24 20:47:21 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2015-06-24 20:47:21 -0700
commitaefbef10e3ae6e2c6e3c54f906f10b34c73a2c66 (patch)
treeef967a568ff5e7bb52d1d3d0c61e701ad4f31c21 /mm
parent266da6f14232638b9caafb7facf2a7333895dd05 (diff)
parent8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge first patchbomb from Andrew Morton: - a few misc things - ocfs2 udpates - kernel/watchdog.c feature work (took ages to get right) - most of MM. A few tricky bits are held up and probably won't make 4.2. * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (91 commits) mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc() mm, thp: respect MPOL_PREFERRED policy with non-local node tmpfs: truncate prealloc blocks past i_size mm/memory hotplug: print the last vmemmap region at the end of hot add memory mm/mmap.c: optimization of do_mmap_pgoff function mm: kmemleak: optimise kmemleak_lock acquiring during kmemleak_scan mm: kmemleak: avoid deadlock on the kmemleak object insertion error path mm: kmemleak: do not acquire scan_mutex in kmemleak_do_cleanup() mm: kmemleak: fix delete_object_*() race when called on the same memory block mm: kmemleak: allow safe memory scanning during kmemleak disabling memcg: convert mem_cgroup->under_oom from atomic_t to int memcg: remove unused mem_cgroup->oom_wakeups frontswap: allow multiple backends x86, mirror: x86 enabling - find mirrored memory ranges mm/memblock: allocate boot time data structures from mirrored memory mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute mm: do not ignore mapping_gfp_mask in page cache allocation paths mm/cma.c: fix typos in comments mm/oom_kill.c: print points as unsigned int mm/hugetlb: handle races in alloc_huge_page and hugetlb_reserve_pages ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/cma.c10
-rw-r--r--mm/filemap.c23
-rw-r--r--mm/frontswap.c215
-rw-r--r--mm/huge_memory.c22
-rw-r--r--mm/hugetlb.c187
-rw-r--r--mm/hwpoison-inject.c4
-rw-r--r--mm/kmemleak.c168
-rw-r--r--mm/memblock.c123
-rw-r--r--mm/memcontrol.c59
-rw-r--r--mm/memory-failure.c351
-rw-r--r--mm/memory.c10
-rw-r--r--mm/memory_hotplug.c1
-rw-r--r--mm/mempolicy.c38
-rw-r--r--mm/memtest.c3
-rw-r--r--mm/migrate.c11
-rw-r--r--mm/mmap.c6
-rw-r--r--mm/mprotect.c11
-rw-r--r--mm/mremap.c17
-rw-r--r--mm/nobootmem.c14
-rw-r--r--mm/nommu.c112
-rw-r--r--mm/oom_kill.c158
-rw-r--r--mm/page_alloc.c177
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/pgtable-generic.c29
-rw-r--r--mm/rmap.c9
-rw-r--r--mm/shmem.c2
-rw-r--r--mm/slab.c1
-rw-r--r--mm/slab.h1
-rw-r--r--mm/slab_common.c98
-rw-r--r--mm/slub.c1
-rw-r--r--mm/swap.c1
-rw-r--r--mm/vmscan.c15
33 files changed, 1009 insertions, 871 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 390214da4546..c180af880ed5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -368,6 +368,7 @@ config MEMORY_FAILURE
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
select MEMORY_ISOLATION
+ select RAS
help
Enables code to recover from some memory failures on systems
with MCA recovery. This allows a system to continue running
diff --git a/mm/cma.c b/mm/cma.c
index 3a7a67b93394..e7d1db533025 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -182,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
- /* ensure minimal alignment requied by mm core */
+ /* ensure minimal alignment required by mm core */
alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
/* alignment should be aligned with order_per_bit */
@@ -238,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base,
/*
* high_memory isn't direct mapped memory so retrieving its physical
* address isn't appropriate. But it would be useful to check the
- * physical address of the highmem boundary so it's justfiable to get
+ * physical address of the highmem boundary so it's justifiable to get
* the physical address from it. On x86 there is a validation check for
* this case, so the following workaround is needed to avoid it.
*/
@@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base,
*/
if (base < highmem_start && limit > highmem_start) {
addr = memblock_alloc_range(size, alignment,
- highmem_start, limit);
+ highmem_start, limit,
+ MEMBLOCK_NONE);
limit = highmem_start;
}
if (!addr) {
addr = memblock_alloc_range(size, alignment, base,
- limit);
+ limit,
+ MEMBLOCK_NONE);
if (!addr) {
ret = -ENOMEM;
goto err;
diff --git a/mm/filemap.c b/mm/filemap.c
index 6bf5e42d560a..8d17ceea8dbe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -196,7 +196,9 @@ void __delete_from_page_cache(struct page *page, void *shadow)
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
- __dec_zone_page_state(page, NR_FILE_PAGES);
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!PageHuge(page))
+ __dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
BUG_ON(page_mapped(page));
@@ -483,7 +485,12 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
- __inc_zone_page_state(new, NR_FILE_PAGES);
+
+ /*
+ * hugetlb pages do not participate in page cache accounting.
+ */
+ if (!PageHuge(new))
+ __inc_zone_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
@@ -575,7 +582,10 @@ static int __add_to_page_cache_locked(struct page *page,
radix_tree_preload_end();
if (unlikely(error))
goto err_insert;
- __inc_zone_page_state(page, NR_FILE_PAGES);
+
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!huge)
+ __inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
mem_cgroup_commit_charge(page, memcg, false);
@@ -1654,8 +1664,8 @@ no_cached_page:
error = -ENOMEM;
goto out;
}
- error = add_to_page_cache_lru(page, mapping,
- index, GFP_KERNEL);
+ error = add_to_page_cache_lru(page, mapping, index,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (error) {
page_cache_release(page);
if (error == -EEXIST) {
@@ -1756,7 +1766,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
if (!page)
return -ENOMEM;
- ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ ret = add_to_page_cache_lru(page, mapping, offset,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 8d82809eb085..27a9924caf61 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -21,11 +21,16 @@
#include <linux/swapfile.h>
/*
- * frontswap_ops is set by frontswap_register_ops to contain the pointers
- * to the frontswap "backend" implementation functions.
+ * frontswap_ops are added by frontswap_register_ops, and provide the
+ * frontswap "backend" implementation functions. Multiple implementations
+ * may be registered, but implementations can never deregister. This
+ * is a simple singly-linked list of all registered implementations.
*/
static struct frontswap_ops *frontswap_ops __read_mostly;
+#define for_each_frontswap_ops(ops) \
+ for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
+
/*
* If enabled, frontswap_store will return failure even on success. As
* a result, the swap subsystem will always write the page to swap, in
@@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { }
* on all frontswap functions to not call the backend until the backend
* has registered.
*
- * Specifically when no backend is registered (nobody called
- * frontswap_register_ops) all calls to frontswap_init (which is done via
- * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
- * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
- * backend registers with frontswap at some later point the previous
- * calls to frontswap_init are executed (by iterating over the need_init
- * bitmap) to create tmem_pools and set the respective poolids. All of that is
- * guarded by us using atomic bit operations on the 'need_init' bitmap.
- *
* This would not guards us against the user deciding to call swapoff right as
* we are calling the backend to initialize (so swapon is in action).
* Fortunatly for us, the swapon_mutex has been taked by the callee so we are
@@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { }
*
* Obviously the opposite (unloading the backend) must be done after all
* the frontswap_[store|load|invalidate_area|invalidate_page] start
- * ignorning or failing the requests - at which point frontswap_ops
- * would have to be made in some fashion atomic.
+ * ignoring or failing the requests. However, there is currently no way
+ * to unload a backend once it is registered.
*/
-static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
/*
- * Register operations for frontswap, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for frontswap
*/
-struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
+void frontswap_register_ops(struct frontswap_ops *ops)
{
- struct frontswap_ops *old = frontswap_ops;
- int i;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- if (test_and_clear_bit(i, need_init)) {
- struct swap_info_struct *sis = swap_info[i];
- /* __frontswap_init _should_ have set it! */
- if (!sis->frontswap_map)
- return ERR_PTR(-EINVAL);
- ops->init(i);
- }
+ DECLARE_BITMAP(a, MAX_SWAPFILES);
+ DECLARE_BITMAP(b, MAX_SWAPFILES);
+ struct swap_info_struct *si;
+ unsigned int i;
+
+ bitmap_zero(a, MAX_SWAPFILES);
+ bitmap_zero(b, MAX_SWAPFILES);
+
+ spin_lock(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ if (!WARN_ON(!si->frontswap_map))
+ set_bit(si->type, a);
}
+ spin_unlock(&swap_lock);
+
+ /* the new ops needs to know the currently active swap devices */
+ for_each_set_bit(i, a, MAX_SWAPFILES)
+ ops->init(i);
+
/*
- * We MUST have frontswap_ops set _after_ the frontswap_init's
- * have been called. Otherwise __frontswap_store might fail. Hence
- * the barrier to make sure compiler does not re-order us.
+ * Setting frontswap_ops must happen after the ops->init() calls
+ * above; cmpxchg implies smp_mb() which will ensure the init is
+ * complete at this point.
*/
- barrier();
- frontswap_ops = ops;
- return old;
+ do {
+ ops->next = frontswap_ops;
+ } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
+
+ spin_lock(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ if (si->frontswap_map)
+ set_bit(si->type, b);
+ }
+ spin_unlock(&swap_lock);
+
+ /*
+ * On the very unlikely chance that a swap device was added or
+ * removed between setting the "a" list bits and the ops init
+ * calls, we re-check and do init or invalidate for any changed
+ * bits.
+ */
+ if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ if (!test_bit(i, a) && test_bit(i, b))
+ ops->init(i);
+ else if (test_bit(i, a) && !test_bit(i, b))
+ ops->invalidate_area(i);
+ }
+ }
}
EXPORT_SYMBOL(frontswap_register_ops);
@@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
void __frontswap_init(unsigned type, unsigned long *map)
{
struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
BUG_ON(sis == NULL);
@@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map)
* p->frontswap set to something valid to work properly.
*/
frontswap_map_set(sis, map);
- if (frontswap_ops)
- frontswap_ops->init(type);
- else {
- BUG_ON(type >= MAX_SWAPFILES);
- set_bit(type, need_init);
- }
+
+ for_each_frontswap_ops(ops)
+ ops->init(type);
}
EXPORT_SYMBOL(__frontswap_init);
bool __frontswap_test(struct swap_info_struct *sis,
pgoff_t offset)
{
- bool ret = false;
-
- if (frontswap_ops && sis->frontswap_map)
- ret = test_bit(offset, sis->frontswap_map);
- return ret;
+ if (sis->frontswap_map)
+ return test_bit(offset, sis->frontswap_map);
+ return false;
}
EXPORT_SYMBOL(__frontswap_test);
+static inline void __frontswap_set(struct swap_info_struct *sis,
+ pgoff_t offset)
+{
+ set_bit(offset, sis->frontswap_map);
+ atomic_inc(&sis->frontswap_pages);
+}
+
static inline void __frontswap_clear(struct swap_info_struct *sis,
- pgoff_t offset)
+ pgoff_t offset)
{
clear_bit(offset, sis->frontswap_map);
atomic_dec(&sis->frontswap_pages);
@@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis,
*/
int __frontswap_store(struct page *page)
{
- int ret = -1, dup = 0;
+ int ret = -1;
swp_entry_t entry = { .val = page_private(page), };
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
+ struct frontswap_ops *ops;
/*
* Return if no backend registed.
* Don't need to inc frontswap_failed_stores here.
*/
if (!frontswap_ops)
- return ret;
+ return -1;
BUG_ON(!PageLocked(page));
BUG_ON(sis == NULL);
- if (__frontswap_test(sis, offset))
- dup = 1;
- ret = frontswap_ops->store(type, offset, page);
+
+ /*
+ * If a dup, we must remove the old page first; we can't leave the
+ * old page no matter if the store of the new page succeeds or fails,
+ * and we can't rely on the new page replacing the old page as we may
+ * not store to the same implementation that contains the old page.
+ */
+ if (__frontswap_test(sis, offset)) {
+ __frontswap_clear(sis, offset);
+ for_each_frontswap_ops(ops)
+ ops->invalidate_page(type, offset);
+ }
+
+ /* Try to store in each implementation, until one succeeds. */
+ for_each_frontswap_ops(ops) {
+ ret = ops->store(type, offset, page);
+ if (!ret) /* successful store */
+ break;
+ }
if (ret == 0) {
- set_bit(offset, sis->frontswap_map);
+ __frontswap_set(sis, offset);
inc_frontswap_succ_stores();
- if (!dup)
- atomic_inc(&sis->frontswap_pages);
} else {
- /*
- failed dup always results in automatic invalidate of
- the (older) page from frontswap
- */
inc_frontswap_failed_stores();
- if (dup) {
- __frontswap_clear(sis, offset);
- frontswap_ops->invalidate_page(type, offset);
- }
}
if (frontswap_writethrough_enabled)
/* report failure so swap also writes to swap device */
@@ -268,14 +301,22 @@ int __frontswap_load(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
+ struct frontswap_ops *ops;
+
+ if (!frontswap_ops)
+ return -1;
BUG_ON(!PageLocked(page));
BUG_ON(sis == NULL);
- /*
- * __frontswap_test() will check whether there is backend registered
- */
- if (__frontswap_test(sis, offset))
- ret = frontswap_ops->load(type, offset, page);
+ if (!__frontswap_test(sis, offset))
+ return -1;
+
+ /* Try loading from each implementation, until one succeeds. */
+ for_each_frontswap_ops(ops) {
+ ret = ops->load(type, offset, page);
+ if (!ret) /* successful load */
+ break;
+ }
if (ret == 0) {
inc_frontswap_loads();
if (frontswap_tmem_exclusive_gets_enabled) {
@@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load);
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
+
+ if (!frontswap_ops)
+ return;
BUG_ON(sis == NULL);
- /*
- * __frontswap_test() will check whether there is backend registered
- */
- if (__frontswap_test(sis, offset)) {
- frontswap_ops->invalidate_page(type, offset);
- __frontswap_clear(sis, offset);
- inc_frontswap_invalidates();
- }
+ if (!__frontswap_test(sis, offset))
+ return;
+
+ for_each_frontswap_ops(ops)
+ ops->invalidate_page(type, offset);
+ __frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
}
EXPORT_SYMBOL(__frontswap_invalidate_page);
@@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
void __frontswap_invalidate_area(unsigned type)
{
struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
- if (frontswap_ops) {
- BUG_ON(sis == NULL);
- if (sis->frontswap_map == NULL)
- return;
- frontswap_ops->invalidate_area(type);
- atomic_set(&sis->frontswap_pages, 0);
- bitmap_zero(sis->frontswap_map, sis->max);
- }
- clear_bit(type, need_init);
+ if (!frontswap_ops)
+ return;
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+
+ for_each_frontswap_ops(ops)
+ ops->invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ bitmap_zero(sis->frontswap_map, sis->max);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 078832cf3636..c107094f79ba 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1174,7 +1174,7 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1396,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
/*
* For architectures like ppc64 we look at deposited pgtable
- * when calling pmdp_get_and_clear. So do the
+ * when calling pmdp_huge_get_and_clear. So do the
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
@@ -1459,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
- pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1505,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
if (!prot_numa || !pmd_protnone(*pmd)) {
- entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+ entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mkwrite(entry);
@@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
- _pmd = pmdp_clear_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -2799,7 +2799,7 @@ static void khugepaged_do_scan(void)
cond_resched();
- if (unlikely(kthread_should_stop() || freezing(current)))
+ if (unlikely(kthread_should_stop() || try_to_freeze()))
break;
spin_lock(&khugepaged_mm_lock);
@@ -2820,8 +2820,6 @@ static void khugepaged_do_scan(void)
static void khugepaged_wait_work(void)
{
- try_to_freeze();
-
if (khugepaged_has_work()) {
if (!khugepaged_scan_sleep_millisecs)
return;
@@ -2865,7 +2863,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 271e4432734c..75c0eef52c5d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,11 @@ int hugepages_treat_as_movable;
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+/*
+ * Minimum page order among possible hugepage sizes, set to a proper value
+ * at boot time.
+ */
+static unsigned int minimum_order __read_mostly = UINT_MAX;
__initdata LIST_HEAD(huge_boot_pages);
@@ -212,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
*
- * The region data structures are embedded into a resv_map and
- * protected by a resv_map's lock
+ * The region data structures are embedded into a resv_map and protected
+ * by a resv_map's lock. The set of regions within the resv_map represent
+ * reservations for huge pages, or huge pages that have already been
+ * instantiated within the map. The from and to elements are huge page
+ * indicies into the associated mapping. from indicates the starting index
+ * of the region. to represents the first index past the end of the region.
+ *
+ * For example, a file region structure with from == 0 and to == 4 represents
+ * four huge pages in a mapping. It is important to note that the to element
+ * represents the first element past the end of the region. This is used in
+ * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
+ *
+ * Interval notation of the form [from, to) will be used to indicate that
+ * the endpoint from is inclusive and to is exclusive.
*/
struct file_region {
struct list_head link;
@@ -221,10 +238,22 @@ struct file_region {
long to;
};
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map. Existing regions will be expanded to accommodate the
+ * specified range. We know only existing regions need to be
+ * expanded, because region_add is only called after region_chg
+ * with the same range. If a new file_region structure must
+ * be allocated, it is done in region_chg.
+ *
+ * Return the number of new huge pages added to the map. This
+ * number is greater than or equal to zero.
+ */
static long region_add(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg, *nrg, *trg;
+ long add = 0;
spin_lock(&resv->lock);
/* Locate the region we are either in or before. */
@@ -250,16 +279,45 @@ static long region_add(struct resv_map *resv, long f, long t)
if (rg->to > t)
t = rg->to;
if (rg != nrg) {
+ /* Decrement return value by the deleted range.
+ * Another range will span this area so that by
+ * end of routine add will be >= zero
+ */
+ add -= (rg->to - rg->from);
list_del(&rg->link);
kfree(rg);
}
}
+
+ add += (nrg->from - f); /* Added to beginning of region */
nrg->from = f;
+ add += t - nrg->to; /* Added to end of region */
nrg->to = t;
+
spin_unlock(&resv->lock);
- return 0;
+ VM_BUG_ON(add < 0);
+ return add;
}
+/*
+ * Examine the existing reserve map and determine how many
+ * huge pages in the specified range [f, t) are NOT currently
+ * represented. This routine is called before a subsequent
+ * call to region_add that will actually modify the reserve
+ * map to add the specified range [f, t). region_chg does
+ * not change the number of huge pages represented by the
+ * map. However, if the existing regions in the map can not
+ * be expanded to represent the new range, a new file_region
+ * structure is added to the map as a placeholder. This is
+ * so that the subsequent region_add call will have all the
+ * regions it needs and will not fail.
+ *
+ * Returns the number of huge pages that need to be added
+ * to the existing reservation map for the range [f, t).
+ * This number is greater or equal to zero. -ENOMEM is
+ * returned if a new file_region structure is needed and can
+ * not be allocated.
+ */
static long region_chg(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
@@ -326,6 +384,11 @@ out_nrg:
return chg;
}
+/*
+ * Truncate the reserve map at index 'end'. Modify/truncate any
+ * region which contains end. Delete any regions past end.
+ * Return the number of huge pages removed from the map.
+ */
static long region_truncate(struct resv_map *resv, long end)
{
struct list_head *head = &resv->regions;
@@ -361,6 +424,10 @@ out:
return chg;
}
+/*
+ * Count and return the number of huge pages in the reserve map
+ * that intersect with the range [f, t).
+ */
static long region_count(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
@@ -1188,19 +1255,13 @@ static void dissolve_free_huge_page(struct page *page)
*/
void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned int order = 8 * sizeof(void *);
unsigned long pfn;
- struct hstate *h;
if (!hugepages_supported())
return;
- /* Set scan step to minimum hugepage size */
- for_each_hstate(h)
- if (order > huge_page_order(h))
- order = huge_page_order(h);
- VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
+ VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
dissolve_free_huge_page(pfn_to_page(pfn));
}
@@ -1423,46 +1484,56 @@ static void return_unused_surplus_pages(struct hstate *h,
}
/*
- * Determine if the huge page at addr within the vma has an associated
- * reservation. Where it does not we will need to logically increase
- * reservation and actually increase subpool usage before an allocation
- * can occur. Where any new reservation would be required the
- * reservation change is prepared, but not committed. Once the page
- * has been allocated from the subpool and instantiated the change should
- * be committed via vma_commit_reservation. No action is required on
- * failure.
+ * vma_needs_reservation and vma_commit_reservation are used by the huge
+ * page allocation routines to manage reservations.
+ *
+ * vma_needs_reservation is called to determine if the huge page at addr
+ * within the vma has an associated reservation. If a reservation is
+ * needed, the value 1 is returned. The caller is then responsible for
+ * managing the global reservation and subpool usage counts. After
+ * the huge page has been allocated, vma_commit_reservation is called
+ * to add the page to the reservation map.
+ *
+ * In the normal case, vma_commit_reservation returns the same value
+ * as the preceding vma_needs_reservation call. The only time this
+ * is not the case is if a reserve map was changed between calls. It
+ * is the responsibility of the caller to notice the difference and
+ * take appropriate action.
*/
-static long vma_needs_reservation(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr)
+static long __vma_reservation_common(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool commit)
{
struct resv_map *resv;
pgoff_t idx;
- long chg;
+ long ret;
resv = vma_resv_map(vma);
if (!resv)
return 1;
idx = vma_hugecache_offset(h, vma, addr);
- chg = region_chg(resv, idx, idx + 1);
+ if (commit)
+ ret = region_add(resv, idx, idx + 1);
+ else
+ ret = region_chg(resv, idx, idx + 1);
if (vma->vm_flags & VM_MAYSHARE)
- return chg;
+ return ret;
else
- return chg < 0 ? chg : 0;
+ return ret < 0 ? ret : 0;
}
-static void vma_commit_reservation(struct hstate *h,
+
+static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct resv_map *resv;
- pgoff_t idx;
-
- resv = vma_resv_map(vma);
- if (!resv)
- return;
+ return __vma_reservation_common(h, vma, addr, false);
+}
- idx = vma_hugecache_offset(h, vma, addr);
- region_add(resv, idx, idx + 1);
+static long vma_commit_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, true);
}
static struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -1471,7 +1542,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- long chg;
+ long chg, commit;
int ret, idx;
struct hugetlb_cgroup *h_cg;
@@ -1512,7 +1583,22 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
set_page_private(page, (unsigned long)spool);
- vma_commit_reservation(h, vma, addr);
+ commit = vma_commit_reservation(h, vma, addr);
+ if (unlikely(chg > commit)) {
+ /*
+ * The page was added to the reservation map between
+ * vma_needs_reservation and vma_commit_reservation.
+ * This indicates a race with hugetlb_reserve_pages.
+ * Ad