summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2011-02-14 11:55:18 +0100
committer Ingo Molnar <mingo@elte.hu> 2011-02-14 11:55:18 +0100
commit d2137d5af4259f50c19addb8246a186c9ffac325 (patch)
tree 2f7e309f9cf8ef2f2698532c226edda38021fe69 /mm
parent f005fe12b90c5b9fe180a09209a893e09affa8aa (diff)
parent 795abaf1e4e188c4171e3cd3dbb11a9fcacaf505 (diff)
Merge branch 'linus' into x86/bootmem
Conflicts: arch/x86/mm/numa_64.c Merge reason: fix the conflict, update to latest -rc and pick up this dependent fix from Yinghai: e6d2e2b2b1e1: memblock: don't adjust size in memblock_find_base() Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 40
-rw-r--r-- mm/Makefile 3
-rw-r--r-- mm/compaction.c 186
-rw-r--r-- mm/dmapool.c 16
-rw-r--r-- mm/filemap.c 25
-rw-r--r-- mm/huge_memory.c 2354
-rw-r--r-- mm/hugetlb.c 114
-rw-r--r-- mm/internal.h 7
-rw-r--r-- mm/kmemleak-test.c 6
-rw-r--r-- mm/kmemleak.c 13
-rw-r--r-- mm/ksm.c 88
-rw-r--r-- mm/madvise.c 10
-rw-r--r-- mm/memblock.c 10
-rw-r--r-- mm/memcontrol.c 431
-rw-r--r-- mm/memory-failure.c 118
-rw-r--r-- mm/memory.c 360
-rw-r--r-- mm/memory_hotplug.c 52
-rw-r--r-- mm/mempolicy.c 26
-rw-r--r-- mm/migrate.c 134
-rw-r--r-- mm/mincore.c 7
-rw-r--r-- mm/mlock.c 170
-rw-r--r-- mm/mmap.c 33
-rw-r--r-- mm/mmu_notifier.c 20
-rw-r--r-- mm/mmzone.c 21
-rw-r--r-- mm/mprotect.c 20
-rw-r--r-- mm/mremap.c 9
-rw-r--r-- mm/nommu.c 35
-rw-r--r-- mm/page-writeback.c 11
-rw-r--r-- mm/page_alloc.c 216
-rw-r--r-- mm/pagewalk.c 6
-rw-r--r-- mm/percpu-vm.c 2
-rw-r--r-- mm/percpu.c 12
-rw-r--r-- mm/pgtable-generic.c 121
-rw-r--r-- mm/rmap.c 93
-rw-r--r-- mm/shmem.c 9
-rw-r--r-- mm/slab.c 82
-rw-r--r-- mm/slob.c 5
-rw-r--r-- mm/slub.c 89
-rw-r--r-- mm/sparse-vmemmap.c 2
-rw-r--r-- mm/sparse.c 4
-rw-r--r-- mm/swap.c 131
-rw-r--r-- mm/swap_state.c 6
-rw-r--r-- mm/swapfile.c 9
-rw-r--r-- mm/truncate.c 15
-rw-r--r-- mm/util.c 21
-rw-r--r-- mm/vmalloc.c 118
-rw-r--r-- mm/vmscan.c 435
-rw-r--r-- mm/vmstat.c 206
48 files changed, 4789 insertions, 1112 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
config COMPACTION
bool "Allow for memory compaction"
select MIGRATION
- depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+ depends on MMU
help
Allows the compaction of memory for the allocation of huge pages.
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS
See Documentation/nommu-mmap.txt for more information.
+config TRANSPARENT_HUGEPAGE
+ bool "Transparent Hugepage Support"
+ depends on X86 && MMU
+ select COMPACTION
+ help
+ Transparent Hugepages allows the kernel to use huge pages and
+ huge tlb transparently to the applications whenever possible.
+ This feature can improve computing performance to certain
+ applications by speeding up page faults during memory
+ allocation, by reducing the number of tlb misses and by speeding
+ up the pagetable walking.
+
+ If memory is constrained, as on embedded systems, you may want to say N.
+
+choice
+ prompt "Transparent Hugepage Support sysfs defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_ALWAYS
+ help
+ Selects the sysfs defaults for Transparent Hugepage Support.
+
+ config TRANSPARENT_HUGEPAGE_ALWAYS
+ bool "always"
+ help
+ Enabling Transparent Hugepage always can increase the
+ memory footprint of applications without a guaranteed
+ benefit but it will work automatically for all applications.
+
+ config TRANSPARENT_HUGEPAGE_MADVISE
+ bool "madvise"
+ help
+ Enabling Transparent Hugepage madvise will only provide a
+ performance improvement benefit to the applications using
+ madvise(MADV_HUGEPAGE) but it won't risk increasing the
+ memory footprint of applications without a guaranteed
+ benefit.
+endchoice
+
#
# UP and nommu archs use km based percpu allocator
#
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o
+ vmalloc.o pagewalk.o pgtable-generic.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee59013..8be430b812de 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
#include <linux/sysfs.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
/*
* compact_control is used to track pages being migrated and the free pages
* they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
+ bool sync; /* Synchronous migration */
/* Account for isolated anon and file pages */
unsigned long nr_anon;
@@ -38,6 +42,8 @@ struct compact_control {
unsigned int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
+
+ int compact_mode;
};
static unsigned long release_freepages(struct list_head *freelist)
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
struct list_head *freelist)
{
unsigned long zone_end_pfn, end_pfn;
- int total_isolated = 0;
+ int nr_scanned = 0, total_isolated = 0;
struct page *cursor;
/* Get the last PFN we should scan for free pages at */
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
if (!pfn_valid_within(blockpfn))
continue;
+ nr_scanned++;
if (!PageBuddy(page))
continue;
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
}
}
+ trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
return total_isolated;
}
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
+ unsigned long last_pageblock_nr = 0, pageblock_nr;
+ unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
/* Do not scan outside zone boundaries */
@@ -266,21 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone,
struct page *page;
if (!pfn_valid_within(low_pfn))
continue;
+ nr_scanned++;
/* Get the page and skip if free */
page = pfn_to_page(low_pfn);
if (PageBuddy(page))
continue;
+ /*
+ * For async migration, also only scan in MOVABLE blocks. Async
+ * migration is optimistic to see if the minimum amount of work
+ * satisfies the allocation
+ */
+ pageblock_nr = low_pfn >> pageblock_order;
+ if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+ low_pfn += pageblock_nr_pages;
+ low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+ last_pageblock_nr = pageblock_nr;
+ continue;
+ }
+
+ if (!PageLRU(page))
+ continue;
+
+ /*
+ * PageLRU is set, and lru_lock excludes isolation,
+ * splitting and collapsing (collapsing has already
+ * happened if PageLRU is set).
+ */
+ if (PageTransHuge(page)) {
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
/* Try isolate the page */
if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
continue;
+ VM_BUG_ON(PageTransCompound(page));
+
/* Successfully isolated */
del_page_from_lru_list(zone, page, page_lru(page));
list_add(&page->lru, migratelist);
- mem_cgroup_del_lru(page);
cc->nr_migratepages++;
+ nr_isolated++;
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -292,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
cc->migrate_pfn = low_pfn;
+ trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
return cc->nr_migratepages;
}
@@ -342,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc)
}
static int compact_finished(struct zone *zone,
- struct compact_control *cc)
+ struct compact_control *cc)
{
unsigned int order;
- unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+ unsigned long watermark;
if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
@@ -355,12 +397,31 @@ static int compact_finished(struct zone *zone,
return COMPACT_COMPLETE;
/* Compaction run is not finished if the watermark is not met */
+ if (cc->compact_mode != COMPACT_MODE_KSWAPD)
+ watermark = low_wmark_pages(zone);
+ else
+ watermark = high_wmark_pages(zone);
+ watermark += (1 << cc->order);
+
if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
return COMPACT_CONTINUE;
+ /*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
if (cc->order == -1)
return COMPACT_CONTINUE;
+ /*
+ * Generating only one page of the right order is not enough
+ * for kswapd, we must continue until we're above the high
+ * watermark as a pool for high order GFP_ATOMIC allocations
+ * too.
+ */
+ if (cc->compact_mode == COMPACT_MODE_KSWAPD)
+ return COMPACT_CONTINUE;
+
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
/* Job done if page is free of the right migratetype */
@@ -375,10 +436,69 @@ static int compact_finished(struct zone *zone,
return COMPACT_CONTINUE;
}
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ * COMPACT_SKIPPED - If there are too few free pages for compaction
+ * COMPACT_PARTIAL - If the allocation would succeed without compaction
+ * COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+ int fragindex;
+ unsigned long watermark;
+
+ /*
+ * Watermarks for order-0 must be met for compaction. Note the 2UL.
+ * This is because during migration, copies of pages need to be
+ * allocated and for a short time, the footprint is higher
+ */
+ watermark = low_wmark_pages(zone) + (2UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return COMPACT_SKIPPED;
+
+ /*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+ if (order == -1)
+ return COMPACT_CONTINUE;
+
+ /*
+ * fragmentation index determines if allocation failures are due to
+ * low memory or external fragmentation
+ *
+ * index of -1 implies allocations might succeed depending on watermarks
+ * index towards 0 implies failure is due to lack of memory
+ * index towards 1000 implies failure is due to fragmentation
+ *
+ * Only compact if a failure would be due to fragmentation.
+ */
+ fragindex = fragmentation_index(zone, order);
+ if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+ return COMPACT_SKIPPED;
+
+ if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+ return COMPACT_PARTIAL;
+
+ return COMPACT_CONTINUE;
+}
+
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
+ ret = compaction_suitable(zone, cc->order);
+ switch (ret) {
+ case COMPACT_PARTIAL:
+ case COMPACT_SKIPPED:
+ /* Compaction is likely to fail */
+ return ret;
+ case COMPACT_CONTINUE:
+ /* Fall through to compaction */
+ ;
+ }
+
/* Setup to move all movable pages to the end of the zone */
cc->migrate_pfn = zone->zone_start_pfn;
cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -394,7 +514,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
migrate_pages(&cc->migratepages, compaction_alloc,
- (unsigned long)cc, 0);
+ (unsigned long)cc, false,
+ cc->sync);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
@@ -402,6 +523,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
if (nr_remaining)
count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+ trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+ nr_remaining);
/* Release LRU pages not migrated */
if (!list_empty(&cc->migratepages)) {
@@ -418,8 +541,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone,
- int order, gfp_t gfp_mask)
+unsigned long compact_zone_order(struct zone *zone,
+ int order, gfp_t gfp_mask,
+ bool sync,
+ int compact_mode)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -427,6 +552,8 @@ static unsigned long compact_zone_order(struct zone *zone,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
+ .sync = sync,
+ .compact_mode = compact_mode,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -442,16 +569,17 @@ int sysctl_extfrag_threshold = 500;
* @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
+ * @sync: Whether migration is synchronous or not
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
- int order, gfp_t gfp_mask, nodemask_t *nodemask)
+ int order, gfp_t gfp_mask, nodemask_t *nodemask,
+ bool sync)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
- unsigned long watermark;
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
@@ -461,7 +589,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
* made because an assumption is made that the page allocator can satisfy
* the "cheaper" orders without taking special steps
*/
- if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+ if (!order || !may_enter_fs || !may_perform_io)
return rc;
count_vm_event(COMPACTSTALL);
@@ -469,43 +597,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
- int fragindex;
int status;
- /*
- * Watermarks for order-0 must be met for compaction. Note
- * the 2UL. This is because during migration, copies of
- * pages need to be allocated and for a short time, the
- * footprint is higher
- */
- watermark = low_wmark_pages(zone) + (2UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
- continue;
-
- /*
- * fragmentation index determines if allocation failures are
- * due to low memory or external fragmentation
- *
- * index of -1 implies allocations might succeed depending
- * on watermarks
- * index towards 0 implies failure is due to lack of memory
- * index towards 1000 implies failure is due to fragmentation
- *
- * Only compact if a failure would be due to fragmentation.
- */
- fragindex = fragmentation_index(zone, order);
- if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- continue;
-
- if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
- rc = COMPACT_PARTIAL;
- break;
- }
-
- status = compact_zone_order(zone, order, gfp_mask);
+ status = compact_zone_order(zone, order, gfp_mask, sync,
+ COMPACT_MODE_DIRECT_RECLAIM);
rc = max(status, rc);
- if (zone_watermark_ok(zone, order, watermark, 0, 0))
+ /* If a normal allocation would succeed, stop compacting */
+ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
break;
}
@@ -532,6 +631,7 @@ static int compact_node(int nid)
.nr_freepages = 0,
.nr_migratepages = 0,
.order = -1,
+ .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
};
zone = &pgdat->node_zones[zoneid];
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
if (mem_flags & __GFP_WAIT) {
DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
__add_wait_queue(&pool->waitq, &wait);
spin_unlock_irqrestore(&pool->lock, flags);
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
{
- unsigned long flags;
struct dma_page *page;
- spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry(page, &pool->page_list, page_list) {
if (dma < page->dma)
continue;
if (dma < (page->dma + pool->allocation))
- goto done;
+ return page;
}
- page = NULL;
- done:
- spin_unlock_irqrestore(&pool->lock, flags);
- return page;
+ return NULL;
}
/**
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
unsigned long flags;
unsigned int offset;
+ spin_lock_irqsave(&pool->lock, flags);
page = pool_find_page(pool, dma);
if (!page) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
offset = vaddr - page->vaddr;
#ifdef DMAPOOL_DEBUG
if ((dma - page->dma) != offset) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
chain = *(int *)(page->vaddr + chain);
continue;
}
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
"already free\n", pool->name,
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
memset(vaddr, POOL_POISON_FREED, pool->size);
#endif
- spin_lock_irqsave(&pool->lock, flags);
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
diff --git a/mm/filemap.c b/mm/filemap.c
index ea89840fc65f..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,9 +102,6 @@
* ->inode_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * ->task->proc_lock
- * ->dcache_lock (proc_pid_lookup)
- *
* (code doesn't rely on that order, so you could switch it around)
* ->tasklist_lock (memory_failure, collect_procs_ao)
* ->i_mmap_lock
@@ -143,13 +140,18 @@ void __remove_from_page_cache(struct page *page)
void remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
+ freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
__remove_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage)
+ freepage(page);
}
EXPORT_SYMBOL(remove_from_page_cache);
@@ -296,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
continue;
wait_on_page_writeback(page);
- if (PageError(page))
+ if (TestClearPageError(page))
ret = -EIO;
}
pagevec_release(&pvec);
@@ -835,9 +837,6 @@ repeat:
if (radix_tree_deref_retry(page))
goto restart;
- if (page->mapping == NULL || page->index != index)
- break;
-
if (!page_cache_get_speculative(page))
goto repeat;
@@ -847,6 +846,16 @@ repeat:
goto repeat;
}
+ /*
+ * must check mapping and index after taking the ref.
+ * otherwise we can get both false positives and false
+ * negatives, which is just confusing to the caller.
+ */
+ if (page->mapping == NULL || page->index != index) {
+ page_cache_release(page);
+ break;
+ }
+
pages[ret] = page;
ret++;
index++;
@@ -2218,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index);
- if (likely(page))
+ if (page)
return page;
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..e62ddb8f24b6
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2354 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
+ */
+unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
+ (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+
+/* default scan 8*512 pte (or vmas) every 10 seconds */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int mm_slots_hash_init(void);
+static int khugepaged_slab_init(void);
+static void khugepaged_slab_free(void);
+
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash __read_mostly;
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+ struct list_head mm_head;
+ struct mm_slot *mm_slot;
+ unsigned long address;
+} khugepaged_scan = {
+ .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+
+static int set_recommended_min_free_kbytes(void)
+{
+ struct zone *zone;
+ int nr_zones = 0;
+ unsigned long recommended_min;
+ extern int min_free_kbytes;
+
+ if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags) &&
+ !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags))
+ return 0;
+
+ for_each_populated_zone(zone)
+ nr_zones++;
+
+ /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+ recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+ /*
+ * Make sure that on average at least two pageblocks are almost free
+ * of another type, one for a migratetype to fall back to and a
+ * second to avoid subsequent fallbacks of other types. There are 3
+ * MIGRATE_TYPES we care about.
+ */
+ recommended_min += pageblock_nr_pages * nr_zones *
+ MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+ /* don't ever allow to reserve more than 5% of the lowmem */
+ recommended_min = min(recommended_min,
+ (unsigned long) nr_free_buffer_pages() / 20);
+ recommended_min <<= (PAGE_SHIFT-10);
+
+ if (recommended_min > min_free_kbytes)
+ min_free_kbytes = recommended_min;
+ setup_per_zone_wmarks();