From e92dd4fd1aa1cd081dac03973b33c972637d5b7a Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 26 Mar 2010 19:27:58 -0700
Subject: slab: Fix continuation lines

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slab.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index a9f325b28bed..ceb4e3aa22f7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4227,10 +4227,11 @@ static int s_show(struct seq_file *m, void *p)
 		unsigned long node_frees = cachep->node_frees;
 		unsigned long overflows = cachep->node_overflow;
 
-		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-				%4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
-				reaped, errors, max_freeable, node_allocs,
-				node_frees, overflows);
+		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
+			   "%4lu %4lu %4lu %4lu %4lu",
+			   allocs, high, grown,
+			   reaped, errors, max_freeable, node_allocs,
+			   node_frees, overflows);
 	}
 	/* cpu stats */
 	{
-- 
cgit v1.2.3


From 8f9f8d9e8080a2ff46caa7decef47810d093d252 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Sat, 27 Mar 2010 19:40:47 -0700
Subject: slab: add memory hotplug support

Slab lacks any memory hotplug support for nodes that are hotplugged
without cpus being hotplugged.  This is possible at least on x86
CONFIG_MEMORY_HOTPLUG_SPARSE kernels where SRAT entries are marked
ACPI_SRAT_MEM_HOT_PLUGGABLE and the regions of RAM represent a seperate
node.  It can also be done manually by writing the start address to
/sys/devices/system/memory/probe for kernels that have
CONFIG_ARCH_MEMORY_PROBE set, which is how this patch was tested, and
then onlining the new memory region.

When a node is hotadded, a nodelist for that node is allocated and
initialized for each slab cache.  If this isn't completed due to a lack
of memory, the hotadd is aborted: we have a reasonable expectation that
kmalloc_node(nid) will work for all caches if nid is online and memory is
available.

Since nodelists must be allocated and initialized prior to the new node's
memory actually being online, the struct kmem_list3 is allocated off-node
due to kmalloc_node()'s fallback.

When an entire node would be offlined, its nodelists are subsequently
drained.  If slab objects still exist and cannot be freed, the offline is
aborted.  It is possible that objects will be allocated between this
drain and page isolation, so it's still possible that the offline will
still fail, however.

Acked-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slab.c | 157 +++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 125 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index a9f325b28bed..3230cd2c6b3b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
 #include	<linux/reciprocal_div.h>
 #include	<linux/debugobjects.h>
 #include	<linux/kmemcheck.h>
+#include	<linux/memory.h>
 
 #include	<asm/cacheflush.h>
 #include	<asm/tlbflush.h>
@@ -1102,6 +1103,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 }
 #endif
 
+/*
+ * Allocates and initializes nodelists for a node on each slab cache, used for
+ * either memory or cpu hotplug.  If memory is being hot-added, the kmem_list3
+ * will be allocated off-node since memory is not yet online for the new node.
+ * When hotplugging memory or a cpu, existing nodelists are not replaced if
+ * already in use.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int init_cache_nodelists_node(int node)
+{
+	struct kmem_cache *cachep;
+	struct kmem_list3 *l3;
+	const int memsize = sizeof(struct kmem_list3);
+
+	list_for_each_entry(cachep, &cache_chain, next) {
+		/*
+		 * Set up the size64 kmemlist for cpu before we can
+		 * begin anything. Make sure some other cpu on this
+		 * node has not already allocated this
+		 */
+		if (!cachep->nodelists[node]) {
+			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+			if (!l3)
+				return -ENOMEM;
+			kmem_list3_init(l3);
+			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
+			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+			/*
+			 * The l3s don't come and go as CPUs come and
+			 * go.  cache_chain_mutex is sufficient
+			 * protection here.
+			 */
+			cachep->nodelists[node] = l3;
+		}
+
+		spin_lock_irq(&cachep->nodelists[node]->list_lock);
+		cachep->nodelists[node]->free_limit =
+			(1 + nr_cpus_node(node)) *
+			cachep->batchcount + cachep->num;
+		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
+	}
+	return 0;
+}
+
 static void __cpuinit cpuup_canceled(long cpu)
 {
 	struct kmem_cache *cachep;
@@ -1172,7 +1219,7 @@ static int __cpuinit cpuup_prepare(long cpu)
 	struct kmem_cache *cachep;
 	struct kmem_list3 *l3 = NULL;
 	int node = cpu_to_node(cpu);
-	const int memsize = sizeof(struct kmem_list3);
+	int err;
 
 	/*
 	 * We need to do this right in the beginning since
@@ -1180,35 +1227,9 @@ static int __cpuinit cpuup_prepare(long cpu)
 	 * kmalloc_node allows us to add the slab to the right
 	 * kmem_list3 and not this cpu's kmem_list3
 	 */
-
-	list_for_each_entry(cachep, &cache_chain, next) {
-		/*
-		 * Set up the size64 kmemlist for cpu before we can
-		 * begin anything. Make sure some other cpu on this
-		 * node has not already allocated this
-		 */
-		if (!cachep->nodelists[node]) {
-			l3 = kmalloc_node(memsize, GFP_KERNEL, node);
-			if (!l3)
-				goto bad;
-			kmem_list3_init(l3);
-			l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-			    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
-			/*
-			 * The l3s don't come and go as CPUs come and
-			 * go.  cache_chain_mutex is sufficient
-			 * protection here.
-			 */
-			cachep->nodelists[node] = l3;
-		}
-
-		spin_lock_irq(&cachep->nodelists[node]->list_lock);
-		cachep->nodelists[node]->free_limit =
-			(1 + nr_cpus_node(node)) *
-			cachep->batchcount + cachep->num;
-		spin_unlock_irq(&cachep->nodelists[node]->list_lock);
-	}
+	err = init_cache_nodelists_node(node);
+	if (err < 0)
+		goto bad;
 
 	/*
 	 * Now we can go ahead with allocating the shared arrays and
@@ -1331,11 +1352,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
 	&cpuup_callback, NULL, 0
 };
 
+#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * Drains freelist for a node on each slab cache, used for memory hot-remove.
+ * Returns -EBUSY if all objects cannot be drained so that the node is not
+ * removed.
+ *
+ * Must hold cache_chain_mutex.
+ */
+static int __meminit drain_cache_nodelists_node(int node)
+{
+	struct kmem_cache *cachep;
+	int ret = 0;
+
+	list_for_each_entry(cachep, &cache_chain, next) {
+		struct kmem_list3 *l3;
+
+		l3 = cachep->nodelists[node];
+		if (!l3)
+			continue;
+
+		drain_freelist(cachep, l3, l3->free_objects);
+
+		if (!list_empty(&l3->slabs_full) ||
+		    !list_empty(&l3->slabs_partial)) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+	return ret;
+}
+
+static int __meminit slab_memory_callback(struct notifier_block *self,
+					unsigned long action, void *arg)
+{
+	struct memory_notify *mnb = arg;
+	int ret = 0;
+	int nid;
+
+	nid = mnb->status_change_nid;
+	if (nid < 0)
+		goto out;
+
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		mutex_lock(&cache_chain_mutex);
+		ret = init_cache_nodelists_node(nid);
+		mutex_unlock(&cache_chain_mutex);
+		break;
+	case MEM_GOING_OFFLINE:
+		mutex_lock(&cache_chain_mutex);
+		ret = drain_cache_nodelists_node(nid);
+		mutex_unlock(&cache_chain_mutex);
+		break;
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+	case MEM_CANCEL_ONLINE:
+	case MEM_CANCEL_OFFLINE:
+		break;
+	}
+out:
+	return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+}
+#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+
 /*
  * swap the static kmem_list3 with kmalloced memory
  */
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
-			int nodeid)
+static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+				int nodeid)
 {
 	struct kmem_list3 *ptr;
 
@@ -1580,6 +1665,14 @@ void __init kmem_cache_init_late(void)
 	 */
 	register_cpu_notifier(&cpucache_notifier);
 
+#ifdef CONFIG_NUMA
+	/*
+	 * Register a memory hotplug callback that initializes and frees
+	 * nodelists.
+	 */
+	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
+#endif
+
 	/*
 	 * The reap timers are started later, with a module init call: That part
 	 * of the kernel is not yet operational.
-- 
cgit v1.2.3


From 5c5e3b33b7cb959a401f823707bee006caadd76e Mon Sep 17 00:00:00 2001
From: Shiyong Li <shi-yong.li@motorola.com>
Date: Mon, 12 Apr 2010 13:48:21 +0800
Subject: slab: Fix missing DEBUG_SLAB last user

Even with SLAB_RED_ZONE and SLAB_STORE_USER enabled, kernel would NOT store
redzone and last user data around allocated memory space if "arch cache line >
sizeof(unsigned long long)". As a result, last user information is unexpectedly
MISSED while dumping slab corruption log.

This fix makes sure that redzone and last user tags get stored unless the
required alignment breaks redzone's.

Signed-off-by: Shiyong Li <shi-yong.li@motorola.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slab.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index bac0f4fcc216..525c66466469 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2220,8 +2220,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (ralign < align) {
 		ralign = align;
 	}
-	/* disable debug if necessary */
-	if (ralign > __alignof__(unsigned long long))
+	/* disable debug if not aligning with REDZONE_ALIGN */
+	if (ralign & (__alignof__(unsigned long long) - 1))
 		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	/*
 	 * 4) Store it.
@@ -2247,8 +2247,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 */
 	if (flags & SLAB_RED_ZONE) {
 		/* add space for red zone words */
-		cachep->obj_offset += sizeof(unsigned long long);
-		size += 2 * sizeof(unsigned long long);
+		cachep->obj_offset += align;
+		size += align + sizeof(unsigned long long);
 	}
 	if (flags & SLAB_STORE_USER) {
 		/* user store requires one word storage behind the end of
-- 
cgit v1.2.3


From 1f0ce8b3dd667dca720a47869f8110c298f0e5b8 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 19 May 2010 12:01:42 +0100
Subject: mm: Move ARCH_SLAB_MINALIGN and ARCH_KMALLOC_MINALIGN to
 <linux/slab_def.h>

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slab.c | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index bac0f4fcc216..7401ddc24306 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -144,30 +144,6 @@
 #define	BYTES_PER_WORD		sizeof(void *)
 #define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))
 
-#ifndef ARCH_KMALLOC_MINALIGN
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than the alignment of a 64-bit integer.
- * ARCH_KMALLOC_MINALIGN allows that.
- * Note that increasing this value may disable some debug features.
- */
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
-
 #ifndef ARCH_KMALLOC_FLAGS
 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 #endif
-- 
cgit v1.2.3


From bac49ce42a33f53beb7cf04e9a0600879d6265ca Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 19 May 2010 12:01:43 +0100
Subject: mm: Move ARCH_SLAB_MINALIGN and ARCH_KMALLOC_MINALIGN to
 <linux/slob_def.h>

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slob.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'mm')

diff --git a/mm/slob.c b/mm/slob.c
index 837ebd64cc34..23631e2bb57a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -467,14 +467,6 @@ out:
  * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
  */
 
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
-#endif
-
 void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 {
 	unsigned int *m;
-- 
cgit v1.2.3


From 4581ced379736fd76432c754f999d26deb83fbb7 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 19 May 2010 12:02:14 +0100
Subject: mm: Move ARCH_SLAB_MINALIGN and ARCH_KMALLOC_MINALIGN to
 <linux/slub_def.h>

Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slub.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index d2a54fe71ea2..c874c3efac29 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -157,14 +157,6 @@
 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 		SLAB_CACHE_DMA | SLAB_NOTRACK)
 
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
-#endif
-
 #define OO_SHIFT	16
 #define OO_MASK		((1 << OO_SHIFT) - 1)
 #define MAX_OBJS_PER_PAGE	65535 /* since page.objects is u16 */
-- 
cgit v1.2.3


From bbd7d57bfe852d9788bae5fb171c7edb4021d8ac Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 24 Mar 2010 22:25:47 +0100
Subject: slub: Potential stack overflow

I discovered that we can overflow stack if CONFIG_SLUB_DEBUG=y and use slabs
with many objects, since list_slab_objects() and process_slab() use
DECLARE_BITMAP(map, page->objects).

With 65535 bits, we use 8192 bytes of stack ...

Switch these allocations to dynamic allocations.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slub.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index d2a54fe71ea2..78f1a202ca33 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2429,9 +2429,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 #ifdef CONFIG_SLUB_DEBUG
 	void *addr = page_address(page);
 	void *p;
-	DECLARE_BITMAP(map, page->objects);
+	long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long),
+			    GFP_ATOMIC);
 
-	bitmap_zero(map, page->objects);
+	if (!map)
+		return;
 	slab_err(s, page, "%s", text);
 	slab_lock(page);
 	for_each_free_object(p, s, page->freelist)
@@ -2446,6 +2448,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 		}
 	}
 	slab_unlock(page);
+	kfree(map);
 #endif
 }
 
@@ -3651,10 +3654,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
 }
 
 static void process_slab(struct loc_track *t, struct kmem_cache *s,
-		struct page *page, enum track_item alloc)
+		struct page *page, enum track_item alloc,
+		long *map)
 {
 	void *addr = page_address(page);
-	DECLARE_BITMAP(map, page->objects);
 	void *p;
 
 	bitmap_zero(map, page->objects);
@@ -3673,11 +3676,14 @@ static int list_locations(struct kmem_cache *s, char *buf,
 	unsigned long i;
 	struct loc_track t = { 0, 0, NULL };
 	int node;
+	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
+				     sizeof(unsigned long), GFP_KERNEL);
 
-	if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
-			GFP_TEMPORARY))
+	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+				     GFP_TEMPORARY)) {
+		kfree(map);
 		return sprintf(buf, "Out of memory\n");
-
+	}
 	/* Push back cpu slabs */
 	flush_all(s);
 
@@ -3691,9 +3697,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
 
 		spin_lock_irqsave(&n->list_lock, flags);
 		list_for_each_entry(page, &n->partial, lru)
-			process_slab(&t, s, page, alloc);
+			process_slab(&t, s, page, alloc, map);
 		list_for_each_entry(page, &n->full, lru)
-			process_slab(&t, s, page, alloc);
+			process_slab(&t, s, page, alloc, map);
 		spin_unlock_irqrestore(&n->list_lock, flags);
 	}
 
@@ -3744,6 +3750,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
 	}
 
 	free_loc_track(&t);
+	kfree(map);
 	if (!t.count)
 		len += sprintf(buf, "No data\n");
 	return len;
-- 
cgit v1.2.3


From d3e14aa336b37df76ae875fa051dfdb0e765ddf9 Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <dfeng@redhat.com>
Date: Thu, 8 Apr 2010 17:26:44 +0800
Subject: slub: __kmalloc_node_track_caller should trace kmalloc_large_node
 case

commit 94b528d (kmemtrace: SLUB hooks for caller-tracking functions)
missed tracing kmalloc_large_node in __kmalloc_node_track_caller. We
should trace it same as __kmalloc_node.

Acked-by: David Rientjes <rientjes@google.com>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slub.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 78f1a202ca33..52ae5a538180 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3341,8 +3341,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	struct kmem_cache *s;
 	void *ret;
 
-	if (unlikely(size > SLUB_MAX_SIZE))
-		return kmalloc_large_node(size, gfpflags, node);
+	if (unlikely(size > SLUB_MAX_SIZE)) {
+		ret = kmalloc_large_node(size, gfpflags, node);
+
+		trace_kmalloc_node(caller, ret,
+				   size, PAGE_SIZE << get_order(size),
+				   gfpflags, node);
+
+		return ret;
+	}
 
 	s = get_slab(size, gfpflags);
 
-- 
cgit v1.2.3


From 6b65aaf3027c4e02b42aaefd900aa79136a30681 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Wed, 14 Apr 2010 23:58:36 +0900
Subject: slub: Use alloc_pages_exact_node() for page allocation

The alloc_slab_page() in SLUB uses alloc_pages() if node is '-1'.  This means
that node validity check in alloc_pages_node is unnecessary and we can use
alloc_pages_exact_node() to avoid comparison and branch as commit
6484eb3e2a81807722 ("page allocator: do not check NUMA node ID when the caller
knows the node is valid") did for the page allocator.

Cc: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 52ae5a538180..2cdd235cb801 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1084,7 +1084,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
 	if (node == -1)
 		return alloc_pages(flags, order);
 	else
-		return alloc_pages_node(node, flags, order);
+		return alloc_pages_exact_node(node, flags, order);
 }
 
 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
-- 
cgit v1.2.3


From 73367bd8eef4f4eb311005886aaa916013073265 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 21 May 2010 14:41:35 -0700
Subject: slub: move kmem_cache_node into it's own cacheline

This patch is meant to improve the performance of SLUB by moving the local
kmem_cache_node lock into it's own cacheline separate from kmem_cache.
This is accomplished by simply removing the local_node when NUMA is enabled.

On my system with 2 nodes I saw around a 5% performance increase w/
hackbench times dropping from 6.2 seconds to 5.9 seconds on average.  I
suspect the performance gain would increase as the number of nodes
increases, but I do not have the data to currently back that up.

Bugzilla-Reference: http://bugzilla.kernel.org/show_bug.cgi?id=15713
Cc: <stable@kernel.org>
Reported-by: Alex Shi <alex.shi@intel.com>
Tested-by: Alex Shi <alex.shi@intel.com>
Acked-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slub.c | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index e46e3129697d..c2d6e6951f33 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2133,7 +2133,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
 
 	for_each_node_state(node, N_NORMAL_MEMORY) {
 		struct kmem_cache_node *n = s->node[node];
-		if (n && n != &s->local_node)
+		if (n)
 			kmem_cache_free(kmalloc_caches, n);
 		s->node[node] = NULL;
 	}
@@ -2142,33 +2142,22 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
 {
 	int node;
-	int local_node;
-
-	if (slab_state >= UP && (s < kmalloc_caches ||
-			s >= kmalloc_caches + KMALLOC_CACHES))
-		local_node = page_to_nid(virt_to_page(s));
-	else
-		local_node = 0;
 
 	for_each_node_state(node, N_NORMAL_MEMORY) {
 		struct kmem_cache_node *n;
 
-		if (local_node == node)
-			n = &s->local_node;
-		else {
-			if (slab_state == DOWN) {
-				early_kmem_cache_node_alloc(gfpflags, node);
-				continue;
-			}
-			n = kmem_cache_alloc_node(kmalloc_caches,
-							gfpflags, node);
-
-			if (!n) {
-				free_kmem_cache_nodes(s);
-				return 0;
-			}
+		if (slab_state == DOWN) {
+			early_kmem_cache_node_alloc(gfpflags, node);
+			continue;
+		}
+		n = kmem_cache_alloc_node(kmalloc_caches,
+						gfpflags, node);
 
+		if (!n) {
+			free_kmem_cache_nodes(s);
+			return 0;
 		}
+
 		s->node[node] = n;
 		init_kmem_cache_node(n, s);
 	}
-- 
cgit v1.2.3


From 47846b0650f2f62fc4217cfb36efc94b8d919727 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 25 May 2010 15:06:06 +0200
Subject: mm: export lru_cache_add_*() to modules

This is needed to enable moving pages into the page cache in fuse with
splice(..., SPLICE_F_MOVE).

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 mm/swap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index 7cd60bf0a972..3ce7bc373a52 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -224,6 +224,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
 		____pagevec_lru_add(pvec, lru);
 	put_cpu_var(lru_add_pvecs);
 }
+EXPORT_SYMBOL(__lru_cache_add);
 
 /**
  * lru_cache_add_lru - add a page to a page list
-- 
cgit v1.2.3


From a52116aba5b3eed0ee41f70b794cc1937acd5cb8 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 25 May 2010 15:06:06 +0200
Subject: mm: export remove_from_page_cache() to modules

This is needed to enable moving pages into the page cache in fuse with
splice(..., SPLICE_F_MOVE).

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 mm/filemap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 140ebda9640f..09a91a9a102e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -151,6 +151,7 @@ void remove_from_page_cache(struct page *page)
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);
 }
+EXPORT_SYMBOL(remove_from_page_cache);
 
 static int sync_page(void *word)
 {
-- 
cgit v1.2.3


From 66f998f611897319b555364cefd5d6e88a205866 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Sun, 23 May 2010 11:00:54 -0400
Subject: fs: allow short direct-io reads to be completed via buffered IO

This is similar to what already happens in the write case.  If we have a short
read while doing O_DIRECT, instead of just returning, fallthrough and try to
read the rest via buffered IO.  BTRFS needs this because if we encounter a
compressed or inline extent during DIO, we need to fallback on buffered.  If the
extent is compressed we need to read the entire thing into memory and
de-compress it into the users pages.  I have tested this with fsx and everything
works great.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 mm/filemap.c | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 140ebda9640f..829ac9cdbd70 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1263,7 +1263,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *filp = iocb->ki_filp;
 	ssize_t retval;
-	unsigned long seg;
+	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
 
@@ -1290,21 +1290,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 				retval = mapping->a_ops->direct_IO(READ, iocb,
 							iov, pos, nr_segs);
 			}
-			if (retval > 0)
+			if (retval > 0) {
 				*ppos = pos + retval;
-			if (retval) {
+				count -= retval;
+			}
+
+			/*
+			 * Btrfs can have a short DIO read if we encounter
+			 * compressed extents, so if there was an error, or if
+			 * we've already read everything we wanted to, or if
+			 * there was a short read because we hit EOF, go ahead
+			 * and return.  Otherwise fallthrough to buffered io for
+			 * the rest of the read.
+			 */
+			if (retval < 0 || !count || *ppos >= size) {
 				file_accessed(filp);
 				goto out;
 			}
 		}
 	}
 
+	count = retval;
 	for (seg = 0; seg < nr_segs; seg++) {
 		read_descriptor_t desc;
+		loff_t offset = 0;
+
+		/*
+		 * If we did a short DIO read we need to skip the section of the
+		 * iov that we've already read data into.
+		 */
+		if (count) {
+			if (count > iov[seg].iov_len) {
+				count -= iov[seg].iov_len;
+				continue;
+			}
+			offset = count;
+			count = 0;
+		}
 
 		desc.written = 0;
-		desc.arg.buf = iov[seg].iov_base;
-		desc.count = iov[seg].iov_len;
+		desc.arg.buf = iov[seg].iov_base + offset;
+		desc.count = iov[seg].iov_len - offset;
 		if (desc.count == 0)
 			continue;
 		desc.error = 0;
-- 
cgit v1.2.3


From e9d6c157385e4efa61cb8293e425c9d8beba70d3 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 24 May 2010 14:31:48 -0700
Subject: tmpfs: insert tmpfs cache pages to inactive list at first

Shaohua Li reported parallel file copy on tmpfs can lead to OOM killer.
This is regression of caused by commit 9ff473b9a7 ("vmscan: evict
streaming IO first").  Wow, It is 2 years old patch!

Currently, tmpfs file cache is inserted active list at first.  This means
that the insertion doesn't only increase numbers of pages in anon LRU, but
it also reduces anon scanning ratio.  Therefore, vmscan will get totally
confused.  It scans almost only file LRU even though the system has plenty
unused tmpfs pages.

Historically, lru_cache_add_active_anon() was used for two reasons.
1) Intend to priotize shmem page rather than regular file cache.
2) Intend to avoid reclaim priority inversion of used once pages.

But we've lost both motivation because (1) Now we have separate anon and
file LRU list.  then, to insert active list doesn't help such priotize.
(2) In past, one pte access bit will cause page activation.  then to
insert inactive list with pte access bit mean higher priority than to
insert active list.  Its priority inversion may lead to uninteded lru
chun.  but it was already solved by commit 645747462 (vmscan: detect
mapped file pages used only once).  (Thanks Hannes, you are great!)

Thus, now we can use lru_cache_add_anon() instead.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reported-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 140ebda9640f..d6f4f073836e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -441,7 +441,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 	/*
 	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
 	 * before shmem_readpage has a chance to mark them as SwapBacked: they
-	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+	 * need to go on the anon lru below, and mem_cgroup_cache_charge
 	 * (called in add_to_page_cache) needs to know where they're going too.
 	 */
 	if (mapping_cap_swap_backed(mapping))
@@ -452,7 +452,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		if (page_is_file_cache(page))
 			lru_cache_add_file(page);
 		else
-			lru_cache_add_active_anon(page);
+			lru_cache_add_anon(page);
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 6dda9d55bf545013597724bf0cd79d01bd2bd944 Mon Sep 17 00:00:00 2001
From: Corrado Zoccolo <czoccolo@gmail.com>
Date: Mon, 24 May 2010 14:31:54 -0700
Subject: page allocator: reduce fragmentation in buddy allocator by adding
 buddies that are merging to the tail of the free lists

In order to reduce fragmentation, this patch classifies freed pages in two
groups according to their probability of being part of a high order merge.
 Pages belonging to a compound whose next-highest buddy is free are more
likely to be part of a high order merge in the near future, so they will
be added at the tail of the freelist.  The remaining pages are put at the
front of the freelist.

In this way, the pages that are more likely to cause a big merge are kept
free longer.  Consequently there is a tendency to aggregate the
long-living allocations on a subset of the compounds, reducing the
fragmentation.

This heuristic was tested on three machines, x86, x86-64 and ppc64 with
3GB of RAM in each machine.  The tests were kernbench, netperf, sysbench
and STREAM for performance and a high-order stress test for huge page
allocations.

KernBench X86
Elapsed mean     374.77 ( 0.00%)   375.10 (-0.09%)
User    mean     649.53 ( 0.00%)   650.44 (-0.14%)
System  mean      54.75 ( 0.00%)    54.18 ( 1.05%)
CPU     mean     187.75 ( 0.00%)   187.25 ( 0.27%)

KernBench X86-64
Elapsed mean      94.45 ( 0.00%)    94.01 ( 0.47%)
User    mean     323.27 ( 0.00%)   322.66 ( 0.19%)
System  mean      36.71 ( 0.00%)    36.50 ( 0.57%)
CPU     mean     380.75 ( 0.00%)   381.75 (-0.26%)

KernBench PPC64
Elapsed mean     173.45 ( 0.00%)   173.74 (-0.17%)
User    mean     587.99 ( 0.00%)   587.95 ( 0.01%)
System  mean      60.60 ( 0.00%)    60.57 ( 0.05%)
CPU     mean     373.50 ( 0.00%)   372.75 ( 0.20%)

Nothing notable for kernbench.

NetPerf UDP X86
      64    42.68 ( 0.00%)     42.77 ( 0.21%)
     128    85.62 ( 0.00%)     85.32 (-0.35%)
     256   170.01 ( 0.00%)    168.76 (-0.74%)
    1024   655.68 ( 0.00%)    652.33 (-0.51%)
    2048  1262.39 ( 0.00%)   1248.61 (-1.10%)
    3312  1958.41 ( 0.00%)   1944.61 (-0.71%)
    4096  2345.63 ( 0.00%)   2318.83 (-1.16%)
    8192  4132.90 ( 0.00%)   4089.50 (-1.06%)
   16384  6770.88 ( 0.00%)   6642.05 (-1.94%)*

NetPerf UDP X86-64
      64   148.82 ( 0.00%)    154.92 ( 3.94%)
     128   298.96 ( 0.00%)    312.95 ( 4.47%)
     256   583.67 ( 0.00%)    626.39 ( 6.82%)
    1024  2293.18 ( 0.00%)   2371.10 ( 3.29%)
    2048  4274.16 ( 0.00%)   4396.83 ( 2.79%)
    3312  6356.94 ( 0.00%)   6571.35 ( 3.26%)
    4096  7422.68 ( 0.00%)   7635.42 ( 2.79%)*
    8192 12114.81 ( 0.00%)* 12346.88 ( 1.88%)
   16384 17022.28 ( 0.00%)* 17033.19 ( 0.06%)*
             1.64%             2.73%

NetPerf UDP PPC64
      64    49.98 ( 0.00%)     50.25 ( 0.54%)
     128    98.66 ( 0.00%)    100.95 ( 2.27%)
     256   197.33 ( 0.00%)    191.03 (-3.30%)
    1024   761.98 ( 0.00%)    785.07 ( 2.94%)
    2048  1493.50 ( 0.00%)   1510.85 ( 1.15%)
    3312  2303.95 ( 0.00%)   2271.72 (-1.42%)
    4096  2774.56 ( 0.00%)   2773.06 (-0.05%)
    8192  4918.31 ( 0.00%)   4793.59 (-2.60%)
   16384  7497.98 ( 0.00%)   7749.52 ( 3.25%)

The tests are run to have confidence limits within 1%.  Results marked
with a * were not confident although in this case, it's only outside by
small amounts.  Even with some results that were not confident, the
netperf UDP results were generally positive.

NetPerf TCP X86
      64   652.25 ( 0.00%)*   648.12 (-0.64%)*
            23.80%            22.82%
     128  1229.98 ( 0.00%)*  1220.56 (-0.77%)*
            21.03%            18.90%
     256  2105.88 ( 0.00%)   1872.03 (-12.49%)*
             1.00%            16.46%
    1024  3476.46 ( 0.00%)*  3548.28 ( 2.02%)*
            13.37%            11.39%
    2048  4023.44 ( 0.00%)*  4231.45 ( 4.92%)*
             9.76%            12.48%
    3312  4348.88 ( 0.00%)*  4396.96 ( 1.09%)*
             6.49%             8.75%
    4096  4726.56 ( 0.00%)*  4877.71 ( 3.10%)*
             9.85%             8.50%
    8192  4732.28 ( 0.00%)*  5777.77 (18.10%)*
             9.13%            13.04%
   16384  5543.05 ( 0.00%)*  5906.24 ( 6.15%)*
             7.73%             8.68%

NETPERF TCP X86-64
            netperf-tcp-vanilla-netperf       netperf-tcp
                   tcp-vanilla     pgalloc-delay
      64  1895.87 ( 0.00%)*  1775.07 (-6.81%)*
             5.79%             4.78%
     128  3571.03 ( 0.00%)*  3342.20 (-6.85%)*
             3.68%             6.06%
     256  5097.21 ( 0.00%)*  4859.43 (-4.89%)*
             3.02%             2.10%
    1024  8919.10 ( 0.00%)*  8892.49 (-0.30%)*
             5.89%             6.55%
    2048 10255.46 ( 0.00%)* 10449.39 ( 1.86%)*
             7.08%             7.44%
    3312 10839.90 ( 0.00%)* 10740.15 (-0.93%)*
             6.87%             7.33%
    4096 10814.84 ( 0.00%)* 10766.97 (-0.44%)*
             6.86%             8.18%
    8192 11606.89 ( 0.00%)* 11189.28 (-3.73%)*
             7.49%             5.55%
   16384 12554.88 ( 0.00%)* 12361.22 (-1.57%)*
             7.36%             6.49%

NETPERF TCP PPC64
            netperf-tcp-vanilla-netperf       netperf-tcp
                   tcp-vanilla     pgalloc-delay
      64   594.17 ( 0.00%)    596.04 ( 0.31%)*
             1.00%             2.29%
     128  1064.87 ( 0.00%)*  1074.77 ( 0.92%)*
             1.30%             1.40%
     256  1852.46 ( 0.00%)*  1856.95 ( 0.24%)
             1.25%             1.00%
    1024  3839.46 ( 0.00%)*  3813.05 (-0.69%)
             1.02%             1.00%
    2048  4885.04 ( 0.00%)*  4881.97 (-0.06%)*
             1.15%             1.04%
    3312  5506.90 ( 0.00%)   5459.72 (-0.86%)
    4096  6449.19 ( 0.00%)   6345.46 (-1.63%)
    8192  7501.17 ( 0.00%)   7508.79 ( 0.10%)
   16384  9618.65 ( 0.00%)   9490.10 (-1.35%)

There was a distinct lack of confidence in the X86* figures so I included
what the devation was where the results were not confident.  Many of the
results, whether gains or losses were within the standard deviation so no
solid conclusion can be reached on performance impact.  Looking at the
figures, only the X86-64 ones look suspicious with a few losses that were
outside the noise.  However, the results were so unstable that without
knowing why they vary so much, a solid conclusion cannot be reached.

SYSBENCH X86
              sysbench-vanilla     pgalloc-delay
           1  7722.85 ( 0.00%)  7756.79 ( 0.44%)
           2 14901.11 ( 0.00%) 13683.44 (-8.90%)
           3 15171.71 ( 0.00%) 14888.25 (-1.90%)
           4 14966.98 ( 0.00%) 15029.67 ( 0.42%)
           5 14370.47 ( 0.00%) 14865.00 ( 3.33%)
           6 14870.33 ( 0.00%) 14845.57 (-0.17%)
           7 14429.45 ( 0.00%) 14520.85 ( 0.63%)
           8 14354.35 ( 0.00%) 14362.31 ( 0.06%)

SYSBENCH X86-64
           1 17448.70 ( 0.00%) 17484.41 ( 0.20%)
           2 34276.39 ( 0.00%) 34251.00 (-0.07%)
           3 50805.25 ( 0.00%) 50854.80 ( 0.10%)
           4 66667.10 ( 0.00%) 66174.69 (-0.74%)
           5 66003.91 ( 0.00%) 65685.25 (-0.49%)
           6 64981.90 ( 0.00%) 65125.60 ( 0.22%)
           7 64933.16 ( 0.00%) 64379.23 (-0.86%)
           8 63353.30 ( 0.00%) 63281.22 (-0.11%)
           9 63511.84 ( 0.00%) 63570.37 ( 0.09%)
          10 62708.27 ( 0.00%) 63166.25 ( 0.73%)
          11 62092.81 ( 0.00%) 61787.75 (-0.49%)
          12 61330.11 ( 0.00%) 61036.34 (-0.48%)
          13 61438.37 ( 0.00%) 61994.47 ( 0.90%)
          14 62304.48 ( 0.00%) 62064.90 (-0.39%)
          15 63296.48 ( 0.00%) 62875.16 (-0.67%)
          16 63951.76 ( 0.00%) 63769.09 (-0.29%)

SYSBENCH PPC64
                             -sysbench-pgalloc-delay-sysbench
              sysbench-vanilla     pgalloc-delay
           1  7645.08 ( 0.00%)  7467.43 (-2.38%)
           2 14856.67 ( 0.00%) 14558.73 (-2.05%)
           3 21952.31 ( 0.00%) 21683.64 (-1.24%)
           4 27946.09 ( 0.00%) 28623.29 ( 2.37%)
           5 28045.11 ( 0.00%) 28143.69 ( 0.35%)
           6 27477.10 ( 0.00%) 27337.45 (-0.51%)
           7 26489.17 ( 0.00%) 26590.06 ( 0.38%)
           8 26642.91 ( 0.00%) 25274.33 (-5.41%)
           9 25137.27 ( 0.00%) 24810.06 (-1.32%)
          10 24451.99 ( 0.00%) 24275.85 (-0.73%)
          11 23262.20 ( 0.00%) 23674.88 ( 1.74%)
          12 24234.81 ( 0.00%) 23640.89 (-2.51%)
          13 24577.75 ( 0.00%) 24433.50 (-0.59%)
          14 25640.19 ( 0.00%) 25116.52 (-2.08%)
          15 26188.84 ( 0.00%) 26181.36 (-0.03%)
          16 26782.37 ( 0.00%) 26255.99 (-2.00%)

Again, there is little to conclude here.  While there are a few losses,
the results vary by +/- 8% in some cases.  They are the results of most
concern as there are some large losses but it's also within the variance
typically seen between kernel releases.

The STREAM results varied so little and are so verbose that I didn't
include them here.

The final test stressed how many huge pages can be allocated.  The
absolute number of huge pages allocated are the same with or without the
page.  However, the "unusability free space index" which is a measure of
external fragmentation was slightly lower (lower is better) throughout the
lifetime of the system.  I also measured the latency of how long it took
to successfully allocate a huge page.  The latency was slightly lower and
on X86 and PPC64, more huge pages were allocated almost immediately from
the free lists.  The improvement is slight but there.

[mel@csn.ul.ie: Tested, reworked for less branches]
[czoccolo@gmail.com: fix oops by checking pfn_valid_within()]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Corrado Zoccolo <czoccolo@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6326c71b663..596180fedd3a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -475,6 +475,8 @@ static inline void __free_one_page(struct page *page,
 		int migratetype)
 {
 	unsigned long page_idx;
+	unsigned long combined_idx;
+	struct page *buddy;
 
 	if (unlikely(PageCompound(page)))
 		if (unlikely(destroy_compound_page(page, order)))
@@ -488,9 +490,6 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON(bad_range(zone, page));
 
 	while (order < MAX_ORDER-1) {
-		unsigned long combined_idx;
-		struct page *buddy;
-
 		buddy = __page_find_buddy(page, page_idx, order);
 		if (!page_is_buddy(page, buddy, order))
 			break;
@@ -505,8 +504,29 @@ static inline void __free_one_page(struct page *page,
 		order++;
 	}
 	set_page_order(page, order);
-	list_add(&page->lru,
-		&zone->free_area[order].free_list[migratetype]);
+
+	/*
+	 * If this is not the largest possible page, check if the buddy
+	 * of the next-highest order is free. If it is, it's possible
+	 * that pages are being freed that will coalesce soon. In case,
+	 * that is happening, add the free page to the tail of the list
+	 * so it's less likely to be used soon and more likely to be merged
+	 * as a higher order page
+	 */
+	if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+		struct page *higher_page, *higher_buddy;
+		combined_idx = __find_combined_index(page_idx, order);
+		higher_page = page + combined_idx - page_idx;
+		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+			list_add_tail(&page->lru,
+				&zone->free_area[order].free_list[migratetype]);
+			goto out;
+		}
+	}
+
+	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
 	zone->free_area[order].nr_free++;
 }
 
-- 
cgit v1.2.3


From e48e67e08c340def3d0349c2910d23c7985fb6fa Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 24 May 2010 14:31:57 -0700
Subject: sparsemem: on no vmemmap path put mem_map on node high too

We need to put mem_map high when virtual memmap is not used.

before this patch
free mem pfn range on first node:
[    0.000000]  19 - 1f
[    0.000000]  28 40 - 80 95
[    0.000000]  702 740 - 1000 1000
[    0.000000]  347c - 347e
[    0.000000]  34e7 3500 - 3b80 3b8b
[    0.000000]  73b8b 73bc0 - 73c00 73c00
[    0.000000]  73ddd - 73e00
[    0.000000]  73fdd - 74000
[    0.000000]  741dd - 74200
[    0.000000]  743dd - 74400
[    0.000000]  745dd - 74600
[    0.000000]  747dd - 74800
[    0.000000]  749dd - 74a00
[    0.000000]  74bdd - 74c00
[    0.000000]  74ddd - 74e00
[    0.000000]  74fdd - 75000
[    0.000000]  751dd - 75200
[    0.000000]  753dd - 75400
[    0.000000]  755dd - 75600
[    0.000000]  757dd - 75800
[    0.000000]  759dd - 75a00
[    0.000000]  79bdd 79c00 - 7d540 7d550
[    0.000000]  7f745 - 7f750
[    0.000000]  10000b 100040 - 2080000 2080000
so only 79c00 - 7d540 are major free block under 4g...

after this patch, we will get
[    0.000000]  19 - 1f
[    0.000000]  28 40 - 80 95
[    0.000000]  702 740 - 1000 1000
[    0.000000]  347c - 347e
[    0.000000]  34e7 3500 - 3600 3600
[    0.000000]  37dd - 3800
[    0.000000]  39dd - 3a00
[    0.000000]  3bdd - 3c00
[    0.000000]  3ddd - 3e00
[    0.000000]  3fdd - 4000
[    0.000000]  41dd - 4200
[    0.000000]  43dd - 4400
[    0.000000]  45dd - 4600
[    0.000000]  47dd - 4800
[    0.000000]  49dd - 4a00
[    0.000000]  4bdd - 4c00
[    0.000000]  4ddd - 4e00
[    0.000000]  4fdd - 5000
[    0.000000]  51dd - 5200
[    0.000000]  53dd - 5400
[    0.000000]  95dd 9600 - 7d540 7d550
[    0.000000]  7f745 - 7f750
[    0.000000]  17000b 170040 - 2080000 2080000
we will have 9600 - 7d540 for major free block...

sparse-vmemmap path already used __alloc_bootmem_node_high()

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/sparse.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/sparse.c b/mm/sparse.c
index dc0cc4d43ff3..95ac219af379 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -382,13 +382,15 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
 struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
 {
 	struct page *map;
+	unsigned long size;
 
 	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
 	if (map)
 		return map;
 
-	map = alloc_bootmem_pages_node(NODE_DATA(nid),
-		       PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
+	size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
+	map = __alloc_bootmem_node_high(NODE_DATA(nid), size,
+					 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 	return map;
 }
 void __init sparse_mem_maps_populate_node(struct page **map_map,
@@ -412,7 +414,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
 	}
 
 	size = PAGE_ALIGN(size);
-	map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count);
+	map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count,
+					 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 	if (map) {
 		for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
 			if (!present_section_nr(pnum))
-- 
cgit v1.2.3


From 4b50dc26a0a25a9d1998d206e1f7d849aa78063f Mon Sep 17 00:00:00 2001
From: Huang Shijie <shijie8@gmail.com>
Date: Mon, 24 May 2010 14:31:58 -0700
Subject: shmem: remove redundant code

prep_new_page() will call set_page_private(page, 0) to initialise the
page, so the code is redundant.

Signed-off-by: Huang Shijie <shijie8@gmail.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index 0cd7f66f1c66..4ef9797bd430 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -433,8 +433,6 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 
 		spin_unlock(&info->lock);
 		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
-		if (page)
-			set_page_private(page, 0);
 		spin_lock(&info->lock);
 
 		if (!page) {
-- 
cgit v1.2.3


From e13861d822f8f443ca0c020ea8fc2dc01039cd63 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Mon, 24 May 2010 14:31:59 -0700
Subject: mm: remove return value of putback_lru_pages()

putback_lru_page() never can fail.  So it doesn't matter count of "the
number of pages put back".

In addition, users of this functions don't use return value.

Let's remove unnecessary code.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/migrate.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index d3f3f7f81075..5938db54e1d7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -58,23 +58,18 @@ int migrate_prep(void)
 /*
  * Add isolated pages on the list back to the LRU under page lock
  * to avoid leaking evictable pages back onto unevictable list.
- *
- * returns the number of pages put back.
  */
-int putback_lru_pages(struct list_head *l)
+void putback_lru_pages(struct list_head *l)
 {
 	struct page *page;
 	struct page *page2;
-	int count = 0;
 
 	list_for_each_entry_safe(page, page2, l, lru) {
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
 		putback_lru_page(page);
-		count++;
 	}
-	return count;
 }
 
 /*
-- 
cgit v1.2.3


From 6d556294d5b27fb12f18be7495af45b6156a409e Mon Sep 17 00:00:00 2001
From: Bob Liu <lliubbo@gmail.com>
Date: Mon, 24 May 2010 14:31:59 -0700
Subject: mempolicy: remove redundant code

1.  In funtion is_valid_nodemask(), varibable k will be inited to 0 in
   the following loop, needn't init to policy_zone anymore.

2. (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) has already defined
   to MPOL_MODE_FLAGS in mempolicy.h.

Signed-off-by: Bob Liu <lliubbo@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08f40a2f3fe0..ad500f3b12bf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -127,9 +127,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
 {
 	int nd, k;
 
-	/* Check that there is something useful in this mask */
-	k = policy_zone;
-
 	for_each_node_mask(nd, *nodemask) {
 		struct zone *z;
 
@@ -145,7 +142,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask)
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 {
-	return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+	return pol->flags & MPOL_MODE_FLAGS;
 }
 
 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
-- 
cgit v1.2.3


From 6eb27e1fdf5781719a3d2e90e6c89fa012135c62 Mon Sep 17 00:00:00 2001
From: Bob Liu <lliubbo@gmail.com>
Date: Mon, 24 May 2010 14:32:00 -0700
Subject: mempolicy: remove case MPOL_INTERLEAVE from policy_zonelist()

In policy_zonelist() mode MPOL_INTERLEAVE shouldn't happen, so fall
through to BUG() instead of break to return.  I also fixed the comment.

Signed-off-by: Bob Liu <lliubbo@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ad500f3b12bf..d97355b744ab 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1441,15 +1441,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
 		/*
 		 * Normally, MPOL_BIND allocations are node-local within the
 		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
-		 * current node is part of the mask, we use the zonelist for
+		 * current node isn't part of the mask, we use the zonelist for
 		 * the first node in the mask instead.
 		 */
 		if (unlikely(gfp & __GFP_THISNODE) &&
 				unlikely(!node_isset(nd, policy->v.nodes)))
 			nd = first_node(policy->v.nodes);
 		break;
-	case MPOL_INTERLEAVE: /* should not happen */
-		break;
 	default:
 		BUG();
 	}
-- 
cgit v1.2.3


From 1980050250fa052b1c24a19f9b3d82fae14d77f8 Mon Sep 17 00:00:00 2001
From: Bob Liu <lliubbo@gmail.com>
Date: Mon, 24 May 2010 14:32:01 -0700
Subject: mempolicy: remove redundant check

Lee's patch "mempolicy: use MPOL_PREFERRED for system-wide default policy"
has made the MPOL_DEFAULT only used in the memory policy APIs.  So, no
need to check in __mpol_equal also.  Also get rid of mpol_match_intent()
and move its logic directly into __mpol_equal().

Signed-off-by: Bob Liu <lliubbo@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d97355b744ab..ac5aeafaec9a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1787,16 +1787,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
 	return tompol;
 }
 
-static int mpol_match_intent(const struct mempolicy *a,
-			     const struct mempolicy *b)
-{
-	if (a->flags != b->flags)
-		return 0;
-	if (!mpol_store_user_nodemask(a))
-		return 1;
-	return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
-}
-
 /* Slow path of a mempolicy comparison */
 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
@@ -1804,8 +1794,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 		return 0;
 	if (a->mode != b->mode)
 		return 0;
-	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
+	if (a->flags != b->flags)
 		return 0;
+	if (mpol_store_user_nodemask(a))
+		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
+			return 0;
+
 	switch (a->mode) {
 	case MPOL_BIND:
 		/* Fall through */
-- 
cgit v1.2.3


From e17f74af351cce9a1bade7b33af179497fdf95cf Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Mon, 24 May 2010 14:32:02 -0700
Subject: mempolicy: don't call mpol_set_nodemask() when no_context

No need to call mpol_set_nodemask() when we have no context for the
mempolicy.  This can occur when we're parsing a tmpfs 'mpol' mount option.
 Just save the raw nodemask in the mempolicy's w.user_nodemask member for
use when a tmpfs/shmem file is created.  mpol_shared_policy_init() will
"contextualize" the policy for the new file based on the creating task's
context.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ac5aeafaec9a..0e1b293e4054 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2239,7 +2239,10 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	if (IS_ERR(new))
 		goto out;
 
-	{
+	if (no_context) {
+		/* save for contextualization */
+		new->w.user_nodemask = nodes;
+	} else {
 		int ret;
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
@@ -2255,10 +2258,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		}
 	}
 	err = 0;
-	if (no_context) {
-		/* save for contextualization */
-		new->w.user_nodemask = nodes;
-	}
 
 out:
 	/* Restore string for error message */
-- 
cgit v1.2.3


From b4652e8429100ba5c3ddb49499faa1188c98c246 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Mon, 24 May 2010 14:32:03 -0700
Subject: mempolicy: lose unnecessary loop variable in mpol_parse_str()

We don't really need the extra variable 'i' in mpol_parse_str().  The only
use is as the the loop variable.  Then, it's assigned to 'mode'.  Just use
mode, and loose the 'uninitialized_var()' macro.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e1b293e4054..b4f1265df2d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2148,12 +2148,11 @@ static const char * const policy_types[] =
 int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 {
 	struct mempolicy *new = NULL;
-	unsigned short uninitialized_var(mode);
+	unsigned short mode;
 	unsigned short uninitialized_var(mode_flags);
 	nodemask_t nodes;
 	char *nodelist = strchr(str, ':');
 	char *flags = strchr(str, '=');
-	int i;
 	int err = 1;
 
 	if (nodelist) {
@@ -2169,13 +2168,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 	if (flags)
 		*flags++ = '\0';	/* terminate mode string */
 
-	for (i = 0; i <= MPOL_LOCAL; i++) {
-		if (!strcmp(str, policy_types[i])) {
-			mode = i;
+	for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+		if (!strcmp(str, policy_types[mode])) {
 			break;
 		}
 	}
-	if (i > MPOL_LOCAL)
+	if (mode > MPOL_LOCAL)
 		goto out;
 
 	switch (mode) {
-- 
cgit v1.2.3


From 345ace9c797030e77da8ff211b9502370b9d81ab Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Mon, 24 May 2010 14:32:04 -0700
Subject: mempolicy: rename policy_types and cleanup initialization

Rename 'policy_types[]' to 'policy_modes[]' to better match the array
contents.

Use designated intializer syntax for policy_modes[].

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b4f1265df2d8..ade57322fa2a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2121,9 +2121,15 @@ void numa_default_policy(void)
  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
  * Used only for mpol_parse_str() and mpol_to_str()
  */
-#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
-static const char * const policy_types[] =
-	{ "default", "prefer", "bind", "interleave", "local" };
+#define MPOL_LOCAL MPOL_MAX
+static const char * const policy_modes[] =
+{
+	[MPOL_DEFAULT]    = "default",
+	[MPOL_PREFERRED]  = "prefer",
+	[MPOL_BIND]       = "bind",
+	[MPOL_INTERLEAVE] = "interleave",
+	[MPOL_LOCAL]      = "local"
+};
 
 
 #ifdef CONFIG_TMPFS
@@ -2169,7 +2175,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		*flags++ = '\0';	/* terminate mode string */
 
 	for (mode = 0; mode <= MPOL_LOCAL; mode++) {
-		if (!strcmp(str, policy_types[mode])) {
+		if (!strcmp(str, policy_modes[mode])) {
 			break;
 		}
 	}
@@ -2324,11 +2330,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 		BUG();
 	}
 
-	l = strlen(policy_types[mode]);
+	l = strlen(policy_modes[mode]);
 	if (buffer + maxlen < p + l + 1)
 		return -ENOSPC;
 
-	strcpy(p, policy_types[mode]);
+	strcpy(p, policy_modes[mode]);
 	p += l;
 
 	if (flags & MPOL_MODE_FLAGS) {
-- 
cgit v1.2.3


From 15d77835ac48dbc2d4884376ea6a08b65b1c40ba Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Mon, 24 May 2010 14:32:04 -0700
Subject: mempolicy: factor mpol_shared_policy_init() return paths

Factor out duplicate put/frees in mpol_shared_policy_init() to a common
return path.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ade57322fa2a..0c73c8b814cd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1995,26 +1995,22 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 			return;
 		/* contextualize the tmpfs mount point mempolicy */
 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
-		if (IS_ERR(new)) {
-			mpol_put(mpol);	/* drop our ref on sb mpol */
-			NODEMASK_SCRATCH_FREE(scratch);
-			return;		/* no valid nodemask intersection */
-		}
+		if (IS_ERR(new))
+			goto put_free; /* no valid nodemask intersection */
 
 		task_lock(current);
 		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
 		task_unlock(current);
 		mpol_put(mpol);	/* drop our ref on sb mpol */
-		if (ret) {
-			NODEMASK_SCRATCH_FREE(scratch);
-			mpol_put(new);
-			return;
-		}
+		if (ret)
+			goto put_free;
 
 		/* Create pseudo-vma that contains just the policy */
 		memset(&pvma, 0, sizeof(struct vm_area_struct));
 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
 		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
+
+put_free:
 		mpol_put(new);			/* drop initial ref */
 		NODEMASK_SCRATCH_FREE(scratch);
 	}
-- 
cgit v1.2.3


From 708c1bbc9d0c3e57f40501794d9b0eed29d10fce Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Mon, 24 May 2010 14:32:07 -0700
Subject: mempolicy: restructure rebinding-mempolicy functions

Nick Piggin reported that the allocator may see an empty nodemask when
changing cpuset's mems[1].  It happens only on the kernel that do not do
atomic nodemask_t stores.  (MAX_NUMNODES > BITS_PER_LONG)

But I found that there is also a problem on the kernel that can do atomic
nodemask_t stores.  The problem is that the allocator can't find a node to
alloc page when changing cpuset's mems though there is a lot of free
memory.  The reason is like this:

(mpol: mempolicy)
	task1			task1's mpol	task2
	alloc page		1
	  alloc on node0? NO	1
				1		change mems from 1 to 0
				1		rebind task1's mpol
				0-1		  set new bits
				0	  	  clear disallowed bits
	  alloc on node1? NO	0
	  ...
	can't alloc page
	  goto oom

I can use the attached program reproduce it by the following step:

# mkdir /dev/cpuset
# mount -t cpuset cpuset /dev/cpuset
# mkdir /dev/cpuset/1
# echo `cat /dev/cpuset/cpus` > /dev/cpuset/1/cpus
# echo `cat /dev/cpuset/mems` > /dev/cpuset/1/mems
# echo $$ > /dev/cpuset/1/tasks
# numactl --membind=`cat /dev/cpuset/mems` ./cpuset_mem_hog <nr_tasks> &
   <nr_tasks> = max(nr_cpus - 1, 1)
# killall -s SIGUSR1 cpuset_mem_hog
# ./change_mems.sh

several hours later, oom will happen though there is a lot of free memory.

This patchset fixes this problem by expanding the nodes range first(set
newly allowed bits) and shrink it lazily(clear newly disallowed bits).  So
we use a variable to tell the write-side task that read-side task is
reading nodemask, and the write-side task clears newly disallowed nodes
after read-side task ends the current memory allocation.

This patch:

In order to fix no node to alloc memory, when we want to update mempolicy
and mems_allowed, we expand the set of nodes first (set all the newly
nodes) and shrink the set of nodes lazily(clean disallowed nodes), But the
mempolicy's rebind functions may breaks the expanding.

So we restructure the mempolicy's rebind functions and split the rebind
work to two steps, just like the update of cpuset's mems: The 1st step:
expand the set of the mempolicy's nodes.  The 2nd step: shrink the set of
the mempolicy's nodes.  It is used when there is no real lock to protect
the mempolicy in the read-side.  Otherwise we can do rebind work at once.

In order to implement it, we define

	enum mpol_rebind_step {
		MPOL_REBIND_ONCE,
		MPOL_REBIND_STEP1,
		MPOL_REBIND_STEP2,
		MPOL_REBIND_NSTEP,
	};

If the mempolicy needn't be updated by two steps, we can pass
MPOL_REBIND_ONCE to the rebind functions.  Or we can pass
MPOL_REBIND_STEP1 to do the first step of the rebind work and pass
MPOL_REBIND_STEP2 to do the second step work.

Besides that, it maybe long time between these two step and we have to
release the lock that protects mempolicy and mems_allowed.  If we hold the
lock once again, we must check whether the current mempolicy is under the
rebinding (the first step has been done) or not, because the task may
alloc a new mempolicy when we don't hold the lock.  So we defined the
following flag to identify it:

#define MPOL_F_REBINDING (1 << 2)

The new functions will be used in the next patch.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Paul Menage <menage@google.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 124 +++++++++++++++++++++++++++++++++++++++++++