From b595076a180a56d1bb170e6eceda6eb9d76f4cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 1 Nov 2010 15:38:34 -0400 Subject: tree-wide: fix comment/printk typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "gadget", "through", "command", "maintain", "maintain", "controller", "address", "between", "initiali[zs]e", "instead", "function", "select", "already", "equal", "access", "management", "hierarchy", "registration", "interest", "relative", "memory", "offset", "already", Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- mm/percpu.c | 2 +- mm/sparse-vmemmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index efe816856a9d..f715d01d5ba3 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, /* * (Un)populated page region iterators. Iterate over (un)populated - * page regions betwen @start and @end in @chunk. @rs and @re should + * page regions between @start and @end in @chunk. @rs and @re should * be integer variables and will be set to start and end page index of * the current region. */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 29d6cbffb283..64b984091edb 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -9,7 +9,7 @@ * * However, virtual mappings need a page table and TLBs. Many Linux * architectures already map their physical space using 1-1 mappings - * via TLBs. For those arches the virtual memmory map is essentially + * via TLBs. For those arches the virtual memory map is essentially * for free if we use the same page size as the 1-1 mappings. In that * case the overhead consists of a few additional pages that are * allocated to create a view of memory for vmemmap. -- cgit v1.2.3 From 4a92379bdfb48680a5e6775dd53a586df7b6b0b1 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 21 Oct 2010 10:29:19 +0100 Subject: slub tracing: move trace calls out of always inlined functions to reduce kernel code size Having the trace calls defined in the always inlined kmalloc functions in include/linux/slub_def.h causes a lot of code duplication as the trace functions get instantiated for each kamalloc call site. This can simply be removed by pushing the trace calls down into the functions in slub.c. On my x86_64 built this patch shrinks the code size of the kernel by approx 36K and also shrinks the code size of many modules -- too many to list here ;) size vmlinux (2.6.36) reports text data bss dec hex filename 5410611 743172 828928 6982711 6a8c37 vmlinux 5373738 744244 828928 6946910 6a005e vmlinux + patch The resulting kernel has had some testing & kmalloc trace still seems to work. This patch - moves trace_kmalloc out of the inlined kmalloc() and pushes it down into kmem_cache_alloc_trace() so this it only get instantiated once. - rename kmem_cache_alloc_notrace() to kmem_cache_alloc_trace() to indicate that now is does have tracing. (maybe this would better being called something like kmalloc_kmem_cache ?) - adds a new function kmalloc_order() to handle allocation and tracing of large allocations of page order. - removes tracing from the inlined kmalloc_large() replacing them with a call to kmalloc_order(); - move tracing out of inlined kmalloc_node() and pushing it down into kmem_cache_alloc_node_trace - rename kmem_cache_alloc_node_notrace() to kmem_cache_alloc_node_trace() - removes the include of trace/events/kmem.h from slub_def.h. v2 - keep kmalloc_order_trace inline when !CONFIG_TRACE Signed-off-by: Richard Kennedy Signed-off-by: Pekka Enberg --- mm/slub.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 8fd5401bb071..7e657aa19475 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -28,6 +28,8 @@ #include #include +#include + /* * Lock order: * 1. slab_lock(page) @@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_trace); + +void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { - return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = kmalloc_order(size, flags, order); + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_notrace); +EXPORT_SYMBOL(kmalloc_order_trace); #endif #ifdef CONFIG_NUMA @@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, +void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node) + int node, size_t size) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, + size, s->size, gfpflags, node); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif #endif -- cgit v1.2.3 From 98072e4d977aabe6a39abb95951cd8bf2c2202d5 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 28 Oct 2010 13:50:37 +0400 Subject: slub: Fix slub_lock down/up imbalance There are two places, that do not release the slub_lock. Respective bugs were introduced by sysfs changes ab4d5ed5 (slub: Enable sysfs support for !CONFIG_SLUB_DEBUG) and 2bce6485 ( slub: Allow removal of slab caches during boot). Acked-by: Christoph Lameter Signed-off-by: Pavel Emelyanov Signed-off-by: Pekka Enberg --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7e657aa19475..7796a0446b3f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3289,9 +3289,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, kfree(n); kfree(s); } +err: up_write(&slub_lock); -err: if (flags & SLAB_PANIC) panic("Cannot create slabcache %s\n", name); else @@ -3878,6 +3878,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif + up_read(&slub_lock); kfree(nodes); return x + sprintf(buf + x, "\n"); } -- cgit v1.2.3 From e525fd89d380c4a94c0d63913a1dd1a593ed25e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:17 +0100 Subject: block: make blkdev_get/put() handle exclusive access Over time, block layer has accumulated a set of APIs dealing with bdev open, close, claim and release. * blkdev_get/put() are the primary open and close functions. * bd_claim/release() deal with exclusive open. * open/close_bdev_exclusive() are combination of open and claim and the other way around, respectively. * bd_link/unlink_disk_holder() to create and remove holder/slave symlinks. * open_by_devnum() wraps bdget() + blkdev_get(). The interface is a bit confusing and the decoupling of open and claim makes it impossible to properly guarantee exclusive access as in-kernel open + claim sequence can disturb the existing exclusive open even before the block layer knows the current open if for another exclusive access. Reorganize the interface such that, * blkdev_get() is extended to include exclusive access management. @holder argument is added and, if is @FMODE_EXCL specified, it will gain exclusive access atomically w.r.t. other exclusive accesses. * blkdev_put() is similarly extended. It now takes @mode argument and if @FMODE_EXCL is set, it releases an exclusive access. Also, when the last exclusive claim is released, the holder/slave symlinks are removed automatically. * bd_claim/release() and close_bdev_exclusive() are no longer necessary and either made static or removed. * bd_link_disk_holder() remains the same but bd_unlink_disk_holder() is no longer necessary and removed. * open_bdev_exclusive() becomes a simple wrapper around lookup_bdev() and blkdev_get(). It also has an unexpected extra bdev_read_only() test which probably should be moved into blkdev_get(). * open_by_devnum() is modified to take @holder argument and pass it to blkdev_get(). Most of bdev open/close operations are unified into blkdev_get/put() and most exclusive accesses are tested atomically at the open time (as it should). This cleans up code and removes some, both valid and invalid, but unnecessary all the same, corner cases. open_bdev_exclusive() and open_by_devnum() can use further cleanup - rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop special features. Well, let's leave them for another day. Most conversions are straight-forward. drbd conversion is a bit more involved as there was some reordering, but the logic should stay the same. Signed-off-by: Tejun Heo Acked-by: Neil Brown Acked-by: Ryusuke Konishi Acked-by: Mike Snitzer Acked-by: Philipp Reisner Cc: Peter Osterlund Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Jan Kara Cc: Andrew Morton Cc: Andreas Dilger Cc: "Theodore Ts'o" Cc: Mark Fasheh Cc: Joel Becker Cc: Alex Elder Cc: Christoph Hellwig Cc: dm-devel@redhat.com Cc: drbd-dev@lists.linbit.com Cc: Leo Chen Cc: Scott Branden Cc: Chris Mason Cc: Steven Whitehouse Cc: Dave Kleikamp Cc: Joern Engel Cc: reiserfs-devel@vger.kernel.org Cc: Alexander Viro --- mm/swapfile.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 67ddaaf98c74..b6adcfbf6f48 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1677,7 +1677,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); inode->i_flags &= ~S_SWAPFILE; @@ -1939,7 +1939,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { bdev = I_BDEV(inode); - error = bd_claim(bdev, sys_swapon); + error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, + sys_swapon); if (error < 0) { bdev = NULL; error = -EINVAL; @@ -2136,7 +2137,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) bad_swap: if (bdev) { set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); swap_cgroup_swapoff(type); -- cgit v1.2.3 From 04c3496152394d17e3bc2316f9731ee3e8a026bc Mon Sep 17 00:00:00 2001 From: "Steven J. Magnani" Date: Wed, 24 Nov 2010 12:56:54 -0800 Subject: nommu: yield CPU while disposing VM Depending on processor speed, page size, and the amount of memory a process is allowed to amass, cleanup of a large VM may freeze the system for many seconds. This can result in a watchdog timeout. Make sure other tasks receive some service when cleaning up large VMs. Signed-off-by: Steven J. Magnani Cc: Greg Ungerer Reviewed-by: KOSAKI Motohiro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 3613517c7592..27a9ac588516 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1717,6 +1717,7 @@ void exit_mmap(struct mm_struct *mm) mm->mmap = vma->vm_next; delete_vma_from_mm(vma); delete_vma(mm, vma); + cond_resched(); } kleave(""); -- cgit v1.2.3 From 112bc2e120a94a511858918d6866a4978f9c500e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 24 Nov 2010 12:56:58 -0800 Subject: memcg: fix false positive VM_BUG on non-SMP Fix this: kernel BUG at mm/memcontrol.c:2155! invalid opcode: 0000 [#1] last sysfs file: Pid: 18, comm: sh Not tainted 2.6.37-rc3 #3 /Bochs EIP: 0060:[] EFLAGS: 00000246 CPU: 0 EIP is at mem_cgroup_move_account+0xe2/0xf0 EAX: 00000004 EBX: c6f931d4 ECX: c681c300 EDX: c681c000 ESI: c681c300 EDI: ffffffea EBP: c681c000 ESP: c46f3e30 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 Process sh (pid: 18, ti=c46f2000 task=c6826e60 task.ti=c46f2000) Stack: 00000155 c681c000 0805f000 c46ee180 c46f3e5c c7058820 c1074d37 00000000 08060000 c46db9a0 c46ec080 c7058820 0805f000 08060000 c46f3e98 c1074c50 c106c75e c46f3e98 c46ec080 08060000 0805ffff c46db9a0 c46f3e98 c46e0340 Call Trace: [] ? mem_cgroup_move_charge_pte_range+0xe7/0x130 [] ? mem_cgroup_move_charge_pte_range+0x0/0x130 [] ? walk_page_range+0xee/0x1d0 [] ? mem_cgroup_move_task+0x66/0x90 [] ? mem_cgroup_move_charge_pte_range+0x0/0x130 [] ? mem_cgroup_move_task+0x0/0x90 [] ? cgroup_attach_task+0x136/0x200 [] ? cgroup_tasks_write+0x48/0xc0 [] ? cgroup_file_write+0xde/0x220 [] ? do_page_fault+0x17d/0x3f0 [] ? alloc_fd+0x2d/0xd0 [] ? cgroup_file_write+0x0/0x220 [] ? vfs_write+0x92/0xc0 [] ? sys_write+0x41/0x70 [] ? syscall_call+0x7/0xb Code: 03 00 74 09 8b 44 24 04 e8 1c f1 ff ff 89 73 04 8d 86 b0 00 00 00 b9 01 00 00 00 89 da 31 ff e8 65 f5 ff ff e9 4d ff ff ff 0f 0b <0f> 0b 0f 0b 0f 0b 90 8d b4 26 00 00 00 00 83 ec 10 8b 0d f4 e3 EIP: [] mem_cgroup_move_account+0xe2/0xf0 SS:ESP 0068:c46f3e30 ---[ end trace 7daa1582159b6532 ]--- lock_page_cgroup and unlock_page_cgroup are implemented using bit_spinlock. bit_spinlock doesn't touch the bit if we are on non-SMP machine, so we can't use the bit to check whether the lock was taken. Let's introduce is_page_cgroup_locked based on bit_spin_is_locked instead of PageCgroupLocked to fix it. [akpm@linux-foundation.org: s/is_page_cgroup_locked/page_is_cgroup_locked/] Signed-off-by: Kirill A. Shutemov Reviewed-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2efa8ea07ff7..62d1880f6992 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2152,7 +2152,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, { VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); - VM_BUG_ON(!PageCgroupLocked(pc)); + VM_BUG_ON(!page_is_cgroup_locked(pc)); VM_BUG_ON(!PageCgroupUsed(pc)); VM_BUG_ON(pc->mem_cgroup != from); -- cgit v1.2.3 From b1dd693e5b9348bd68a80e679e03cf9c0973b01b Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 24 Nov 2010 12:57:06 -0800 Subject: memcg: avoid deadlock between move charge and try_charge() __mem_cgroup_try_charge() can be called under down_write(&mmap_sem)(e.g. mlock does it). This means it can cause deadlock if it races with move charge: Ex.1) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | down_write(&mmap_sem) mc.moving_task = current | .. mem_cgroup_precharge_mc() | __mem_cgroup_try_charge() mem_cgroup_count_precharge() | prepare_to_wait() down_read(&mmap_sem) | if (mc.moving_task) -> cannot aquire the lock | -> true | schedule() Ex.2) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | mc.moving_task = current | mem_cgroup_precharge_mc() | mem_cgroup_count_precharge() | down_read(&mmap_sem) | .. | up_read(&mmap_sem) | | down_write(&mmap_sem) mem_cgroup_move_task() | .. mem_cgroup_move_charge() | __mem_cgroup_try_charge() down_read(&mmap_sem) | prepare_to_wait() -> cannot aquire the lock | if (mc.moving_task) | -> true | schedule() To avoid this deadlock, we do all the move charge works (both can_attach() and attach()) under one mmap_sem section. And after this patch, we set/clear mc.moving_task outside mc.lock, because we use the lock only to check mc.from/to. Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 62d1880f6992..26218df8d19d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -278,13 +278,14 @@ enum move_type { /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { - spinlock_t lock; /* for from, to, moving_task */ + spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; unsigned long precharge; unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ + struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -4631,7 +4632,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4643,7 +4644,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } - up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4694,11 +4694,16 @@ static void mem_cgroup_clear_mc(void) mc.moved_swap = 0; } + if (mc.mm) { + up_read(&mc.mm->mmap_sem); + mmput(mc.mm); + } spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; - mc.moving_task = NULL; spin_unlock(&mc.lock); + mc.moving_task = NULL; + mc.mm = NULL; mem_cgroup_end_move(from); memcg_oom_recover(from); memcg_oom_recover(to); @@ -4724,12 +4729,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { + /* + * We do all the move charge works under one mmap_sem to + * avoid deadlock with down_write(&mmap_sem) + * -> try_charge() -> if (mc.moving_task) -> sleep. + */ + down_read(&mm->mmap_sem); + VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); VM_BUG_ON(mc.moving_task); + VM_BUG_ON(mc.mm); + mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; @@ -4737,14 +4751,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, mc.precharge = 0; mc.moved_charge = 0; mc.moved_swap = 0; - mc.moving_task = current; spin_unlock(&mc.lock); + mc.moving_task = current; + mc.mm = mm; ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - } - mmput(mm); + /* We call up_read() and mmput() in clear_mc(). */ + } else + mmput(mm); } return ret; } @@ -4832,7 +4848,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4851,7 +4867,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } - up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4860,17 +4875,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - struct mm_struct *mm; - - if (!mc.to) + if (!mc.mm) /* no need to move charge */ return; - mm = get_task_mm(p); - if (mm) { - mem_cgroup_move_charge(mm); - mmput(mm); - } + mem_cgroup_move_charge(mc.mm); mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -- cgit v1.2.3 From a42c390cfa0c2612459d7226ba11612847ca3a64 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 24 Nov 2010 12:57:08 -0800 Subject: cgroups: make swap accounting default behavior configurable Swap accounting can be configured by CONFIG_CGROUP_MEM_RES_CTLR_SWAP configuration option and then it is turned on by default. There is a boot option (noswapaccount) which can disable this feature. This makes it hard for distributors to enable the configuration option as this feature leads to a bigger memory consumption and this is a no-go for general purpose distribution kernel. On the other hand swap accounting may be very usuful for some workloads. This patch adds a new configuration option which controls the default behavior (CGROUP_MEM_RES_CTLR_SWAP_ENABLED). If the option is selected then the feature is turned on by default. It also adds a new boot parameter swapaccount[=1|0] which enhances the original noswapaccount parameter semantic by means of enable/disable logic (defaults to 1 if no value is provided to be still consistent with noswapaccount). The default behavior is unchanged (if CONFIG_CGROUP_MEM_RES_CTLR_SWAP is enabled then CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED is enabled as well) Signed-off-by: Michal Hocko Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26218df8d19d..7a22b4129211 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; -static int really_do_swap_account __initdata = 1; /* for remember boot option*/ + +/* for remember boot option*/ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED +static int really_do_swap_account __initdata = 1; +#else +static int really_do_swap_account __initdata = 0; +#endif + #else #define do_swap_account (0) #endif @@ -4920,10 +4927,20 @@ struct cgroup_subsys mem_cgroup_subsys = { }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +static int __init enable_swap_account(char *s) +{ + /* consider enabled if no parameter or 1 is given */ + if (!s || !strcmp(s, "1")) + really_do_swap_account = 1; + else if (!strcmp(s, "0")) + really_do_swap_account = 0; + return 1; +} +__setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - really_do_swap_account = 0; + enable_swap_account("0"); return 1; } __setup("noswapaccount", disable_swap_account); -- cgit v1.2.3 From e9959f0f37160e1f5351af828cc981712b5066c1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 24 Nov 2010 12:57:09 -0800 Subject: mm/page_alloc.c: fix build_all_zonelist() where percpu_alloc() is wrongly called under stop_machine_run() During memory hotplug, build_allzonelists() may be called under stop_machine_run(). In this function, setup_zone_pageset() is called. But it's bug because it will do page allocation under stop_machine_run(). Here is a report from Alok Kataria. BUG: sleeping function called from invalid context at kernel/mutex.c:94 in_atomic(): 0, irqs_disabled(): 1, pid: 4, name: migration/0 Pid: 4, comm: migration/0 Not tainted 2.6.35.6-45.fc14.x86_64 #1 Call Trace: [] __might_sleep+0xeb/0xf0 [] mutex_lock+0x24/0x50 [] pcpu_alloc+0x6d/0x7ee [] ? load_balance+0xbe/0x60e [] ? rt_se_boosted+0x21/0x2f [] ? dequeue_rt_stack+0x18b/0x1ed [] __alloc_percpu+0x10/0x12 [] setup_zone_pageset+0x38/0xbe [] ? build_zonelists_node.clone.58+0x79/0x8c [] __build_all_zonelists+0x419/0x46c [] ? cpu_stopper_thread+0xb2/0x198 [] stop_machine_cpu_stop+0x8e/0xc5 [] ? stop_machine_cpu_stop+0x0/0xc5 [] cpu_stopper_thread+0x108/0x198 [] ? schedule+0x5b2/0x5cc [] ? cpu_stopper_thread+0x0/0x198 [] kthread+0x7f/0x87 [] kernel_thread_helper+0x4/0x10 [] ? kthread+0x0/0x87 [] ? kernel_thread_helper+0x0/0x10 Built 5 zonelists in Node order, mobility grouping on. Total pages: 289456 Policy zone: Normal This patch tries to fix the issue by moving setup_zone_pageset() out from stop_machine_run(). It's obviously not necessary to be called under stop_machine_run(). [akpm@linux-foundation.org: remove unneeded local] Reported-by: Alok Kataria Signed-off-by: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: Petr Vandrovec Cc: Pekka Enberg Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a654486f75..e4092704c1a9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3008,14 +3008,6 @@ static __init_refok int __build_all_zonelists(void *data) build_zonelist_cache(pgdat); } -#ifdef CONFIG_MEMORY_HOTPLUG - /* Setup real pagesets for the new zone */ - if (data) { - struct zone *zone = data; - setup_zone_pageset(zone); - } -#endif - /* * Initialize the boot_pagesets that are going to be used * for bootstrapping processors. The real pagesets for @@ -3064,7 +3056,11 @@ void build_all_zonelists(void *data) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, data, NULL); +#ifdef CONFIG_MEMORY_HOTPLUG + if (data) + setup_zone_pageset((struct zone *)data); +#endif + stop_machine(__build_all_zonelists, NULL, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v1.2.3 From 5f0af70a25593a9d53b87bc8d31902fb7cc63e40 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 24 Nov 2010 12:57:10 -0800 Subject: mm: remove call to find_vma in pagewalk for non-hugetlbfs Commit d33b9f45 ("mm: hugetlb: fix hugepage memory leak in walk_page_range()") introduces a check if a vma is a hugetlbfs one and later in 5dc37642 ("mm hugetlb: add hugepage support to pagemap") it is moved under #ifdef CONFIG_HUGETLB_PAGE but a needless find_vma call is left behind and its result is not used anywhere else in the function. The side-effect of caching vma for @addr inside walk->mm is neither utilized in walk_page_range() nor in called functions. Signed-off-by: David Sterba Reviewed-by: Naoya Horiguchi Acked-by: Andi Kleen Cc: Andy Whitcroft Cc: David Rientjes Cc: Hugh Dickins Cc: Lee Schermerhorn Cc: Matt Mackall Acked-by: Mel Gorman Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8b1a2ce21ee5..38cc58b8b2b0 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -139,7 +139,6 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd_t *pgd; unsigned long next; int err = 0; - struct vm_area_struct *vma; if (addr >= end) return err; @@ -149,15 +148,17 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { + struct vm_area_struct *uninitialized_var(vma); + next = pgd_addr_end(addr, end); +#ifdef CONFIG_HUGETLB_PAGE /* * handle hugetlb vma individually because pagetable walk for * the hugetlb page is dependent on the architecture and * we can't handled it in the same manner as non-huge pages. */ vma = find_vma(walk->mm, addr); -#ifdef CONFIG_HUGETLB_PAGE if (vma && is_vm_hugetlb_page(vma)) { if (vma->vm_end < next) next = vma->vm_end; -- cgit v1.2.3 From 85beb5869a4f6abb52a7cf8e01de6fa57e9ee47d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 24 Nov 2010 16:23:34 -0500 Subject: tracing/slab: Move kmalloc tracepoint out of inline code The tracepoint for kmalloc is in the slab inlined code which causes every instance of kmalloc to have the tracepoint. This patch moves the tracepoint out of the inline code to the slab C file, which removes a large number of inlined trace points. objdump -dr vmlinux.slab| grep 'jmpq.* Signed-off-by: Pekka Enberg --- mm/slab.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index b1e40dafbab3..dfcc8885d7d5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3653,11 +3653,18 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +void * +kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); + void *ret; + + ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); + + trace_kmalloc(_RET_IP_, ret, + size, slab_buffer_size(cachep), flags); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_trace); #endif /** @@ -3705,31 +3712,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid) +void *kmem_cache_alloc_node_trace(size_t size, + struct kmem_cache *cachep, + gfp_t flags, + int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, + void *ret; + + ret = __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0)); + trace_kmalloc_node(_RET_IP_, ret, + size, slab_buffer_size(cachep), + flags, nodeid); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; - void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - - trace_kmalloc_node((unsigned long) caller, ret, - size, cachep->buffer_size, flags, node); - - return ret; + return kmem_cache_alloc_node_trace(size, cachep, flags, node); } #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) -- cgit v1.2.3 From fa9f90be745d3b600a9d97a063be404c5e5d9071 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 28 Nov 2010 21:39:34 +0100 Subject: =?UTF-8?q?Kill=20off=20a=20bunch=20of=20warning:=20=E2=80=98inlin?= =?UTF-8?q?e=E2=80=99=20is=20not=20at=20beginning=20of=20declaration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These warnings are spewed during a build of a 'allnoconfig' kernel (especially the ones from u64_stats_sync.h show up a lot) when building with -Wextra (which I often do).. They are a) annoying b) easy to get rid of. This patch kills them off. include/linux/u64_stats_sync.h:70:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:77:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:84:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:96:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:115:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:127:1: warning: ‘inline’ is not at beginning of declaration kernel/time.c:241:1: warning: ‘inline’ is not at beginning of declaration kernel/time.c:257:1: warning: ‘inline’ is not at beginning of declaration kernel/perf_event.c:4513:1: warning: ‘inline’ is not at beginning of declaration mm/page_alloc.c:4012:1: warning: ‘inline’ is not at beginning of declaration Signed-off-by: Jesper Juhl Signed-off-by: Jiri Kosina --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a654486f75..f823ee192e94 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4013,7 +4013,7 @@ static void __init setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); } #else -static void inline setup_usemap(struct pglist_data *pgdat, +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3 From 6072d13c429373c5d63b69dadbbef40a9b035552 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 1 Dec 2010 13:35:19 -0500 Subject: Call the filesystem back whenever a page is removed from the page cache NFS needs to be able to release objects that are stored in the page cache once the page itself is no longer visible from the page cache. This patch adds a callback to the address space operations that allows filesystems to perform page cleanups once the page has been removed from the page cache. Original patch by: Linus Torvalds [trondmy: cover the cases of invalidate_inode_pages2() and truncate_inode_pages()] Signed-off-by: Trond Myklebust --- mm/filemap.c | 5 +++++ mm/truncate.c | 4 ++++ mm/vmscan.c | 7 +++++++ 3 files changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index ea89840fc65f..6b9aee20f242 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -143,13 +143,18 @@ void __remove_from_page_cache(struct page *page) void remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); + freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage) + freepage(page); } EXPORT_SYMBOL(remove_from_page_cache); diff --git a/mm/truncate.c b/mm/truncate.c index ba887bff48c5..3c2d5ddfa0d4 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(page); + page_cache_release(page); /* pagecache ref */ return 1; failed: diff --git a/mm/vmscan.c b/mm/vmscan.c index d31d7ce52c0e..9ca587c69274 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -494,9 +494,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) spin_unlock_irq(&mapping->tree_lock); swapcache_free(swap, page); } else { + void (*freepage)(struct page *); + + freepage = mapping->a_ops->freepage; + __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage != NULL) + freepage(page); } return 1; -- cgit v1.2.3 From 1f64d69c7ad2e48e697493e45590679f7a69b7b2 Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Thu, 2 Dec 2010 14:31:12 -0800 Subject: mm/hugetlb.c: avoid double unlock_page() in hugetlb_fault() Have hugetlb_fault() call unlock_page(page) only if it had previously called lock_page(page). Setting CONFIG_DEBUG_VM=y and then running the libhugetlbfs test suite, resulted in the tripping of VM_BUG_ON(!PageLocked(page)) in unlock_page() having been called by hugetlb_fault() when page == pagecache_page. This patch remedied the problem. Signed-off-by: Dean Nelson Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c4a3558589ab..85855240933d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2738,7 +2738,8 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } - unlock_page(page); + if (page != pagecache_page) + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); -- cgit v1.2.3 From 55cfaa3cbdd29c4919ecb5fb8965c310f357e48c Mon Sep 17 00:00:00 2001 From: Zeng Zhaoming Date: Thu, 2 Dec 2010 14:31:13 -0800 Subject: mm/mempolicy.c: add rcu read lock to protect pid structure find_task_by_vpid() should be protected by rcu_read_lock(), to prevent free_pid() reclaiming pid. Signed-off-by: Zeng Zhaoming Cc: "Paul E. McKenney" Cc: KOSAKI Motohiro Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4a57f135b76e..11ff260fb282 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1307,15 +1307,18 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out; /* Find the mm_struct */ + rcu_read_lock(); read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); + rcu_read_unlock(); err = -ESRCH; goto out; } mm = get_task_mm(task); read_unlock(&tasklist_lock); + rcu_read_unlock(); err = -EINVAL; if (!mm) -- cgit v1.2.3 From e172662d113ceb22db727a979bb35b9c02f703b5 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 2 Dec 2010 14:31:13 -0800 Subject: vmstat: fix dirty threshold ordering The nr_dirty_[background_]threshold fields are misplaced before the numa_* fields, and users will read strange values. This is the right order. Before patch, nr_dirty_background_threshold will read as 0 (the value from numa_miss). numa_hit 128501 numa_miss 0 numa_foreign 0 numa_interleave 7388 numa_local 128501 numa_other 0 nr_dirty_threshold 144291 nr_dirty_background_threshold 72145 Signed-off-by: Wu Fengguang Cc: Michael Rubin Reviewed-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 42eac4d33216..8f62f17ee1c7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -750,8 +750,6 @@ static const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", - "nr_dirty_threshold", - "nr_dirty_background_threshold", #ifdef CONFIG_NUMA "numa_hit", @@ -761,6 +759,8 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_dirty_threshold", + "nr_dirty_background_threshold", #ifdef CONFIG_VM_EVENT_COUNTERS "pgpgin", -- cgit v1.2.3 From 64141da587241301ce8638cc945f8b67853156ec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Dec 2010 14:31:18 -0800 Subject: vmalloc: eagerly clear ptes on vunmap On stock 2.6.37-rc4, running: # mount lilith:/export /mnt/lilith # find /mnt/lilith/ -type f -print0 | xargs -0 file crashes the machine fairly quickly under Xen. Often it results in oops messages, but the couple of times I tried just now, it just hung quietly and made Xen print some rude messages: (XEN) mm.c:2389:d80 Bad type (saw 7400000000000001 != exp 3000000000000000) for mfn 1d7058 (pfn 18fa7) (XEN) mm.c:964:d80 Attempt to create linear p.t. with write perms (XEN) mm.c:2389:d80 Bad type (saw 7400000000000010 != exp 1000000000000000) for mfn 1d2e04 (pfn 1d1fb) (XEN) mm.c:2965:d80 Error while pinning mfn 1d2e04 Which means the domain tried to map a pagetable page RW, which would allow it to map arbitrary memory, so Xen stopped it. This is because vm_unmap_ram() left some pages mapped in the vmalloc area after NFS had finished with them, and those pages got recycled as pagetable pages while still having these RW aliases. Removing those mappings immediately removes the Xen-visible aliases, and so it has no problem with those pages being reused as pagetable pages. Deferring the TLB flush doesn't upset Xen because it can flush the TLB itself as needed to maintain its invariants. When unmapping a region in the vmalloc space, clear the ptes immediately. There's no point in deferring this because there's no amortization benefit. The TLBs are left dirty, and they are flushed lazily to amortize the cost of the IPIs. This specific motivation for this patch is an oops-causing regression since 2.6.36 when using NFS under Xen, triggered by the NFS client's use of vm_map_ram() introduced in 56e4ebf877b60 ("NFS: readdir with vmapped pages") . XFS also uses vm_map_ram() and could cause similar problems. Signed-off-by: Jeremy Fitzhardinge Cc: Nick Piggin Cc: Bryan Schumaker Cc: Trond Myklebust Cc: Alex Elder Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3d66b3dc5cb..eb5cc7d00c5a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,8 +31,6 @@ #include #include -bool vmap_lazy_unmap __read_mostly = true; - /*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) @@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void) { unsigned int log; - if (!vmap_lazy_unmap) - return 0; - log = fls(num_online_cpus()); return log * (32UL * 1024 * 1024 / PAGE_SIZE); @@ -566,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (va->va_end > *end) *end = va->va_end; nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - unmap_vmap_area(va); list_add_tail(&va->purge_list, &valist); va->flags |= VM_LAZY_FREEING; va->flags &= ~VM_LAZY_FREE; @@ -611,10 +605,11 @@ static void purge_vmap_area_lazy(void) } /* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. + * Free a vmap area, caller ensuring that the area has been unmapped + * and flush_cache_vunmap had been called for the correct range + * previously. */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) +static void free_vmap_area_noflush(struct vmap_area *va) { va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -622,6 +617,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va) try_purge_vmap_area_lazy(); } +/* + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been + * called for the correct range previously. + */ +static void free_unmap_vmap_area_noflush(struct vmap_area *va) +{ + unmap_vmap_area(va); + free_vmap_area_noflush(va); +} + /* * Free and unmap a vmap area */ @@ -798,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb) spin_unlock(&vmap_block_tree_lock); BUG_ON(tmp != vb); - free_unmap_vmap_area_noflush(vb->va); + free_vmap_area_noflush(vb->va); call_rcu(&vb->rcu_head, rcu_free_vb); } @@ -936,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size) rcu_read_unlock(); BUG_ON(!vb); + vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + spin_lock(&vb->lock); BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); @@ -988,7 +995,6 @@ void vm_unmap_aliases(void) s = vb->va->va_start + (i << PAGE_SHIFT); e = vb->va->va_start + (j << PAGE_SHIFT); - vunmap_page_range(s, e); flush = 1; if (s < start) -- cgit v1.2.3 From 20d6c96b5f1cad5c5da4641945ec17a1d9a1afc8 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Dec 2010 14:31:19 -0800 Subject: mem-hotplug: introduce {un}lock_memory_hotplug() Presently hwpoison is using lock_system_sleep() to prevent a race with memory hotplug. However lock_system_sleep() is a no-op if CONFIG_HIBERNATION=n. Therefore we need a new lock. Signed-off-by: KOSAKI Motohiro Cc: Andi Kleen Cc: Kamezawa Hiroyuki Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 8 ++++---- mm/memory_hotplug.c | 31 ++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 124324134ff6..46ab2c044b0e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -1230,11 +1231,10 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) return 1; /* - * The lock_system_sleep prevents a race with memory hotplug, - * because the isolation assumes there's only a single user. + * The lock_memory_hotplug prevents a race with memory hotplug. * This is a big hammer, a better would be nicer. */ - lock_system_sleep(); + lock_memory_hotplug(); /* * Isolate the page, so that it doesn't get reallocated if it @@ -1264,7 +1264,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) ret = 1; } unset_migratetype_isolate(p); - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9260314a221e..2c6523af5473 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -34,6 +34,23 @@ #include "internal.h" +DEFINE_MUTEX(mem_hotplug_mutex); + +void lock_memory_hotplug(void) +{ + mutex_lock(&mem_hotplug_mutex); + + /* for exclusive hibernation if CONFIG_HIBERNATION=y */ + lock_system_sleep(); +} + +void unlock_memory_hotplug(void) +{ + unlock_system_sleep(); + mutex_unlock(&mem_hotplug_mutex); +} + + /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { @@ -493,7 +510,7 @@ int mem_online_node(int nid) pg_data_t *pgdat; int ret; - lock_system_sleep(); + lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); if (pgdat) { ret = -ENOMEM; @@ -504,7 +521,7 @@ int mem_online_node(int nid) BUG_ON(ret); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } @@ -516,7 +533,7 @@ int __ref add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; - lock_system_sleep(); + lock_memory_hotplug(); res = register_memory_resource(start, size); ret = -EEXIST; @@ -563,7 +580,7 @@ error: release_memory_resource(res); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } EXPORT_SYMBOL_GPL(add_memory); @@ -791,7 +808,7 @@ static int offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - lock_system_sleep(); + lock_memory_hotplug(); zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); @@ -880,7 +897,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - unlock_system_sleep(); + unlock_memory_hotplug(); return 0; failed_removal: @@ -891,7 +908,7 @@ failed_removal: undo_isolate_page_range(start_pfn, end_pfn); out: - unlock_system_sleep(); + unlock_memory_hotplug(); return ret; } -- cgit v1.2.3 From a0b0f58cdd32ab363a600a294ddaa90f0c32de8c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 2 Dec 2010 14:31:20 -0800 Subject: ksm: annotate ksm_thread_mutex is no deadlock source commit 62b61f611e ("ksm: memory hotremove migration only") caused the following new lockdep warning. ======================================================= [ INFO: possible circular locking dependency detected ] ------------------------------------------------------- bash/1621 is trying to acquire lock: ((memory_chain).rwsem){.+.+.+}, at: [] __blocking_notifier_call_chain+0x69/0xc0 but task is already holding lock: (ksm_thread_mutex){+.+.+.}, at: [] ksm_memory_callback+0x3a/0xc0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (ksm_thread_mutex){+.+.+.}: [] lock_acquire+0xaa/0x140 [] __mutex_lock_common+0x44/0x3f0 [] mutex_lock_nested+0x48/0x60 [] ksm_memory_callback+0x3a/0xc0 [] notifier_call_chain+0x8c/0xe0 [] __blocking_notifier_call_chain+0x7e/0xc0 [] blocking_notifier_call_chain+0x16/0x20 [] memory_notify+0x1b/0x20 [] remove_memory+0x1cc/0x5f0 [] memory_block_change_state+0xfd/0x1a0 [] store_mem_state+0xe2/0xf0 [] sysdev_store+0x20/0x30 [] sysfs_write_file+0xe6/0x170 [] vfs_write+0xc8/0x190 [] sys_write+0x54/0x90 [] system_call_fastpath+0x16/0x1b -> #0 ((memory_chain).rwsem){.+.+.+}: [] __lock_acquire+0x155a/0x1600 [] lock_acquire+0xaa/0x140 [] down_read+0x51/0xa0 [] __blocking_notifier_call_chain+0x69/0xc0 [] blocking_notifier_call_chain+0x16/0x20 [] memory_notify+0x1b/0x20 [] remove_memory+0x56e/0x5f0 [] memory_block_change_state+0xfd/0x1a0 [] store_mem_state+0xe2/0xf0 [] sysdev_store+0x20/0x30 [] sysfs_write_file+0xe6/0x170 [] vfs_write+0xc8/0x190 [] sys_write+0x54/0x90 [] system_call_fastpath+0x16/0x1b But it's a false positive. Both memory_chain.rwsem and ksm_thread_mutex have an outer lock (mem_hotplug_mutex). So they cannot deadlock. Thus, This patch annotate ksm_thread_mutex is not deadlock source. [akpm@linux-foundation.org: update comment, from Hugh] Signed-off-by: KOSAKI Motohiro Acked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Andi Kleen Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 65ab5c7067d9..43bc893470b4 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1724,8 +1724,13 @@ static int ksm_memory_callback(struct notifier_block *self, /* * Keep it very simple for now: just lock out ksmd and * MADV_UNMERGEABLE while any memory is going offline. + * mutex_lock_nested() is necessary because lockdep was alarmed + * that here we take ksm_thread_mutex inside notifier chain + * mutex, and later take notifier chain mutex inside + * ksm_thread_mutex to unlock it. But that's safe because both + * are inside mem_hotplug_mutex. */ - mutex_lock(&ksm_thread_mutex); + mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); break; case MEM_OFFLINE: -- cgit v1.2.3 From 8165984acf825917437debae519209073c32a5a7 Mon Sep 17 00:00:00 2001 From: Tero Roponen Date: Wed, 1 Dec 2010 20:04:20 +0200 Subject: slub: Fix a crash during slabinfo -v Commit f7cb1933621bce66a77f690776a16fe3ebbc4d58 ("SLUB: Pass active and inactive redzone flags instead of boolean to debug functions") missed two instances of check_object(). This caused a lot of warnings during 'slabinfo -v' finally leading to a crash: BUG ext4_xattr: Freepointer corrupt ... BUG buffer_head: Freepointer corrupt ... BUG ext4_alloc_context: Freepointer corrupt ... ... BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] file_sb_list_del+0x1c/0x35 PGD 79d78067 PUD 79e67067 PMD 0 Oops: 0002 [#1] SMP last sysfs file: /sys/kernel/slab/:t-0000192/validate This patch fixes the problem by converting the two missed instances. Acked-by: Christoph Lameter Signed-off-by: Tero Roponen Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7796a0446b3f..48d82a55f4b6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3417,13 +3417,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page, for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); - if (!check_object(s, page, p, 0)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) return 0; } for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, 1)) + if (!check_object(s, page, p, SLUB_RED_ACTIVE)) return 0; return 1; } -- cgit v1.2.3 From 37d57443d5d810c6ef49e93586b046e7d4774818 Mon Sep 17 00:00:00 2001 From: Tero Roponen Date: Wed, 1 Dec 2010 20:04:20 +0200 Subject: slub: Fix a crash during slabinfo -v Commit f7cb1933621bce66a77f690776a16fe3ebbc4d58 ("SLUB: Pass active and inactive redzone flags instead of boolean to debug functions") missed two instances of check_object(). This caused a lot of warnings during 'slabinfo -v' finally leading to a crash: BUG ext4_xattr: Freepointer corrupt ... BUG buffer_head: Freepointer corrupt ... BUG ext4_alloc_context: Freepointer corrupt ... ... BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [] file_sb_list_del+0x1c/0x35 PGD 79d78067 PUD 79e67067 PMD 0 Oops: 0002 [#1] SMP last sysfs file: /sys/kernel/slab/:t-0000192/validate This patch fixes the problem by converting the two missed instances. Acked-by: Christoph Lameter Signed-off-by: Tero Roponen Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 981fb730aa04..bec0e355fbad 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3401,13 +3401,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page, for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); - if (!check_object(s, page, p, 0)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) return 0; } for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, 1)) + if (!check_object(s, page, p, SLUB_RED_ACTIVE)) return 0; return 1; } -- cgit v1.2.3 From c9e664f1fdf34aa8cede047b206deaa8f1945af0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Dec 2010 22:57:45 +0100 Subject: PM / Hibernate: Fix memory corruption related to swap There is a problem that swap pages allocated before the creation of a hibernation image can be released and used for storing the contents of different memory pages while the image is being saved. Since the kernel stored in the image doesn't know of that, it causes memory corruption to occur after resume from hibernation, especially on systems with relatively small RAM that need to swap often. This issue can be addressed by keeping the GFP_IOFS bits clear in gfp_allowed_mask during the entire hibernation, including the saving of the image, until the system is finally turned off or the hibernation is aborted. Unfortunately, for this purpose it's necessary to rework the way in which the hibernate and suspend code manipulates gfp_allowed_mask. This change is based on an earlier patch from Hugh Dickins. Signed-off-by: Rafael J. Wysocki Reported-by: Ondrej Zary Acked-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: stable@kernel.org --- mm/page_alloc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e4092704c1a9..ff7e15872398 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; * only be modified with pm_mutex held, unless the suspend/hibernate code is * guaranteed not to run in parallel with that modification). */ -void set_gfp_allowed_mask(gfp_t mask) + +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&pm_mutex)); - gfp_allowed_mask = mask; + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } } -gfp_t clear_gfp_allowed_mask(gfp_t mask) +void pm_restrict_gfp_mask(void) { - gfp_t ret = gfp_allowed_mask; - WARN_ON(!mutex_is_locked(&pm_mutex)); - gfp_allowed_mask &= ~mask; - return ret; + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~GFP_IOFS; } #endif /* CONFIG_PM_SLEEP */ -- cgit v1.2.3 From 7af4c0932437f97938eef67e553c8d211f9edf33 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sat, 30 Oct 2010 15:56:54 +0200 Subject: percpu: zero memory more efficiently in mm/percpu.c::pcpu_mem_alloc() Don't do vmalloc() + memset() when vzalloc() will do. tj: dropped unnecessary temp variable ptr. Signed-off-by: Jesper Juhl Signed-off-by: Tejun Heo --- mm/percpu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index efe816856a9d..9e16d1c9ebd5 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size) if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); - else { - void *ptr = vmalloc(size); - if (ptr) - memset(ptr, 0, size); - return ptr; - } + else + return vzalloc(size); } /** -- cgit v1.2.3 From afe2c511fb2d75f1515081ff1be15bd79cfe722d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 14 Dec 2010 16:21:17 +0100 Subject: workqueue: convert cancel_rearming_delayed_work[queue]() users to cancel_delayed_work_sync() cancel_rearming_delayed_work[queue]() has been superceded by cancel_delayed_work_sync() quite some time ago. Convert all the in-kernel users. The conversions are completely equivalent and trivial. Signed-off-by: Tejun Heo Acked-by: "David S. Miller" Acked-by: Greg Kroah-Hartman Acked-by: Evgeniy Polyakov Cc: Jeff Garzik Cc: Benjamin Herrenschmidt Cc: Mauro Carvalho Chehab Cc: netdev@vger.kernel.org Cc: Anton Vorontsov Cc: David Woodhouse Cc: "J. Bruce Fields" Cc: Neil Brown Cc: Alex Elder Cc: xfs-masters@oss.sgi.com Cc: Christoph Lameter Cc: Pekka Enberg Cc: Andrew Morton Cc: netfilter-devel@vger.kernel.org Cc: Trond Myklebust Cc: linux-nfs@vger.kernel.org --- mm/slab.c | 2 +- mm/vmstat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index b1e40dafbab3..dc983867682b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, * anything expensive but will only modify reap_work * and reschedule the timer. */ - cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); + cancel_delayed