From 7a401a972df8e184b3d1a3fc958c0a4ddee8d312 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Fri, 11 Nov 2011 13:29:04 +0100 Subject: backing-dev: ensure wakeup_timer is deleted bdi_prune_sb() in bdi_unregister() attempts to removes the bdi links from all super_blocks and then del_timer_sync() the writeback timer. However, this can race with __mark_inode_dirty(), leading to bdi_wakeup_thread_delayed() rearming the writeback timer on the bdi we're unregistering, after we've called del_timer_sync(). This can end up with the bdi being freed with an active timer inside it, as in the case of the following dump after the removal of an SD card. Fix this by redoing the del_timer_sync() in bdi_destory(). ------------[ cut here ]------------ WARNING: at /home/rabin/kernel/arm/lib/debugobjects.c:262 debug_print_object+0x9c/0xc8() ODEBUG: free active (active state 0) object type: timer_list hint: wakeup_timer_fn+0x0/0x180 Modules linked in: Backtrace: [] (dump_backtrace+0x0/0x110) from [] (dump_stack+0x18/0x1c) r6:c02bc638 r5:00000106 r4:c79f5d18 r3:00000000 [] (dump_stack+0x0/0x1c) from [] (warn_slowpath_common+0x54/0x6c) [] (warn_slowpath_common+0x0/0x6c) from [] (warn_slowpath_fmt+0x38/0x40) r8:20000013 r7:c780c6f0 r6:c031613c r5:c780c6f0 r4:c02b1b29 r3:00000009 [] (warn_slowpath_fmt+0x0/0x40) from [] (debug_print_object+0x9c/0xc8) r3:c02b1b29 r2:c02bc662 [] (debug_print_object+0x0/0xc8) from [] (debug_check_no_obj_freed+0xac/0x1dc) r6:c7964000 r5:00000001 r4:c7964000 [] (debug_check_no_obj_freed+0x0/0x1dc) from [] (kmem_cache_free+0x88/0x1f8) [] (kmem_cache_free+0x0/0x1f8) from [] (blk_release_queue+0x70/0x78) [] (blk_release_queue+0x0/0x78) from [] (kobject_release+0x70/0x84) r5:c79641f0 r4:c796420c [] (kobject_release+0x0/0x84) from [] (kref_put+0x68/0x80) r7:00000083 r6:c74083d0 r5:c015289c r4:c796420c [] (kref_put+0x0/0x80) from [] (kobject_put+0x48/0x5c) r5:c79643b4 r4:c79641f0 [] (kobject_put+0x0/0x5c) from [] (blk_cleanup_queue+0x68/0x74) r4:c7964000 [] (blk_cleanup_queue+0x0/0x74) from [] (mmc_blk_put+0x78/0xe8) r5:00000000 r4:c794c400 [] (mmc_blk_put+0x0/0xe8) from [] (mmc_blk_release+0x24/0x38) r5:c794c400 r4:c0322824 [] (mmc_blk_release+0x0/0x38) from [] (__blkdev_put+0xe8/0x170) r5:c78d5e00 r4:c74083c0 [] (__blkdev_put+0x0/0x170) from [] (blkdev_put+0x11c/0x12c) r8:c79f5f70 r7:00000001 r6:c74083d0 r5:00000083 r4:c74083c0 r3:00000000 [] (blkdev_put+0x0/0x12c) from [] (kill_block_super+0x60/0x6c) r7:c7942300 r6:c79f4000 r5:00000083 r4:c74083c0 [] (kill_block_super+0x0/0x6c) from [] (deactivate_locked_super+0x44/0x70) r6:c79f4000 r5:c031af64 r4:c794dc00 r3:c00b06c4 [] (deactivate_locked_super+0x0/0x70) from [] (deactivate_super+0x6c/0x70) r5:c794dc00 r4:c794dc00 [] (deactivate_super+0x0/0x70) from [] (mntput_no_expire+0x188/0x194) r5:c794dc00 r4:c7942300 [] (mntput_no_expire+0x0/0x194) from [] (sys_umount+0x2e4/0x310) r6:c7942300 r5:00000000 r4:00000000 r3:00000000 [] (sys_umount+0x0/0x310) from [] (ret_fast_syscall+0x0/0x30) ---[ end trace e5c83c92ada51c76 ]--- Cc: stable@kernel.org Signed-off-by: Rabin Vincent Signed-off-by: Linus Walleij Signed-off-by: Jens Axboe --- mm/backing-dev.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a0860640378d..71034f41a2ba 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -724,6 +724,14 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); + /* + * If bdi_unregister() had already been called earlier, the + * wakeup_timer could still be armed because bdi_prune_sb() + * can race with the bdi_wakeup_thread_delayed() calls from + * __mark_inode_dirty(). + */ + del_timer_sync(&bdi->wb.wakeup_timer); + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); -- cgit v1.2.3 From 5aecc85abdb9ac2b0e6548d13652a34142e7ae89 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 15 Nov 2011 14:36:07 -0800 Subject: oom: do not kill tasks with oom_score_adj OOM_SCORE_ADJ_MIN Commit c9f01245 ("oom: remove oom_disable_count") has removed the oom_disable_count counter which has been used for early break out from oom_badness so we could never select a task with oom_score_adj set to OOM_SCORE_ADJ_MIN (oom disabled). Now that the counter is gone we are always going through heuristics calculation and we always return a non zero positive value. This means that we can end up killing a task with OOM disabled because it is indistinguishable from regular tasks with 1% resp. CAP_SYS_ADMIN tasks with 3% usage of memory or tasks with oom_score_adj set but OOM enabled. Let's break out early if the task should have OOM disabled. Signed-off-by: Michal Hocko Acked-by: David Rientjes Acked-by: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 471dedb463ab..76f2c5ae908e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -185,6 +185,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, if (!p) return 0; + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + task_unlock(p); + return 0; + } + /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. -- cgit v1.2.3 From ea4039a34c4c206d015d34a49d0b00868e37db1d Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 15 Nov 2011 14:36:12 -0800 Subject: hugetlb: release pages in the error path of hugetlb_cow() If we fail to prepare an anon_vma, the {new, old}_page should be released, or they will leak. Signed-off-by: Hillf Danton Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dae27ba3be2c..bb28a5f9db8d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2422,6 +2422,8 @@ retry_avoidcopy: * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { + page_cache_release(new_page); + page_cache_release(old_page); /* Caller expects lock to be held */ spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; -- cgit v1.2.3 From 499d05ecf990a7a7bbf9e0a273f9969f8ec69efc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Nov 2011 19:34:48 +0800 Subject: mm: Make task in balance_dirty_pages() killable There is no reason why task in balance_dirty_pages() shouldn't be killable and it helps in recovering from some error conditions (like when filesystem goes in error state and cannot accept writeback anymore but we still want to kill processes using it to be able to unmount it). There will be follow up patches to further abort the generic_perform_write() and other filesystem write loops, to avoid large write + SIGKILL combination exceeding the dirty limit and possibly strange OOM. Reported-by: Kazuya Mio Tested-by: Kazuya Mio Reviewed-by: Neil Brown Reviewed-by: KOSAKI Motohiro Signed-off-by: Jan Kara Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a3278f005230..79c34419fadd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1133,7 +1133,7 @@ pause: pages_dirtied, pause, start_time); - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); dirty_thresh = hard_dirty_limit(dirty_thresh); @@ -1145,6 +1145,9 @@ pause: */ if (nr_dirty < dirty_thresh) break; + + if (fatal_signal_pending(current)) + break; } if (!dirty_exceeded && bdi->dirty_exceeded) -- cgit v1.2.3 From cd12909cb576d37311fe35868780e82d5007d0c8 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 29 Sep 2011 16:53:32 +0100 Subject: xen: map foreign pages for shared rings by updating the PTEs directly When mapping a foreign page with xenbus_map_ring_valloc() with the GNTTABOP_map_grant_ref hypercall, set the GNTMAP_contains_pte flag and pass a pointer to the PTE (in init_mm). After the page is mapped, the usual fault mechanism can be used to update additional MMs. This allows the vmalloc_sync_all() to be removed from alloc_vm_area(). Signed-off-by: David Vrabel Acked-by: Andrew Morton [v1: Squashed fix by Michal for no-mmu case] Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Michal Simek --- mm/nommu.c | 2 +- mm/vmalloc.c | 27 +++++++++++++-------------- 2 files changed, 14 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 73419c55eda6..b982290fd962 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -454,7 +454,7 @@ void __attribute__((weak)) vmalloc_sync_all(void) * between processes, it syncs the pagetable across all * processes. */ -struct vm_struct *alloc_vm_area(size_t size) +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b669aa6f6caf..3231bf332878 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2141,23 +2141,30 @@ void __attribute__((weak)) vmalloc_sync_all(void) static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) { - /* apply_to_page_range() does all the hard work. */ + pte_t ***p = data; + + if (p) { + *(*p) = pte; + (*p)++; + } return 0; } /** * alloc_vm_area - allocate a range of kernel address space * @size: size of the area + * @ptes: returns the PTEs for the address space * * Returns: NULL on failure, vm_struct on success * * This function reserves a range of kernel address space, and * allocates pagetables to map that range. No actual mappings - * are created. If the kernel address space is not shared - * between processes, it syncs the pagetable across all - * processes. + * are created. + * + * If @ptes is non-NULL, pointers to the PTEs (in init_mm) + * allocated for the VM area are returned. */ -struct vm_struct *alloc_vm_area(size_t size) +struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { struct vm_struct *area; @@ -2171,19 +2178,11 @@ struct vm_struct *alloc_vm_area(size_t size) * of kernel virtual address space and mapped into init_mm. */ if (apply_to_page_range(&init_mm, (unsigned long)area->addr, - area->size, f, NULL)) { + size, f, ptes ? &ptes : NULL)) { free_vm_area(area); return NULL; } - /* - * If the allocated address space is passed to a hypercall - * before being used then we cannot rely on a page fault to - * trigger an update of the page tables. So sync all the page - * tables here. - */ - vmalloc_sync_all(); - return area; } EXPORT_SYMBOL_GPL(alloc_vm_area); -- cgit v1.2.3 From 1df647197c5b8aacaeb58592cba9a1df322c9000 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 13 Nov 2011 19:47:32 -0600 Subject: writeback: hard throttle 1000+ dd on a slow USB stick The sleep based balance_dirty_pages() can pause at most MAX_PAUSE=200ms on every 1 4KB-page, which means it cannot throttle a task under 4KB/200ms=20KB/s. So when there are more than 512 dd writing to a 10MB/s USB stick, its bdi dirty pages could grow out of control. Even if we can increase MAX_PAUSE, the minimal (task_ratelimit = 1) means a limit of 4KB/s. They can eventually be safeguarded by the global limit check (nr_dirty < dirty_thresh). However if someone is also writing to an HDD at the same time, it'll get poor HDD write performance. We at least want to maintain good write performance for other devices when one device is attacked by some "massive parallel" workload, or suffers from slow write bandwidth, or somehow get stalled due to some error condition (eg. NFS server not responding). For a stalled device, we need to completely block its dirtiers, too, before its bdi dirty pages grow all the way up to the global limit and leave no space for the other functional devices. So change the loop exit condition to /* * Always enforce global dirty limit; also enforce bdi dirty limit * if the normal max_pause sleeps cannot keep things under control. */ if (nr_dirty < dirty_thresh && (bdi_dirty < bdi_thresh || bdi->dirty_ratelimit > 1)) break; which can be further simplified to if (task_ratelimit) break; Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 79c34419fadd..e7cb5ff6e53d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1136,14 +1136,11 @@ pause: __set_current_state(TASK_KILLABLE); io_schedule_timeout(pause); - dirty_thresh = hard_dirty_limit(dirty_thresh); /* - * max-pause area. If dirty exceeded but still within this - * area, no need to sleep for more than 200ms: (a) 8 pages per - * 200ms is typically more than enough to curb heavy dirtiers; - * (b) the pause time limit makes the dirtiers more responsive. + * This is typically equal to (nr_dirty < dirty_thresh) and can + * also keep "1000+ dd on a slow USB stick" under control. */ - if (nr_dirty < dirty_thresh) + if (task_ratelimit) break; if (fatal_signal_pending(current)) -- cgit v1.2.3 From 468e6a20afaccb67e2a7d7f60d301f90e1c6f301 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 7 Sep 2011 10:41:32 -0600 Subject: writeback: remove vm_dirties and task->dirties They are not used any more. Signed-off-by: Wu Fengguang --- mm/page-writeback.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e7cb5ff6e53d..71252486bc6f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -128,7 +128,6 @@ unsigned long global_dirty_limit; * */ static struct prop_descriptor vm_completions; -static struct prop_descriptor vm_dirties; /* * couple the period to the dirty_ratio: @@ -154,7 +153,6 @@ static void update_completion_period(void) { int shift = calc_period_shift(); prop_change_shift(&vm_completions, shift); - prop_change_shift(&vm_dirties, shift); writeback_set_ratelimit(); } @@ -235,11 +233,6 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) } EXPORT_SYMBOL_GPL(bdi_writeout_inc); -void task_dirty_inc(struct task_struct *tsk) -{ - prop_inc_single(&vm_dirties, &tsk->dirties); -} - /* * Obtain an accurate fraction of the BDI's portion. */ @@ -1395,7 +1388,6 @@ void __init page_writeback_init(void) shift = calc_period_shift(); prop_descriptor_init(&vm_completions, shift); - prop_descriptor_init(&vm_dirties, shift); } /** @@ -1724,7 +1716,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); - task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } } -- cgit v1.2.3 From 90459ce06f410b983540be56209c0abcbce23944 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 4 Aug 2011 11:02:33 +0200 Subject: percpu: rename pcpu_mem_alloc to pcpu_mem_zalloc Currently pcpu_mem_alloc() is implemented always return zeroed memory. So rename it to make user like pcpu_get_pages_and_bitmap() know don't reinit it. Signed-off-by: Bob Liu Reviewed-by: Pekka Enberg Reviewed-by: Michal Hocko Signed-off-by: Tejun Heo --- mm/percpu-vm.c | 5 ++--- mm/percpu.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index ea534960a04b..29e3730d2ffd 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -50,14 +50,13 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, if (!pages || !bitmap) { if (may_alloc && !pages) - pages = pcpu_mem_alloc(pages_size); + pages = pcpu_mem_zalloc(pages_size); if (may_alloc && !bitmap) - bitmap = pcpu_mem_alloc(bitmap_size); + bitmap = pcpu_mem_zalloc(bitmap_size); if (!pages || !bitmap) return NULL; } - memset(pages, 0, pages_size); bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); *bitmapp = bitmap; diff --git a/mm/percpu.c b/mm/percpu.c index bf80e55dbed7..28c37a2e2de2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -273,11 +273,11 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) /** - * pcpu_mem_alloc - allocate memory + * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, - * kzalloc() is used; otherwise, vmalloc() is used. The returned + * kzalloc() is used; otherwise, vzalloc() is used. The returned * memory is always zeroed. * * CONTEXT: @@ -286,7 +286,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_mem_alloc(size_t size) +static void *pcpu_mem_zalloc(size_t size) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; @@ -302,7 +302,7 @@ static void *pcpu_mem_alloc(size_t size) * @ptr: memory to free * @size: size of the area * - * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc(). + * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr, size_t size) { @@ -384,7 +384,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; - new = pcpu_mem_alloc(new_size); + new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; @@ -604,11 +604,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) { struct pcpu_chunk *chunk; - chunk = pcpu_mem_alloc(pcpu_chunk_struct_size); + chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); if (!chunk) return NULL; - chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * + sizeof(chunk->map[0])); if (!chunk->map) { kfree(chunk); return NULL; @@ -1889,7 +1890,7 @@ void __init percpu_init_late(void) BUILD_BUG_ON(size > PAGE_SIZE); - map = pcpu_mem_alloc(size); + map = pcpu_mem_zalloc(size); BUG_ON(!map); spin_lock_irqsave(&pcpu_lock, flags); -- cgit v1.2.3 From a855b84c3d8c73220d4d3cd392a7bee7c83de70e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 18 Nov 2011 10:55:35 -0800 Subject: percpu: fix chunk range calculation Percpu allocator recorded the cpus which map to the first and last units in pcpu_first/last_unit_cpu respectively and used them to determine the address range of a chunk - e.g. it assumed that the first unit has the lowest address in a chunk while the last unit has the highest address. This simply isn't true. Groups in a chunk can have arbitrary positive or negative offsets from the previous one and there is no guarantee that the first unit occupies the lowest offset while the last one the highest. Fix it by actually comparing unit offsets to determine cpus occupying the lowest and highest offsets. Also, rename pcu_first/last_unit_cpu to pcpu_low/high_unit_cpu to avoid confusion. The chunk address range is used to flush cache on vmalloc area map/unmap and decide whether a given address is in the first chunk by per_cpu_ptr_to_phys() and the bug was discovered by invalid per_cpu_ptr_to_phys() translation for crash_note. Kudos to Dave Young for tracking down the problem. Signed-off-by: Tejun Heo Reported-by: WANG Cong Reported-by: Dave Young Tested-by: Dave Young LKML-Reference: <4EC21F67.10905@redhat.com> Cc: stable @kernel.org --- mm/percpu-vm.c | 12 ++++++------ mm/percpu.c | 34 ++++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 29e3730d2ffd..12a48a88c0d8 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -142,8 +142,8 @@ static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vunmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) @@ -205,8 +205,8 @@ static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_tlb_kernel_range( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -283,8 +283,8 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { flush_cache_vmap( - pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), - pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); } /** diff --git a/mm/percpu.c b/mm/percpu.c index 28c37a2e2de2..2473ff06dc76 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -116,9 +116,9 @@ static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; -/* cpus with the lowest and highest unit numbers */ -static unsigned int pcpu_first_unit_cpu __read_mostly; -static unsigned int pcpu_last_unit_cpu __read_mostly; +/* cpus with the lowest and highest unit addresses */ +static unsigned int pcpu_low_unit_cpu __read_mostly; +static unsigned int pcpu_high_unit_cpu __read_mostly; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; @@ -985,19 +985,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; - unsigned long first_start, first_end; + unsigned long first_low, first_high; unsigned int cpu; /* - * The following test on first_start/end isn't strictly + * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. */ - first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); - first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, - pcpu_unit_pages); - if ((unsigned long)addr >= first_start && - (unsigned long)addr < first_end) { + first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0); + first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu, + pcpu_unit_pages); + if ((unsigned long)addr >= first_low && + (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); @@ -1234,7 +1234,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; - pcpu_first_unit_cpu = NR_CPUS; + + pcpu_low_unit_cpu = NR_CPUS; + pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; @@ -1254,9 +1256,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; - if (pcpu_first_unit_cpu == NR_CPUS) - pcpu_first_unit_cpu = cpu; - pcpu_last_unit_cpu = cpu; + /* determine low/high unit_cpu */ + if (pcpu_low_unit_cpu == NR_CPUS || + unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) + pcpu_low_unit_cpu = cpu; + if (pcpu_high_unit_cpu == NR_CPUS || + unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) + pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; -- cgit v1.2.3 From 67589c71456b0346500629967292dea3802230b6 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 23 Nov 2011 08:20:53 -0800 Subject: percpu: explain why per_cpu_ptr_to_phys() is more complicated than necessary Add comments about current per_cpu_ptr_to_phys implementation to explain why the logic is more complicated than necessary. -tj: relocated comment into kerneldoc comment Signed-off-by: Dave Young Signed-off-by: Tejun Heo --- mm/percpu.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 2473ff06dc76..3bb810a72006 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -978,6 +978,17 @@ bool is_kernel_percpu_address(unsigned long addr) * address. The caller is responsible for ensuring @addr stays valid * until this function finishes. * + * percpu allocator has special setup for the first chunk, which currently + * supports either embedding in linear address space or vmalloc mapping, + * and, from the second one, the backing allocator (currently either vm or + * km) provides translation. + * + * The addr can be tranlated simply without checking if it falls into the + * first chunk. But the current code reflects better how percpu allocator + * actually works, and the verification can discover both bugs in percpu + * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current + * code. + * * RETURNS: * The physical address for @addr. */ -- cgit v1.2.3 From 52cef189165d74a5d6030184a8e05595194c69ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Nov 2011 21:12:40 +0100 Subject: slab, lockdep: Fix silly bug Commit 30765b92 ("slab, lockdep: Annotate the locks before using them") moves the init_lock_keys() call from after g_cpucache_up = FULL, to before it. And overlooks the fact that init_node_lock_keys() tests for it and ignores everything !FULL. Introduce a LATE stage and change the lockdep test to be Cc: Pekka Enberg Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- mm/slab.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 708efe886154..83311c9aaf9d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -595,6 +595,7 @@ static enum { PARTIAL_AC, PARTIAL_L3, EARLY, + LATE, FULL } g_cpucache_up; @@ -671,7 +672,7 @@ static void init_node_lock_keys(int q) { struct cache_sizes *s = malloc_sizes; - if (g_cpucache_up != FULL) + if (g_cpucache_up < LATE) return; for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { @@ -1666,6 +1667,8 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; + g_cpucache_up = LATE; + /* Annotate slab for lockdep -- annotate the malloc caches */ init_lock_keys(); -- cgit v1.2.3 From 635697c663f38106063d5659f0cf2e45afcd4bb5 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 8 Dec 2011 14:33:51 -0800 Subject: vmscan: fix initial shrinker size handling A shrinker function can return -1, means that it cannot do anything without a risk of deadlock. For example prune_super() does this if it cannot grab a superblock refrence, even if nr_to_scan=0. Currently we interpret this -1 as a ULONG_MAX size shrinker and evaluate `total_scan' according to this. So the next time around this shrinker can cause really big pressure. Let's skip such shrinkers instead. Also make total_scan signed, otherwise the check (total_scan < 0) below never works. Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index a1893c050795..f5255442ae2b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -247,14 +247,18 @@ unsigned long shrink_slab(struct shrink_control *shrink, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; - unsigned long total_scan; - unsigned long max_pass; + long total_scan; + long max_pass; int shrink_ret = 0; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + max_pass = do_shrinker_shrink(shrinker, shrink, 0); + if (max_pass <= 0) + continue; + /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations @@ -265,7 +269,6 @@ unsigned long shrink_slab(struct shrink_control *shrink, } while (cmpxchg(&shrinker->nr, nr, 0) != nr); total_scan = nr; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); -- cgit v1.2.3 From 83aeeada7c69f35e5100b27ec354335597a7a488 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 8 Dec 2011 14:33:54 -0800 Subject: vmscan: use atomic-long for shrinker batching Use atomic-long operations instead of looping around cmpxchg(). [akpm@linux-foundation.org: massage atomic.h inclusions] Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index f5255442ae2b..f54a05b7a61d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,7 +183,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, */ void register_shrinker(struct shrinker *shrinker) { - shrinker->nr = 0; + atomic_long_set(&shrinker->nr_in_batch, 0); down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); @@ -264,9 +264,7 @@ unsigned long shrink_slab(struct shrink_control *shrink, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - do { - nr = shrinker->nr; - } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; @@ -328,12 +326,11 @@ unsigned long shrink_slab(struct shrink_control *shrink, * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - do { - nr = shrinker->nr; - new_nr = total_scan + nr; - if (total_scan <= 0) - break; - } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_in_batch); + else + new_nr = atomic_long_read(&shrinker->nr_in_batch); trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } -- cgit v1.2.3 From 1dfb059b9438633b0546c5431538a47f6ed99028 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 8 Dec 2011 14:33:57 -0800 Subject: thp: reduce khugepaged freezing latency khugepaged can sometimes cause suspend to fail, requiring that the user retry the suspend operation. Use wait_event_freezable_timeout() instead of schedule_timeout_interruptible() to avoid missing freezer wakeups. A try_to_freeze() would have been needed in the khugepaged_alloc_hugepage tight loop too in case of the allocation failing repeatedly, and wait_event_freezable_timeout will provide it too. khugepaged would still freeze just fine by trying again the next minute but it's better if it freezes immediately. Reported-by: Jiri Slaby Signed-off-by: Andrea Arcangeli Tested-by: Jiri Slaby Cc: Tejun Heo Cc: Oleg Nesterov Cc: "Srivatsa S. Bhat" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4298abaae153..36b3d988b4ef 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2259,12 +2259,8 @@ static void khugepaged_do_scan(struct page **hpage) static void khugepaged_alloc_sleep(void) { - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); } #ifndef CONFIG_NUMA @@ -2313,14 +2309,10 @@ static void khugepaged_loop(void) if (unlikely(kthread_should_stop())) break; if (khugepaged_has_work()) { - DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) continue; - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_scan_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); + wait_event_freezable_timeout(khugepaged_wait, false, + msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); } else if (khugepaged_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); -- cgit v1.2.3 From 58a84aa92723d1ac3e1cc4e3b0ff49291663f7e1 Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 8 Dec 2011 14:34:18 -0800 Subject: thp: set compound tail page _count to zero Commit 70b50f94f1644 ("mm: thp: tail page refcounting fix") keeps all page_tail->_count zero at all times. But the current kernel does not set page_tail->_count to zero if a 1GB page is utilized. So when an IOMMU 1GB page is used by KVM, it wil result in a kernel oops because a tail page's _count does not equal zero. kernel BUG at include/linux/mm.h:386! invalid opcode: 0000 [#1] SMP Call Trace: gup_pud_range+0xb8/0x19d get_user_pages_fast+0xcb/0x192 ? trace_hardirqs_off+0xd/0xf hva_to_pfn+0x119/0x2f2 gfn_to_pfn_memslot+0x2c/0x2e kvm_iommu_map_pages+0xfd/0x1c1 kvm_iommu_map_memslots+0x7c/0xbd kvm_iommu_map_guest+0xaa/0xbf kvm_vm_ioctl_assigned_device+0x2ef/0xa47 kvm_vm_ioctl+0x36c/0x3a2 do_vfs_ioctl+0x49e/0x4e4 sys_ioctl+0x5a/0x7c system_call_fastpath+0x16/0x1b RIP gup_huge_pud+0xf2/0x159 Signed-off-by: Youquan Song Reviewed-by: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb28a5f9db8d..73f17c0293c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -576,6 +576,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9dd443d89d8b..850009a7101e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -356,8 +356,8 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); + set_page_count(p, 0); p->first_page = page; } } -- cgit v1.2.3 From 09761333ed47e899cc1482c13090b95f3f711971 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Thu, 8 Dec 2011 14:34:20 -0800 Subject: mm/migrate.c: pair unlock_page() and lock_page() when migrating huge pages Avoid unlocking and unlocked page if we failed to lock it. Signed-off-by: Hillf Danton Cc: Naoya Horiguchi Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 578e29174fa6..177aca424a06 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -871,9 +871,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (anon_vma) put_anon_vma(anon_vma); -out: unlock_page(hpage); +out: if (rc != -EAGAIN) { list_del(&hpage->lru); put_page(hpage); -- cgit v1.2.3 From d021563888312018ca65681096f62e36c20e63cc Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 8 Dec 2011 14:34:27 -0800 Subject: mm: Ensure that pfn_valid() is called once per pageblock when reserving pageblocks setup_zone_migrate_reserve() expects that zone->start_pfn starts at pageblock_nr_pages aligned pfn otherwise we could access beyond an existing memblock resulting in the following panic if CONFIG_HOLES_IN_ZONE is not configured and we do not check pfn_valid: IP: [] setup_zone_migrate_reserve+0xcd/0x180 *pdpt = 0000000000000000 *pde = f000ff53f000ff53 Oops: 0000 [#1] SMP Pid: 1, comm: swapper Not tainted 3.0.7-0.7-pae #1 VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform EIP: 0060:[] EFLAGS: 00010006 CPU: 0 EIP is at setup_zone_migrate_reserve+0xcd/0x180 EAX: 000c0000 EBX: f5801fc0 ECX: 000c0000 EDX: 00000000 ESI: 000c01fe EDI: 000c01fe EBP: 00140000 ESP: f2475f58 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 1, ti=f2474000 task=f2472cd0 task.ti=f2474000) Call Trace: [] __setup_per_zone_wmarks+0xec/0x160 [] setup_per_zone_wmarks+0xf/0x20 [] init_per_zone_wmark_min+0x27/0x86 [] do_one_initcall+0x2b/0x160 [] kernel_init+0xbe/0x157 [] kernel_thread_helper+0x6/0xd Code: a5 39 f5 89 f7 0f 46 fd 39 cf 76 40 8b 03 f6 c4 08 74 32 eb 91 90 89 c8 c1 e8 0e 0f be 80 80 2f 86 c0 8b 14 85 60 2f 86 c0 89 c8 <2b> 82 b4 12 00 00 c1 e0 05 03 82 ac 12 00 00 8b 00 f6 c4 08 0f EIP: [] setup_zone_migrate_reserve+0xcd/0x180 SS:ESP 0068:f2475f58 CR2: 00000000000012b4 We crashed in pageblock_is_reserved() when accessing pfn 0xc0000 because highstart_pfn = 0x36ffe. The issue was introduced in 3.0-rc1 by 6d3163ce ("mm: check if any page in a pageblock is reserved before marking it MIGRATE_RESERVE"). Make sure that start_pfn is always aligned to pageblock_nr_pages to ensure that pfn_valid s always called at the start of each pageblock. Architectures with holes in pageblocks will be correctly handled by pfn_valid_within in pageblock_is_reserved. Signed-off-by: Michal Hocko Signed-off-by: Mel Gorman Tested-by: Dang Bo Reviewed-by: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Cc: Arve Hjnnevg Cc: KOSAKI Motohiro Cc: John Stultz Cc: Dave Hansen Cc: [3.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 850009a7101e..2b8ba3aebf6e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3377,9 +3377,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) unsigned long block_migratetype; int reserve; - /* Get the start pfn, end pfn and the number of blocks to reserve */ + /* + * Get the start pfn, end pfn and the number of blocks to reserve + * We have to be careful to be aligned to pageblock_nr_pages to + * make sure that we always check pfn_valid for the first page in + * the block. + */ start_pfn = zone->zone_start_pfn; end_pfn = start_pfn + zone->spanned_pages; + start_pfn = roundup(start_pfn, pageblock_nr_pages); reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; -- cgit v1.2.3 From 1368edf0647ac112d8cfa6ce47257dc950c50f5c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 8 Dec 2011 14:34:30 -0800 Subject: mm: vmalloc: check for page allocation failure before vmlist insertion Commit f5252e00 ("mm: avoid null pointer access in vm_struct via /proc/vmallocinfo") adds newly allocated vm_structs to the vmlist after it is fully initialised. Unfortunately, it did not check that __vmalloc_area_node() successfully populated the area. In the event of allocation failure, the vmalloc area is freed but the pointer to freed memory is inserted into the vmlist leading to a a crash later in get_vmalloc_info(). This patch adds a check for ____vmalloc_area_node() failure within __vmalloc_node_range. It does not use "goto fail" as in the previous error path as a warning was already displayed by __vmalloc_area_node() before it called vfree in its failure path. Credit goes to Luciano Chavez for doing all the real work of identifying exactly where the problem was. Signed-off-by: Mel Gorman Reported-by: Luciano Chavez Tested-by: Luciano Chavez Reviewed-by: Rik van Riel Acked-by: David Rientjes Cc: [3.1.x+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3231bf332878..1d8b32f07139 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1633,6 +1633,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + if (!addr) + return NULL; /* * In this function, newly allocated vm_struct is not added -- cgit v1.2.3 From 8f1e33daeda6cd89753f9e77d174805a6f21db09 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 23 Nov 2011 09:24:27 -0600 Subject: slub: Switch per cpu partial page support off for debugging Eric saw an issue with accounting of slabs during validation. Its not possible to determine accurately how many per cpu partial slabs exist at any time so this switches off per cpu partial pages during debug. Acked-by: Eric Dumazet Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ed3334d9b6da..4056d29e6610 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3028,7 +3028,9 @@ static int kmem_cache_open(struct kmem_cache *s, * per node list when we run out of per cpu objects. We only fetch 50% * to keep some capacity around for frees. */ - if (s->size >= PAGE_SIZE) + if (kmem_cache_debug(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) s->cpu_partial = 2; else if (s->size >= 1024) s->cpu_partial = 6; -- cgit v1.2.3 From b13683d1cc14d1dd30b8e20f3ebea3f814ad029f Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 11 Nov 2011 14:54:14 +0800 Subject: slub: add missed accounting With per-cpu partial list, slab is added to partial list first and then moved to node list. The __slab_free() code path for add/remove_partial is almost deprecated(except for slub debug). But we forget to account add/remove_partial when move per-cpu partial pages to node list, so the statistics for such events are always 0. Add corresponding accounting. This is against the patch "slub: use correct parameter to add a page to partial list tail" Acked-by: Christoph Lameter Signed-off-by: Shaohua Li Signed-off-by: Pekka Enberg --- mm/slub.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4056d29e6610..8284a206f48d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1901,11 +1901,14 @@ static void unfreeze_partials(struct kmem_cache *s) } if (l != m) { - if (l == M_PARTIAL) + if (l == M_PARTIAL) { remove_partial(n, page); - else + stat(s, FREE_REMOVE_PARTIAL); + } else { add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } l = m; } -- cgit v1.2.3