From 6a618957ad17d8f4f4c7eeede752685374b1b176 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:17:26 -0700 Subject: mm: oom_kill: don't ignore oom score on exiting tasks When the OOM killer scans tasks and encounters a PF_EXITING one, it force-selects that task regardless of the score. The problem is that if that task got stuck waiting for some state the allocation site is holding, the OOM reaper can not move on to the next best victim. Frankly, I don't even know why we check for exiting tasks in the OOM killer. We've tried direct reclaim at least 15 times by the time we decide the system is OOM, there was plenty of time to exit and free memory; and a task might exit voluntarily right after we issue a kill. This is testing pure noise. Remove it. Signed-off-by: Tetsuo Handa Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: David Rientjes Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Andrea Argangeli Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e97a05d9621f..63ced708eafd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -287,9 +287,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && !is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - return OOM_SCAN_OK; } -- cgit v1.2.3 From fcff7d7eebe6d31e2ce20d994555c86a90197034 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:29 -0700 Subject: mm: memcontrol: do not bypass slab charge if memcg is offline Slab pages are charged in two steps. First, an appropriate per memcg cache is selected (see memcg_kmem_get_cache) basing on the current context, then the new slab page is charged to the memory cgroup which the selected cache was created for (see memcg_charge_slab -> __memcg_kmem_charge_memcg). It is OK to bypass kmemcg charge at step 1, but if step 1 succeeded and we successfully allocated a new slab page, step 2 must be performed, otherwise we would get a per memcg kmem cache which contains a slab that does not hold a reference to the memory cgroup owning the cache. Since per memcg kmem caches are destroyed on memcg css free, this could result in freeing a cache while there are still active objects in it. However, currently we will bypass slab page charge if the memory cgroup owning the cache is offline (see __memcg_kmem_charge_memcg). This is very unlikely to occur in practice, because for this to happen a process must be migrated to a different cgroup and the old cgroup must be removed while the process is in kmalloc somewhere between steps 1 and 2 (e.g. trying to allocate a new page). Nevertheless, it's still better to eliminate such a possibility. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 42882c1e7fce..5c9d45e4c739 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2325,9 +2325,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_online(memcg)) - return 0; - ret = try_charge(memcg, gfp, nr_pages); if (ret) return ret; @@ -2346,10 +2343,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; - int ret; + int ret = 0; memcg = get_mem_cgroup_from_mm(current->mm); - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (memcg_kmem_online(memcg)) + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; } -- cgit v1.2.3 From 72b54e7314a2e7a68567c92bbb32fe2598a3c783 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:32 -0700 Subject: mm: memcontrol: make tree_{stat,events} fetch all stats Currently, tree_{stat,events} helpers can only get one stat index at a time, so when there are a lot of stats to be reported one has to call it over and over again (see memory_stat_show). This is neither effective, nor does it look good. Instead, let's make these helpers take a snapshot of all available counters. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 67 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c9d45e4c739..430266071c36 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2717,39 +2717,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static unsigned long tree_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); + memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_STAT; i++) + stat[i] += mem_cgroup_read_stat(iter, i); + } } -static unsigned long tree_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) +static void tree_events(struct mem_cgroup *memcg, unsigned long *events) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_events(iter, idx); + memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_EVENTS; i++) + events[i] += mem_cgroup_read_events(iter, i); + } } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - unsigned long val; + unsigned long val = 0; if (mem_cgroup_is_root(memcg)) { - val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); - if (swap) - val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) { + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_RSS); + if (swap) + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_SWAP); + } } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -5075,6 +5084,8 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[MEMCG_NR_EVENTS]; int i; /* @@ -5088,22 +5099,22 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ + tree_stat(memcg, stat); + tree_events(memcg, events); + seq_printf(m, "anon %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); + (u64)stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) { struct mem_cgroup *mi; @@ -5118,9 +5129,9 @@ static int memory_stat_show(struct seq_file *m, void *v) /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + events[MEM_CGROUP_EVENTS_PGFAULT]); seq_printf(m, "pgmajfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + events[MEM_CGROUP_EVENTS_PGMAJFAULT]); return 0; } -- cgit v1.2.3 From 27ee57c93ff00b8a2d6c6dd6b0b3dddda7b43b77 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:35 -0700 Subject: mm: memcontrol: report slab usage in cgroup2 memory.stat Show how much memory is used for storing reclaimable and unreclaimable in-kernel data structures allocated from slab caches. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++++++ mm/slab.c | 8 +++++--- mm/slab.h | 30 ++++++++++++++++++++++++++++-- mm/slub.c | 3 ++- 4 files changed, 43 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 430266071c36..3ad64bf464fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,9 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "slab %llu\n", + (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5126,6 +5129,11 @@ static int memory_stat_show(struct seq_file *m, void *v) mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); } + seq_printf(m, "slab_reclaimable %llu\n", + (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + seq_printf(m, "slab_unreclaimable %llu\n", + (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", diff --git a/mm/slab.c b/mm/slab.c index 852fc5c79829..56dd0df2a8ce 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1442,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, */ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - const unsigned long nr_freed = (1 << cachep->gfporder); + int order = cachep->gfporder; + unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, cachep->gfporder); + kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), @@ -1461,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_kmem_pages(page, cachep->gfporder); + memcg_uncharge_slab(page, order, cachep); + __free_pages(page, order); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index b7934361f026..ff39a8fc3b3f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -246,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + int ret; + if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return __memcg_kmem_charge_memcg(page, gfp, order, - s->memcg_params.memcg); + + ret = __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); + if (ret) + return ret; + + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + 1 << order); + return 0; +} + +static __always_inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + -(1 << order)); + memcg_kmem_uncharge(page, order); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -294,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, return 0; } +static inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ +} + static inline void slab_init_memcg_params(struct kmem_cache *s) { } diff --git a/mm/slub.c b/mm/slub.c index 6c91324f9370..712d53474082 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1540,7 +1540,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_kmem_pages(page, order); + memcg_uncharge_slab(page, order, s); + __free_pages(page, order); } #define need_reserve_slab_rcu \ -- cgit v1.2.3 From 12580e4b54ba8a1b22ec977c200be0174ca42348 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:38 -0700 Subject: mm: memcontrol: report kernel stack usage in cgroup2 memory.stat Show how much memory is allocated to kernel stacks. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ad64bf464fd..4b7dda7c2e74 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "kernel_stack %llu\n", + (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); -- cgit v1.2.3 From 832fc1de01aea28255cb11d270679b7f1273f0d7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 17 Mar 2016 14:17:41 -0700 Subject: /proc/kpageflags: return KPF_BUDDY for "tail" buddy pages Currently /proc/kpageflags returns nothing for "tail" buddy pages, which is inconvenient when grasping how free pages are distributed. This patch sets KPF_BUDDY for such pages. With this patch: $ grep MemFree /proc/meminfo ; tools/vm/page-types -b buddy MemFree: 3134992 kB flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000400 779272 3044 __________B_______________________________ buddy 0x0000000000000c00 4385 17 __________BM______________________________ buddy,mmap total 783657 3061 783657 pages is 3134628 kB (roughly consistent with the global counter,) so it's OK. [akpm@linux-foundation.org: update comment, per Naoya] Signed-off-by: Naoya Horiguchi Reviewed-by: Vladimir Davydov > Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 --- mm/page_alloc.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index ad9400d759c8..b95952c2faec 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -148,9 +148,6 @@ extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); -#ifdef CONFIG_MEMORY_FAILURE -extern bool is_free_buddy_page(struct page *page); -#endif extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c46b75d14b6f..c7332a4bc8db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7152,7 +7152,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif -#ifdef CONFIG_MEMORY_FAILURE bool is_free_buddy_page(struct page *page) { struct zone *zone = page_zone(page); @@ -7171,4 +7170,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } -#endif -- cgit v1.2.3 From f48d97f340cbb0c323fa7a7b36bd76a108a9f49f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:49 -0700 Subject: mm/vmalloc: query dynamic DEBUG_PAGEALLOC setting As CONFIG_DEBUG_PAGEALLOC can be enabled/disabled via kernel parameters we can optimize some cases by checking the enablement state. This is follow-up work for Christian's Optimize CONFIG_DEBUG_PAGEALLOC: https://lkml.org/lkml/2016/1/27/194 Remaining work is to make sparc to be aware of this but it looks not easy for me so I skip that in this series. This patch (of 5): We can disable debug_pagealloc processing even if the code is complied with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: update comment, per David. Adjust comment to use 80 cols] Signed-off-by: Joonsoo Kim Reviewed-by: Christian Borntraeger Acked-by: David Rientjes Cc: Benjamin Herrenschmidt Cc: Takashi Iwai Cc: Chris Metcalf Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fb42a5bffe47..d4b2e34adae0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va) static void vmap_debug_free_range(unsigned long start, unsigned long end) { /* - * Unmap page tables and force a TLB flush immediately if - * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free - * bugs similarly to those in linear kernel virtual address - * space after a page has been freed. + * Unmap page tables and force a TLB flush immediately if pagealloc + * debugging is enabled. This catches use after free bugs similarly to + * those in linear kernel virtual address space after a page has been + * freed. * - * All the lazy freeing logic is still retained, in order to - * minimise intrusiveness of this debugging feature. + * All the lazy freeing logic is still retained, in order to minimise + * intrusiveness of this debugging feature. * - * This is going to be *slow* (linear kernel virtual address - * debugging doesn't do a broadcast TLB flush so it is a lot - * faster). + * This is going to be *slow* (linear kernel virtual address debugging + * doesn't do a broadcast TLB flush so it is a lot faster). */ -#ifdef CONFIG_DEBUG_PAGEALLOC - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); -#endif + if (debug_pagealloc_enabled()) { + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); + } } /* -- cgit v1.2.3 From 922d566cdcb7166c729ff67bb15ff5f93a3774b6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:53 -0700 Subject: mm/slub: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: clean up code, per Christian] Signed-off-by: Joonsoo Kim Reviewed-by: Christian Borntraeger Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 712d53474082..2f2f04d39104 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -254,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { void *p; -#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return get_freepointer(s, object); + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); -#else - p = get_freepointer(s, object); -#endif return p; } -- cgit v1.2.3 From 505f6d22dbc63f333d1178dc80264e40b5c35268 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:56 -0700 Subject: sound: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: export _debug_pagealloc_enabled to modules] Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Takashi Iwai Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7332a4bc8db..b1fc19ebb8a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -498,6 +498,7 @@ void prep_compound_page(struct page *page, unsigned int order) unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) -- cgit v1.2.3 From 81c5857b279e6b18f6ff0d1975e80a07af542cd1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:05 -0700 Subject: mm, kswapd: remove bogus check of balance_classzone_idx During work on kcompactd integration I have spotted a confusing check of balance_classzone_idx, which I believe is bogus. The balanced_classzone_idx is filled by balance_pgdat() as the highest zone it attempted to balance. This was introduced by commit dc83edd941f4 ("mm: kswapd: use the classzone idx that kswapd was using for sleeping_prematurely()"). The intention is that (as expressed in today's function names), the value used for kswapd_shrink_zone() calls in balance_pgdat() is the same as for the decisions in kswapd_try_to_sleep(). An unwanted side-effect of that commit was breaking the checks in kswapd() whether there was another kswapd_wakeup with a tighter (=lower) classzone_idx. Commits 215ddd6664ce ("mm: vmscan: only read new_classzone_idx from pgdat when reclaiming successfully") and d2ebd0f6b895 ("kswapd: avoid unnecessary rebalance after an unsuccessful balancing") tried to fixed, but apparently introduced a bogus check that this patch removes. Consider zone indexes X < Y < Z, where: - Z is the value used for the first kswapd wakeup. - Y is returned as balanced_classzone_idx, which means zones with index higher than Y (including Z) were found to be unreclaimable. - X is the value used for the second kswapd wakeup The new wakeup with value X means that kswapd is now supposed to balance harder all zones with index <= X. But instead, due to Y < Z, it will go sleep and won't read the new value X. This is subtly wrong. The effect of this patch is that kswapd will react better in some situations, where e.g. the first wakeup is for ZONE_DMA32, the second is for ZONE_DMA, and due to unreclaimable ZONE_NORMAL. Before this patch, kswapd would go sleep instead of reclaiming ZONE_DMA harder. I expect these situations are very rare, and more value is in better maintainability due to the removal of confusing and bogus check. Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index dd984470248f..5dcc71140108 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3468,8 +3468,7 @@ static int kswapd(void *p) * new request of a similar or harder type will succeed soon * so consider going to sleep on the basis we reclaimed at */ - if (balanced_classzone_idx >= new_classzone_idx && - balanced_order == new_order) { + if (balanced_order == new_order) { new_order = pgdat->kswapd_max_order; new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; -- cgit v1.2.3 From 698b1b30642f1ff0ea10ef1de9745ab633031377 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:08 -0700 Subject: mm, compaction: introduce kcompactd Memory compaction can be currently performed in several contexts: - kswapd balancing a zone after a high-order allocation failure - direct compaction to satisfy a high-order allocation, including THP page fault attemps - khugepaged trying to collapse a hugepage - manually from /proc The purpose of compaction is two-fold. The obvious purpose is to satisfy a (pending or future) high-order allocation, and is easy to evaluate. The other purpose is to keep overal memory fragmentation low and help the anti-fragmentation mechanism. The success wrt the latter purpose is more The current situation wrt the purposes has a few drawbacks: - compaction is invoked only when a high-order page or hugepage is not available (or manually). This might be too late for the purposes of keeping memory fragmentation low. - direct compaction increases latency of allocations. Again, it would be better if compaction was performed asynchronously to keep fragmentation low, before the allocation itself comes. - (a special case of the previous) the cost of compaction during THP page faults can easily offset the benefits of THP. - kswapd compaction appears to be complex, fragile and not working in some scenarios. It could also end up compacting for a high-order allocation request when it should be reclaiming memory for a later order-0 request. To improve the situation, we should be able to benefit from an equivalent of kswapd, but for compaction - i.e. a background thread which responds to fragmentation and the need for high-order allocations (including hugepages) somewhat proactively. One possibility is to extend the responsibilities of kswapd, which could however complicate its design too much. It should be better to let kswapd handle reclaim, as order-0 allocations are often more critical than high-order ones. Another possibility is to extend khugepaged, but this kthread is a single instance and tied to THP configs. This patch goes with the option of a new set of per-node kthreads called kcompactd, and lays the foundations, without introducing any new tunables. The lifecycle mimics kswapd kthreads, including the memory hotplug hooks. For compaction, kcompactd uses the standard compaction_suitable() and ompact_finished() criteria and the deferred compaction functionality. Unlike direct compaction, it uses only sync compaction, as there's no allocation latency to minimize. This patch doesn't yet add a call to wakeup_kcompactd. The kswapd compact/reclaim loop for high-order pages will be replaced by waking up kcompactd in the next patch with the description of what's wrong with the old approach. Waking up of the kcompactd threads is also tied to kswapd activity and follows these rules: - we don't want to affect any fastpaths, so wake up kcompactd only from the slowpath, as it's done for kswapd - if kswapd is doing reclaim, it's more important than compaction, so don't invoke kcompactd until kswapd goes to sleep - the target order used for kswapd is passed to kcompactd Future possible future uses for kcompactd include the ability to wake up kcompactd on demand in special situations, such as when hugepages are not available (currently not done due to __GFP_NO_KSWAPD) or when a fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also possible to perform periodic compaction with kcompactd. [arnd@arndb.de: fix build errors with kcompactd] [paul.gortmaker@windriver.com: don't use modular references for non modular code] Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Arnd Bergmann Signed-off-by: Paul Gortmaker Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/memory_hotplug.c | 9 ++- mm/page_alloc.c | 3 + mm/vmstat.c | 1 + 4 files changed, 233 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 93f71d968098..5b2bfbaa821a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -7,6 +7,7 @@ * * Copyright IBM Corp. 2007-2010 Mel Gorman */ +#include #include #include #include @@ -17,6 +18,8 @@ #include #include #include +#include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -1736,4 +1739,223 @@ void compaction_unregister_node(struct node *node) } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0; +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + + for (zoneid = 0; zoneid < classzone_idx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if (!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + classzone_idx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + + }; + bool success = false; + + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.classzone_idx); + count_vm_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + status = compact_zone(zone, &cc); + + if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), + cc.classzone_idx, 0)) { + success = true; + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_COMPLETE) { + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. But remember + * the requested order/classzone_idx in case it was higher/tighter than + * our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_classzone_idx > classzone_idx) + pgdat->kcompactd_classzone_idx = classzone_idx; + + if (!waitqueue_active(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + classzone_idx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. + */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + + kcompactd_do_work(pgdat); + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. + */ +int kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kcompactd) + return 0; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + ret = PTR_ERR(pgdat->kcompactd); + pgdat->kcompactd = NULL; + } + return ret; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings. + */ +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + } + return NOTIFY_OK; +} + +static int __init kcompactd_init(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} +subsys_initcall(kcompactd_init) + #endif /* CONFIG_COMPACTION */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 24ea06393816..d9bcb26fc4df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -1105,8 +1106,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); - if (onlined_pages) + if (onlined_pages) { kswapd_run(zone_to_nid(zone)); + kcompactd_run(nid); + } vm_total_pages = nr_free_pagecache_pages(); @@ -1880,8 +1883,10 @@ repeat: zone_pcp_update(zone); node_states_clear_node(node, &arg); - if (arg.status_change_nid >= 0) + if (arg.status_change_nid >= 0) { kswapd_stop(node); + kcompactd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b1fc19ebb8a2..25a75da53c27 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5405,6 +5405,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_COMPACTION + init_waitqueue_head(&pgdat->kcompactd_wait); +#endif pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { diff --git a/mm/vmstat.c b/mm/vmstat.c index 69ce64f7b8d7..f80066248c94 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -826,6 +826,7 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", + "compact_daemon_wake", #endif #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From e888ca3545dc6823603b976e40b62af2c68b6fcc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:12 -0700 Subject: mm, memory hotplug: small cleanup in online_pages() We can reuse the nid we've determined instead of repeated pfn_to_nid() usages. Also zone_to_nid() should be a bit cheaper in general than pfn_to_nid(). Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d9bcb26fc4df..c832ef3565cc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1055,7 +1055,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = pfn_to_nid(pfn); + nid = zone_to_nid(zone); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -1095,7 +1095,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ pgdat_resize_unlock(zone->zone_pgdat, &flags); if (onlined_pages) { - node_states_set_node(zone_to_nid(zone), &arg); + node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL, NULL); else @@ -1107,7 +1107,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); if (onlined_pages) { - kswapd_run(zone_to_nid(zone)); + kswapd_run(nid); kcompactd_run(nid); } -- cgit v1.2.3 From accf62422b3a67fce8ce086aa81c8300ddbf42be Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:15 -0700 Subject: mm, kswapd: replace kswapd compaction with waking up kcompactd Similarly to direct reclaim/compaction, kswapd attempts to combine reclaim and compaction to attempt making memory allocation of given order available. The details differ from direct reclaim e.g. in having high watermark as a goal. The code involved in kswapd's reclaim/compaction decisions has evolved to be quite complex. Testing reveals that it doesn't actually work in at least one scenario, and closer inspection suggests that it could be greatly simplified without compromising on the goal (make high-order page available) or efficiency (don't reclaim too much). The simplification relieas of doing all compaction in kcompactd, which is simply woken up when high watermarks are reached by kswapd's reclaim. The scenario where kswapd compaction doesn't work was found with mmtests test stress-highalloc configured to attempt order-9 allocations without direct reclaim, just waking up kswapd. There was no compaction attempt from kswapd during the whole test. Some added instrumentation shows what happens: - balance_pgdat() sets end_zone to Normal, as it's not balanced - reclaim is attempted on DMA zone, which sets nr_attempted to 99, but it cannot reclaim anything, so sc.nr_reclaimed is 0 - for zones DMA32 and Normal, kswapd_shrink_zone uses testorder=0, so it merely checks if high watermarks were reached for base pages. This is true, so no reclaim is attempted. For DMA, testorder=0 wasn't used, as compaction_suitable() returned COMPACT_SKIPPED - even though the pgdat_needs_compaction flag wasn't set to false, no compaction happens due to the condition sc.nr_reclaimed > nr_attempted being false (as 0 < 99) - priority-- due to nr_reclaimed being 0, repeat until priority reaches 0 pgdat_balanced() is false as only the small zone DMA appears balanced (curiously in that check, watermark appears OK and compaction_suitable() returns COMPACT_PARTIAL, because a lower classzone_idx is used there) Now, even if it was decided that reclaim shouldn't be attempted on the DMA zone, the scenario would be the same, as (sc.nr_reclaimed=0 > nr_attempted=0) is also false. The condition really should use >= as the comment suggests. Then there is a mismatch in the check for setting pgdat_needs_compaction to false using low watermark, while the rest uses high watermark, and who knows what other subtlety. Hopefully this demonstrates that this is unsustainable. Luckily we can simplify this a lot. The reclaim/compaction decisions make sense for direct reclaim scenario, but in kswapd, our primary goal is to reach high watermark in order-0 pages. Afterwards we can attempt compaction just once. Unlike direct reclaim, we don't reclaim extra pages (over the high watermark), the current code already disallows it for good reasons. After this patch, we simply wake up kcompactd to process the pgdat, after we have either succeeded or failed to reach the high watermarks in kswapd, which goes to sleep. We pass kswapd's order and classzone_idx, so kcompactd can apply the same criteria to determine which zones are worth compacting. Note that we use the classzone_idx from wakeup_kswapd(), not balanced_classzone_idx which can include higher zones that kswapd tried to balance too, but didn't consider them in pgdat_balanced(). Since kswapd now cannot create high-order pages itself, we need to adjust how it determines the zones to be balanced. The key element here is adding a "highorder" parameter to zone_balanced, which, when set to false, makes it consider only order-0 watermark instead of the desired higher order (this was done previously by kswapd_shrink_zone(), but not elsewhere). This false is passed for example in pgdat_balanced(). Importantly, wakeup_kswapd() uses true to make sure kswapd and thus kcompactd are woken up for a high-order allocation failure. The last thing is to decide what to do with pageblock_skip bitmap handling. Compaction maintains a pageblock_skip bitmap to record pageblocks where isolation recently failed. This bitmap can be reset by three ways: 1) direct compaction is restarting after going through the full deferred cycle 2) kswapd goes to sleep, and some other direct compaction has previously finished scanning the whole zone and set zone->compact_blockskip_flush. Note that a successful direct compaction clears this flag. 3) compaction was invoked manually via trigger in /proc The case 2) is somewhat fuzzy to begin with, but after introducing kcompactd we should update it. The check for direct compaction in 1), and to set the flush flag in 2) use current_is_kswapd(), which doesn't work for kcompactd. Thus, this patch adds bool direct_compaction to compact_control to use in 2). For the case 1) we remove the check completely - unlike the former kswapd compaction, kcompactd does use the deferred compaction functionality, so flushing tied to restarting from deferred compaction makes sense here. Note that when kswapd goes to sleep, kcompactd is woken up, so it will see the flushed pageblock_skip bits. This is different from when the former kswapd compaction observed the bits and I believe it makes more sense. Kcompactd can afford to be more thorough than a direct compaction trying to limit allocation latency, or kswapd whose primary goal is to reclaim. For testing, I used stress-highalloc configured to do order-9 allocations with GFP_NOWAIT|__GFP_HIGH|__GFP_COMP, so they relied just on kswapd/kcompactd reclaim/compaction (the interfering kernel builds in phases 1 and 2 work as usual): stress-highalloc 4.5-rc1+before 4.5-rc1+after -nodirect -nodirect Success 1 Min 1.00 ( 0.00%) 5.00 (-66.67%) Success 1 Mean 1.40 ( 0.00%) 6.20 (-55.00%) Success 1 Max 2.00 ( 0.00%) 7.00 (-16.67%) Success 2 Min 1.00 ( 0.00%) 5.00 (-66.67%) Success 2 Mean 1.80 ( 0.00%) 6.40 (-52.38%) Success 2 Max 3.00 ( 0.00%) 7.00 (-16.67%) Success 3 Min 34.00 ( 0.00%) 62.00 ( 1.59%) Success 3 Mean 41.80 ( 0.00%) 63.80 ( 1.24%) Success 3 Max 53.00 ( 0.00%) 65.00 ( 2.99%) User 3166.67 3181.09 System 1153.37 1158.25 Elapsed 1768.53 1799.37 4.5-rc1+before 4.5-rc1+after -nodirect -nodirect Direct pages scanned 32938 32797 Kswapd pages scanned 2183166 2202613 Kswapd pages reclaimed 2152359 2143524 Direct pages reclaimed 32735 32545 Percentage direct scans 1% 1% THP fault alloc 579 612 THP collapse alloc 304 316 THP splits 0 0 THP fault fallback 793 778 THP collapse fail 11 16 Compaction stalls 1013 1007 Compaction success 92 67 Compaction failures 920 939 Page migrate success 238457 721374 Page migrate failure 23021 23469 Compaction pages isolated 504695 1479924 Compaction migrate scanned 661390 8812554 Compaction free scanned 13476658 84327916 Compaction cost 262 838 After this patch we see improvements in allocation success rate (especially for phase 3) along with increased compaction activity. The compaction stalls (direct compaction) in the interfering kernel builds (probably THP's) also decreased somewhat thanks to kcompactd activity, yet THP alloc successes improved a bit. Note that elapsed and user time isn't so useful for this benchmark, because of the background interference being unpredictable. It's just to quickly spot some major unexpected differences. System time is somewhat more useful and that didn't increase. Also (after adjusting mmtests' ftrace monitor): Time kswapd awake 2547781 2269241 Time kcompactd awake 0 119253 Time direct compacting 939937 557649 Time kswapd compacting 0 0 Time kcompactd compacting 0 119099 The decrease of overal time spent compacting appears to not match the increased compaction stats. I suspect the tasks get rescheduled and since the ftrace monitor doesn't see that, the reported time is wall time, not CPU time. But arguably direct compactors care about overall latency anyway, whether busy compacting or waiting for CPU doesn't matter. And that latency seems to almost halved. It's also interesting how much time kswapd spent awake just going through all the priorities and failing to even try compacting, over and over. We can also configure stress-highalloc to perform both direct reclaim/compaction and wakeup kswapd/kcompactd, by using GFP_KERNEL|__GFP_HIGH|__GFP_COMP: stress-highalloc 4.5-rc1+before 4.5-rc1+after -direct -direct Success 1 Min 4.00 ( 0.00%) 9.00 (-50.00%) Success 1 Mean 8.00 ( 0.00%) 10.00 (-19.05%) Success 1 Max 12.00 ( 0.00%) 11.00 ( 15.38%) Success 2 Min 4.00 ( 0.00%) 9.00 (-50.00%) Success 2 Mean 8.20 ( 0.00%) 10.00 (-16.28%) Success 2 Max 13.00 ( 0.00%) 11.00 ( 8.33%) Success 3 Min 75.00 ( 0.00%) 74.00 ( 1.33%) Success 3 Mean 75.60 ( 0.00%) 75.20 ( 0.53%) Success 3 Max 77.00 ( 0.00%) 76.00 ( 0.00%) User 3344.73 3246.04 System 1194.24 1172.29 Elapsed 1838.04 1836.76 4.5-rc1+before 4.5-rc1+after -direct -direct Direct pages scanned 125146 120966 Kswapd pages scanned 2119757 2135012 Kswapd pages reclaimed 2073183 2108388 Direct pages reclaimed 124909 120577 Percentage direct scans 5% 5% THP fault alloc 599 652 THP collapse alloc 323 354 THP splits 0 0 THP fault fallback 806 793 THP collapse fail 17 16 Compaction stalls 2457 2025 Compaction success 906 518 Compaction failures 1551 1507 Page migrate success 2031423 2360608 Page migrate failure 32845 40852 Compaction pages isolated 4129761 4802025 Compaction migrate scanned 11996712 21750613 Compaction free scanned 214970969 344372001 Compaction cost 2271 2694 In this scenario, this patch doesn't change the overall success rate as direct compaction already tries all it can. There's however significant reduction in direct compaction stalls (that is, the number of allocations that went into direct compaction). The number of successes (i.e. direct compaction stalls that ended up with successful allocation) is reduced by the same number. This means the offload to kcompactd is working as expected, and direct compaction is reduced either due to detecting contention, or compaction deferred by kcompactd. In the previous version of this patchset there was some apparent reduction of success rate, but the changes in this version (such as using sync compaction only), new baseline kernel, and/or averaging results from 5 executions (my bet), made this go away. Ftrace-based stats seem to roughly agree: Time kswapd awake 2532984 2326824 Time kcompactd awake 0 257916 Time direct compacting 864839 735130 Time kswapd compacting 0 0 Time kcompactd compacting 0 257585 Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 10 ++-- mm/internal.h | 1 + mm/vmscan.c | 147 ++++++++++++++++++-------------------------------------- 3 files changed, 54 insertions(+), 104 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 5b2bfbaa821a..ccf97b02b85f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1191,11 +1191,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* * Mark that the PG_migrate_skip information should be cleared - * by kswapd when it goes to sleep. kswapd does not set the + * by kswapd when it goes to sleep. kcompactd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. */ - if (!current_is_kswapd()) + if (cc->direct_compaction) zone->compact_blockskip_flush = true; return COMPACT_COMPLETE; @@ -1338,10 +1338,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. + * is about to be retried after being deferred. */ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + if (compaction_restarting(zone, cc->order)) __reset_isolation_suitable(zone); /* @@ -1477,6 +1476,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .mode = mode, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, + .direct_compaction = true, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/internal.h b/mm/internal.h index b95952c2faec..4042a8a05672 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -172,6 +172,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool direct_compaction; /* False from kcompactd or /proc/... */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const int alloc_flags; /* alloc flags of a direct compactor */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 5dcc71140108..f87cfaa955a8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2968,18 +2968,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, - unsigned long balance_gap, int classzone_idx) +static bool zone_balanced(struct zone *zone, int order, bool highorder, + unsigned long balance_gap, int classzone_idx) { - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx)) - return false; + unsigned long mark = high_wmark_pages(zone) + balance_gap; - if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, - order, 0, classzone_idx) == COMPACT_SKIPPED) - return false; + /* + * When checking from pgdat_balanced(), kswapd should stop and sleep + * when it reaches the high order-0 watermark and let kcompactd take + * over. Other callers such as wakeup_kswapd() want to determine the + * true high-order watermark. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { + mark += (1UL << order); + order = 0; + } - return true; + return zone_watermark_ok_safe(zone, order, mark, classzone_idx); } /* @@ -3029,7 +3034,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) continue; } - if (zone_balanced(zone, order, 0, i)) + if (zone_balanced(zone, order, false, 0, i)) balanced_pages += zone->managed_pages; else if (!order) return false; @@ -3083,27 +3088,14 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, */ static bool kswapd_shrink_zone(struct zone *zone, int classzone_idx, - struct scan_control *sc, - unsigned long *nr_attempted) + struct scan_control *sc) { - int testorder = sc->order; unsigned long balance_gap; bool lowmem_pressure; /* Reclaim above the high watermark. */ sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); - /* - * Kswapd reclaims only single pages with compaction enabled. Trying - * too hard to reclaim until contiguous free pages have become - * available can hurt performance by evicting too much useful data - * from memory. Do not reclaim more than needed for compaction. - */ - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order, 0, classzone_idx) - != COMPACT_SKIPPED) - testorder = 0; - /* * We put equal pressure on every zone, unless one zone has way too * many pages free already. The "too many pages" is defined as the @@ -3118,15 +3110,12 @@ static bool kswapd_shrink_zone(struct zone *zone, * reclaim is necessary */ lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, testorder, + if (!lowmem_pressure && zone_balanced(zone, sc->order, false, balance_gap, classzone_idx)) return true; shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - /* Account for the number of pages attempted to reclaim */ - *nr_attempted += sc->nr_to_reclaim; - clear_bit(ZONE_WRITEBACK, &zone->flags); /* @@ -3136,7 +3125,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * waits. */ if (zone_reclaimable(zone) && - zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_balanced(zone, sc->order, false, 0, classzone_idx)) { clear_bit(ZONE_CONGESTED, &zone->flags); clear_bit(ZONE_DIRTY, &zone->flags); } @@ -3148,7 +3137,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the final order kswapd was reclaiming at + * Returns the highest zone idx kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -3165,8 +3154,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order, - int *classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -3183,9 +3171,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, count_vm_event(PAGEOUTRUN); do { - unsigned long nr_attempted = 0; bool raise_priority = true; - bool pgdat_needs_compaction = (order > 0); sc.nr_reclaimed = 0; @@ -3220,7 +3206,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, break; } - if (!zone_balanced(zone, order, 0, 0)) { + if (!zone_balanced(zone, order, false, 0, 0)) { end_zone = i; break; } else { @@ -3236,24 +3222,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (i < 0) goto out; - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* - * If any zone is currently balanced then kswapd will - * not call compaction as it is expected that the - * necessary pages are already available. - */ - if (pgdat_needs_compaction && - zone_watermark_ok(zone, order, - low_wmark_pages(zone), - *classzone_idx, 0)) - pgdat_needs_compaction = false; - } - /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. @@ -3297,8 +3265,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * that that high watermark would be met at 100% * efficiency. */ - if (kswapd_shrink_zone(zone, end_zone, - &sc, &nr_attempted)) + if (kswapd_shrink_zone(zone, end_zone, &sc)) raise_priority = false; } @@ -3311,28 +3278,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pfmemalloc_watermark_ok(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); - /* - * Fragmentation may mean that the system cannot be rebalanced - * for high-order allocations in all zones. If twice the - * allocation size has been reclaimed and the zones are still - * not balanced then recheck the watermarks at order-0 to - * prevent kswapd reclaiming excessively. Assume that a - * process requested a high-order can direct reclaim/compact. - */ - if (order && sc.nr_reclaimed >= 2UL << order) - order = sc.order = 0; - /* Check if kswapd should be suspending */ if (try_to_freeze() || kthread_should_stop()) break; - /* - * Compact if necessary and kswapd is reclaiming at least the - * high watermark number of pages as requsted - */ - if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) - compact_pgdat(pgdat, order); - /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages @@ -3340,20 +3289,18 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (raise_priority || !sc.nr_reclaimed) sc.priority--; } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, *classzone_idx)); + !pgdat_balanced(pgdat, order, classzone_idx)); out: /* - * Return the order we were reclaiming at so prepare_kswapd_sleep() - * makes a decision on the order we were last reclaiming at. However, - * if another caller entered the allocator slow path while kswapd - * was awake, order will remain at the higher level + * Return the highest zone idx we were reclaiming at so + * prepare_kswapd_sleep() makes the same decisions as here. */ - *classzone_idx = end_zone; - return order; + return end_zone; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, + int classzone_idx, int balanced_classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3364,7 +3311,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -3374,7 +3322,8 @@ static void kswapd_tr