From b0f84ac352762ed02d7ea9f284942a8cab7f9077 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 17 Mar 2016 14:17:16 -0700 Subject: ia64: define ioremap_uc() All architectures now need ioremap_uc(), ia64 seems defines this already through its ioremap_nocache() and it already ensures it *only* uses UC. This is needed since v4.3 to complete an allyesconfig compile on ia64, there were others archs that needed this, and this one seems to have fallen through the cracks. Signed-off-by: Luis R. Rodriguez Reported-by: kbuild test robot Acked-by: Tony Luck Cc: [4.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/io.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index a865d2a04f75..5de673ac9cb1 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -433,6 +433,7 @@ static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned lo return ioremap(phys_addr, size); } #define ioremap_cache ioremap_cache +#define ioremap_uc ioremap_nocache /* -- cgit v1.2.3 From 4c11e554fb894b381a3dc47069259d87a2e6ffc9 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Thu, 17 Mar 2016 14:17:20 -0700 Subject: drivers/firmware/broadcom/bcm47xx_nvram.c: fix incorrect __ioread32_copy Commit 1f330c327900 ("drivers/firmware/broadcom/bcm47xx_nvram.c: use __ioread32_copy() instead of open-coding") switched to use a generic copy function, but failed to notice that the header pointer is updated between the two copies, resulting in bogus data being copied in the latter one. Fix by keeping the old header pointer. The patch fixes totally broken networking on WRT54GL router (both LAN and WLAN interfaces fail to probe). Fixes: 1f330c327900 ("drivers/firmware/broadcom/bcm47xx_nvram.c: use __ioread32_copy() instead of open-coding") Signed-off-by: Aaro Koskinen Reviewed-by: Stephen Boyd Cc: Rafal Milecki Cc: Hauke Mehrtens Cc: [4.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/broadcom/bcm47xx_nvram.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/broadcom/bcm47xx_nvram.c b/drivers/firmware/broadcom/bcm47xx_nvram.c index 0c2f0a61b0ea..0b631e5b5b84 100644 --- a/drivers/firmware/broadcom/bcm47xx_nvram.c +++ b/drivers/firmware/broadcom/bcm47xx_nvram.c @@ -94,15 +94,14 @@ static int nvram_find_and_copy(void __iomem *iobase, u32 lim) found: __ioread32_copy(nvram_buf, header, sizeof(*header) / 4); - header = (struct nvram_header *)nvram_buf; - nvram_len = header->len; + nvram_len = ((struct nvram_header *)(nvram_buf))->len; if (nvram_len > size) { pr_err("The nvram size according to the header seems to be bigger than the partition on flash\n"); nvram_len = size; } if (nvram_len >= NVRAM_SPACE) { pr_err("nvram on flash (%i bytes) is bigger than the reserved space in memory, will just copy the first %i bytes\n", - header->len, NVRAM_SPACE - 1); + nvram_len, NVRAM_SPACE - 1); nvram_len = NVRAM_SPACE - 1; } /* proceed reading data after header */ -- cgit v1.2.3 From a1ee1932aa6bea0bb074f5e3ced112664e4637ed Mon Sep 17 00:00:00 2001 From: Joshua Hunt Date: Thu, 17 Mar 2016 14:17:23 -0700 Subject: watchdog: don't run proc_watchdog_update if new value is same as old While working on a script to restore all sysctl params before a series of tests I found that writing any value into the /proc/sys/kernel/{nmi_watchdog,soft_watchdog,watchdog,watchdog_thresh} causes them to call proc_watchdog_update(). NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. There doesn't appear to be a reason for doing this work every time a write occurs, so only do it when the values change. Signed-off-by: Josh Hunt Acked-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Cc: [4.1.x+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b3ace6ebbba3..9acb29f280ec 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -923,6 +923,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * both lockup detectors are disabled if proc_watchdog_update() * returns an error. */ + if (old == new) + goto out; + err = proc_watchdog_update(); } out: @@ -967,7 +970,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old; + int err, old, new; get_online_cpus(); mutex_lock(&watchdog_proc_mutex); @@ -987,6 +990,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, /* * Update the sample period. Restore on failure. */ + new = ACCESS_ONCE(watchdog_thresh); + if (old == new) + goto out; + set_sample_period(); err = proc_watchdog_update(); if (err) { -- cgit v1.2.3 From 6a618957ad17d8f4f4c7eeede752685374b1b176 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 17 Mar 2016 14:17:26 -0700 Subject: mm: oom_kill: don't ignore oom score on exiting tasks When the OOM killer scans tasks and encounters a PF_EXITING one, it force-selects that task regardless of the score. The problem is that if that task got stuck waiting for some state the allocation site is holding, the OOM reaper can not move on to the next best victim. Frankly, I don't even know why we check for exiting tasks in the OOM killer. We've tried direct reclaim at least 15 times by the time we decide the system is OOM, there was plenty of time to exit and free memory; and a task might exit voluntarily right after we issue a kill. This is testing pure noise. Remove it. Signed-off-by: Tetsuo Handa Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: David Rientjes Cc: Oleg Nesterov Cc: Hugh Dickins Cc: Andrea Argangeli Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e97a05d9621f..63ced708eafd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -287,9 +287,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && !is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - return OOM_SCAN_OK; } -- cgit v1.2.3 From fcff7d7eebe6d31e2ce20d994555c86a90197034 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:29 -0700 Subject: mm: memcontrol: do not bypass slab charge if memcg is offline Slab pages are charged in two steps. First, an appropriate per memcg cache is selected (see memcg_kmem_get_cache) basing on the current context, then the new slab page is charged to the memory cgroup which the selected cache was created for (see memcg_charge_slab -> __memcg_kmem_charge_memcg). It is OK to bypass kmemcg charge at step 1, but if step 1 succeeded and we successfully allocated a new slab page, step 2 must be performed, otherwise we would get a per memcg kmem cache which contains a slab that does not hold a reference to the memory cgroup owning the cache. Since per memcg kmem caches are destroyed on memcg css free, this could result in freeing a cache while there are still active objects in it. However, currently we will bypass slab page charge if the memory cgroup owning the cache is offline (see __memcg_kmem_charge_memcg). This is very unlikely to occur in practice, because for this to happen a process must be migrated to a different cgroup and the old cgroup must be removed while the process is in kmalloc somewhere between steps 1 and 2 (e.g. trying to allocate a new page). Nevertheless, it's still better to eliminate such a possibility. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 42882c1e7fce..5c9d45e4c739 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2325,9 +2325,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_online(memcg)) - return 0; - ret = try_charge(memcg, gfp, nr_pages); if (ret) return ret; @@ -2346,10 +2343,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; - int ret; + int ret = 0; memcg = get_mem_cgroup_from_mm(current->mm); - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (memcg_kmem_online(memcg)) + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; } -- cgit v1.2.3 From 72b54e7314a2e7a68567c92bbb32fe2598a3c783 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:32 -0700 Subject: mm: memcontrol: make tree_{stat,events} fetch all stats Currently, tree_{stat,events} helpers can only get one stat index at a time, so when there are a lot of stats to be reported one has to call it over and over again (see memory_stat_show). This is neither effective, nor does it look good. Instead, let's make these helpers take a snapshot of all available counters. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 67 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c9d45e4c739..430266071c36 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2717,39 +2717,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static unsigned long tree_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); + memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_STAT; i++) + stat[i] += mem_cgroup_read_stat(iter, i); + } } -static unsigned long tree_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) +static void tree_events(struct mem_cgroup *memcg, unsigned long *events) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_events(iter, idx); + memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_EVENTS; i++) + events[i] += mem_cgroup_read_events(iter, i); + } } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - unsigned long val; + unsigned long val = 0; if (mem_cgroup_is_root(memcg)) { - val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); - if (swap) - val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) { + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_RSS); + if (swap) + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_SWAP); + } } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -5075,6 +5084,8 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[MEMCG_NR_EVENTS]; int i; /* @@ -5088,22 +5099,22 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ + tree_stat(memcg, stat); + tree_events(memcg, events); + seq_printf(m, "anon %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); + (u64)stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) { struct mem_cgroup *mi; @@ -5118,9 +5129,9 @@ static int memory_stat_show(struct seq_file *m, void *v) /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + events[MEM_CGROUP_EVENTS_PGFAULT]); seq_printf(m, "pgmajfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + events[MEM_CGROUP_EVENTS_PGMAJFAULT]); return 0; } -- cgit v1.2.3 From 27ee57c93ff00b8a2d6c6dd6b0b3dddda7b43b77 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:35 -0700 Subject: mm: memcontrol: report slab usage in cgroup2 memory.stat Show how much memory is used for storing reclaimable and unreclaimable in-kernel data structures allocated from slab caches. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 15 +++++++++++++++ include/linux/memcontrol.h | 21 +++++++++++++++++++++ mm/memcontrol.c | 8 ++++++++ mm/slab.c | 8 +++++--- mm/slab.h | 30 ++++++++++++++++++++++++++++-- mm/slub.c | 3 ++- 6 files changed, 79 insertions(+), 6 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index ff49cf901148..e4e0c1d78cee 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -843,6 +843,11 @@ PAGE_SIZE multiple when read back. Amount of memory used to cache filesystem data, including tmpfs and shared memory. + slab + + Amount of memory used for storing in-kernel data + structures. + sock Amount of memory used in network transmission buffers @@ -871,6 +876,16 @@ PAGE_SIZE multiple when read back. on the internal memory management lists used by the page reclaim algorithm + slab_reclaimable + + Part of "slab" that might be reclaimed, such as + dentries and inodes. + + slab_unreclaimable + + Part of "slab" that cannot be reclaimed on memory + pressure. + pgfault Total number of page faults incurred diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f0c4bec6565b..e7af4834ffea 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -53,6 +53,8 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, + MEMCG_SLAB_RECLAIMABLE, + MEMCG_SLAB_UNRECLAIMABLE, MEMCG_NR_STAT, }; @@ -883,6 +885,20 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) if (memcg_kmem_enabled()) __memcg_kmem_put_cache(cachep); } + +/** + * memcg_kmem_update_page_stat - update kmem page state statistics + * @page: the page + * @idx: page state item to account + * @val: number of pages (positive or negative) + */ +static inline void memcg_kmem_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, int val) +{ + if (memcg_kmem_enabled() && page->mem_cgroup) + this_cpu_add(page->mem_cgroup->stat->count[idx], val); +} + #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -928,6 +944,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } + +static inline void memcg_kmem_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, int val) +{ +} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 430266071c36..3ad64bf464fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,9 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "slab %llu\n", + (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5126,6 +5129,11 @@ static int memory_stat_show(struct seq_file *m, void *v) mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); } + seq_printf(m, "slab_reclaimable %llu\n", + (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + seq_printf(m, "slab_unreclaimable %llu\n", + (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", diff --git a/mm/slab.c b/mm/slab.c index 852fc5c79829..56dd0df2a8ce 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1442,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, */ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - const unsigned long nr_freed = (1 << cachep->gfporder); + int order = cachep->gfporder; + unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, cachep->gfporder); + kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), @@ -1461,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_kmem_pages(page, cachep->gfporder); + memcg_uncharge_slab(page, order, cachep); + __free_pages(page, order); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index b7934361f026..ff39a8fc3b3f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -246,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + int ret; + if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return __memcg_kmem_charge_memcg(page, gfp, order, - s->memcg_params.memcg); + + ret = __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); + if (ret) + return ret; + + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + 1 << order); + return 0; +} + +static __always_inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + -(1 << order)); + memcg_kmem_uncharge(page, order); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -294,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, return 0; } +static inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ +} + static inline void slab_init_memcg_params(struct kmem_cache *s) { } diff --git a/mm/slub.c b/mm/slub.c index 6c91324f9370..712d53474082 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1540,7 +1540,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_kmem_pages(page, order); + memcg_uncharge_slab(page, order, s); + __free_pages(page, order); } #define need_reserve_slab_rcu \ -- cgit v1.2.3 From 12580e4b54ba8a1b22ec977c200be0174ca42348 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 17 Mar 2016 14:17:38 -0700 Subject: mm: memcontrol: report kernel stack usage in cgroup2 memory.stat Show how much memory is allocated to kernel stacks. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 4 ++++ include/linux/memcontrol.h | 3 ++- kernel/fork.c | 10 +++++++++- mm/memcontrol.c | 2 ++ 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e4e0c1d78cee..e2f4e7948a66 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -843,6 +843,10 @@ PAGE_SIZE multiple when read back. Amount of memory used to cache filesystem data, including tmpfs and shared memory. + kernel_stack + + Amount of memory allocated to kernel stacks. + slab Amount of memory used for storing in-kernel data diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e7af4834ffea..d6300313b298 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,9 +52,10 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, /* default hierarchy stats */ - MEMCG_SOCK = MEM_CGROUP_STAT_NSTATS, + MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, + MEMCG_SOCK, MEMCG_NR_STAT, }; diff --git a/kernel/fork.c b/kernel/fork.c index 2e391c754ae7..accb7221d547 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,12 +164,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (page) + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + 1 << THREAD_SIZE_ORDER); + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(ti); + + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + -(1 << THREAD_SIZE_ORDER)); + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3ad64bf464fd..4b7dda7c2e74 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5106,6 +5106,8 @@ static int memory_stat_show(struct seq_file *m, void *v) (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "kernel_stack %llu\n", + (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); -- cgit v1.2.3 From 832fc1de01aea28255cb11d270679b7f1273f0d7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 17 Mar 2016 14:17:41 -0700 Subject: /proc/kpageflags: return KPF_BUDDY for "tail" buddy pages Currently /proc/kpageflags returns nothing for "tail" buddy pages, which is inconvenient when grasping how free pages are distributed. This patch sets KPF_BUDDY for such pages. With this patch: $ grep MemFree /proc/meminfo ; tools/vm/page-types -b buddy MemFree: 3134992 kB flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000400 779272 3044 __________B_______________________________ buddy 0x0000000000000c00 4385 17 __________BM______________________________ buddy,mmap total 783657 3061 783657 pages is 3134628 kB (roughly consistent with the global counter,) so it's OK. [akpm@linux-foundation.org: update comment, per Naoya] Signed-off-by: Naoya Horiguchi Reviewed-by: Vladimir Davydov > Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 6 ++++-- include/linux/page-flags.h | 2 ++ mm/internal.h | 3 --- mm/page_alloc.c | 2 -- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index b2855eea5405..0be626d85331 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page) * pseudo flags for the well known (anonymous) memory mapped pages * * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the - * simple test in page_mapcount() is not enough. + * simple test in page_mapped() is not enough. */ - if (!PageSlab(page) && page_mapcount(page)) + if (!PageSlab(page) && page_mapped(page)) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; @@ -148,6 +148,8 @@ u64 stable_page_flags(struct page *page) */ if (PageBuddy(page)) u |= 1 << KPF_BUDDY; + else if (page_count(page) == 0 && is_free_buddy_page(page)) + u |= 1 << KPF_BUDDY; if (PageBalloon(page)) u |= 1 << KPF_BALLOON; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 19724e6ebd26..597695523679 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -593,6 +593,8 @@ static inline void __ClearPageBuddy(struct page *page) atomic_set(&page->_mapcount, -1); } +extern bool is_free_buddy_page(struct page *page); + #define PAGE_BALLOON_MAPCOUNT_VALUE (-256) static inline int PageBalloon(struct page *page) diff --git a/mm/internal.h b/mm/internal.h index ad9400d759c8..b95952c2faec 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -148,9 +148,6 @@ extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); -#ifdef CONFIG_MEMORY_FAILURE -extern bool is_free_buddy_page(struct page *page); -#endif extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c46b75d14b6f..c7332a4bc8db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7152,7 +7152,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif -#ifdef CONFIG_MEMORY_FAILURE bool is_free_buddy_page(struct page *page) { struct zone *zone = page_zone(page); @@ -7171,4 +7170,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } -#endif -- cgit v1.2.3 From 0a71649cb724ab97df26baa7731ac31d2364bfe5 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 17 Mar 2016 14:17:44 -0700 Subject: /proc/kpageflags: return KPF_SLAB for slab tail pages Currently /proc/kpageflags returns just KPF_COMPOUND_TAIL for slab tail pages, which is inconvenient when grasping how slab pages are distributed (userspace always needs to check which kind of tail pages by itself). This patch sets KPF_SLAB for such pages. With this patch: $ grep Slab /proc/meminfo ; tools/vm/page-types -b slab Slab: 64880 kB flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000080 16220 63 _______S__________________________________ slab total 16220 63 16220 pages equals to 64880 kB, so returned result is consistent with the global counter. Signed-off-by: Naoya Horiguchi Reviewed-by: Vladimir Davydov Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/proc/page.c b/fs/proc/page.c index 0be626d85331..712f1b9992cc 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -160,6 +160,8 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + if (PageTail(page) && PageSlab(compound_head(page))) + u |= 1 << KPF_SLAB; u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); -- cgit v1.2.3 From 0335ddd34f39569a32096084bf3b0960d2b1212b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 17 Mar 2016 14:17:47 -0700 Subject: tools/vm/page-types.c: support swap entry /proc/pid/pagemap (pte_to_pagemap_entry() internally) already reports about swap entry, so let's make the in-kernel utility aware of it. Signed-off-by: Naoya Horiguchi Cc: Vladimir Davydov Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page-types.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 5a6016224bb9..ec62ab4d8b55 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -61,6 +61,8 @@ #define PM_PFRAME_BITS 55 #define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1) #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) +#define MAX_SWAPFILES_SHIFT 5 +#define PM_SWAP_OFFSET(x) (((x) & PM_PFRAME_MASK) >> MAX_SWAPFILES_SHIFT) #define PM_SOFT_DIRTY (1ULL << 55) #define PM_MMAP_EXCLUSIVE (1ULL << 56) #define PM_FILE (1ULL << 61) @@ -92,7 +94,8 @@ #define KPF_SLOB_FREE 49 #define KPF_SLUB_FROZEN 50 #define KPF_SLUB_DEBUG 51 -#define KPF_FILE 62 +#define KPF_FILE 61 +#define KPF_SWAP 62 #define KPF_MMAP_EXCLUSIVE 63 #define KPF_ALL_BITS ((uint64_t)~0ULL) @@ -146,6 +149,7 @@ static const char * const page_flag_names[] = { [KPF_SLUB_DEBUG] = "E:slub_debug", [KPF_FILE] = "F:file", + [KPF_SWAP] = "w:swap", [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive", }; @@ -297,6 +301,10 @@ static unsigned long pagemap_pfn(uint64_t val) return pfn; } +static unsigned long pagemap_swap_offset(uint64_t val) +{ + return val & PM_SWAP ? PM_SWAP_OFFSET(val) : 0; +} /* * page flag names @@ -452,6 +460,8 @@ static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme) flags |= BIT(SOFTDIRTY); if (pme & PM_FILE) flags |= BIT(FILE); + if (pme & PM_SWAP) + flags |= BIT(SWAP); if (pme & PM_MMAP_EXCLUSIVE) flags |= BIT(MMAP_EXCLUSIVE); @@ -613,6 +623,22 @@ static void walk_pfn(unsigned long voffset, } } +static void walk_swap(unsigned long voffset, uint64_t pme) +{ + uint64_t flags = kpageflags_flags(0, pme); + + if (!bit_mask_ok(flags)) + return; + + if (opt_list == 1) + show_page_range(voffset, pagemap_swap_offset(pme), 1, flags); + else if (opt_list == 2) + show_page(voffset, pagemap_swap_offset(pme), flags); + + nr_pages[hash_slot(flags)]++; + total_pages++; +} + #define PAGEMAP_BATCH (64 << 10) static void walk_vma(unsigned long index, unsigned long count) { @@ -632,6 +658,8 @@ static void walk_vma(unsigned long index, unsigned long count) pfn = pagemap_pfn(buf[i]); if (pfn) walk_pfn(index + i, pfn, 1, buf[i]); + if (buf[i] & PM_SWAP) + walk_swap(index + i, buf[i]); } index += pages; -- cgit v1.2.3 From f48d97f340cbb0c323fa7a7b36bd76a108a9f49f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:49 -0700 Subject: mm/vmalloc: query dynamic DEBUG_PAGEALLOC setting As CONFIG_DEBUG_PAGEALLOC can be enabled/disabled via kernel parameters we can optimize some cases by checking the enablement state. This is follow-up work for Christian's Optimize CONFIG_DEBUG_PAGEALLOC: https://lkml.org/lkml/2016/1/27/194 Remaining work is to make sparc to be aware of this but it looks not easy for me so I skip that in this series. This patch (of 5): We can disable debug_pagealloc processing even if the code is complied with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: update comment, per David. Adjust comment to use 80 cols] Signed-off-by: Joonsoo Kim Reviewed-by: Christian Borntraeger Acked-by: David Rientjes Cc: Benjamin Herrenschmidt Cc: Takashi Iwai Cc: Chris Metcalf Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fb42a5bffe47..d4b2e34adae0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va) static void vmap_debug_free_range(unsigned long start, unsigned long end) { /* - * Unmap page tables and force a TLB flush immediately if - * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free - * bugs similarly to those in linear kernel virtual address - * space after a page has been freed. + * Unmap page tables and force a TLB flush immediately if pagealloc + * debugging is enabled. This catches use after free bugs similarly to + * those in linear kernel virtual address space after a page has been + * freed. * - * All the lazy freeing logic is still retained, in order to - * minimise intrusiveness of this debugging feature. + * All the lazy freeing logic is still retained, in order to minimise + * intrusiveness of this debugging feature. * - * This is going to be *slow* (linear kernel virtual address - * debugging doesn't do a broadcast TLB flush so it is a lot - * faster). + * This is going to be *slow* (linear kernel virtual address debugging + * doesn't do a broadcast TLB flush so it is a lot faster). */ -#ifdef CONFIG_DEBUG_PAGEALLOC - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); -#endif + if (debug_pagealloc_enabled()) { + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); + } } /* -- cgit v1.2.3 From 922d566cdcb7166c729ff67bb15ff5f93a3774b6 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:53 -0700 Subject: mm/slub: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: clean up code, per Christian] Signed-off-by: Joonsoo Kim Reviewed-by: Christian Borntraeger Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 712d53474082..2f2f04d39104 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -254,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { void *p; -#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return get_freepointer(s, object); + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); -#else - p = get_freepointer(s, object); -#endif return p; } -- cgit v1.2.3 From 505f6d22dbc63f333d1178dc80264e40b5c35268 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:56 -0700 Subject: sound: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. [akpm@linux-foundation.org: export _debug_pagealloc_enabled to modules] Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Acked-by: Takashi Iwai Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + sound/drivers/pcsp/pcsp.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7332a4bc8db..b1fc19ebb8a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -498,6 +498,7 @@ void prep_compound_page(struct page *page, unsigned int order) unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c index 27e25bb78c97..72e2d0012084 100644 --- a/sound/drivers/pcsp/pcsp.c +++ b/sound/drivers/pcsp/pcsp.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "pcsp_input.h" #include "pcsp.h" @@ -148,11 +149,11 @@ static int alsa_card_pcsp_init(struct device *dev) return err; } -#ifdef CONFIG_DEBUG_PAGEALLOC /* Well, CONFIG_DEBUG_PAGEALLOC makes the sound horrible. Lets alert */ - printk(KERN_WARNING "PCSP: CONFIG_DEBUG_PAGEALLOC is enabled, " - "which may make the sound noisy.\n"); -#endif + if (debug_pagealloc_enabled()) { + printk(KERN_WARNING "PCSP: CONFIG_DEBUG_PAGEALLOC is enabled, " + "which may make the sound noisy.\n"); + } return 0; } -- cgit v1.2.3 From e7df0d88c455c915376397b4bd72a83b9ed656f7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:17:59 -0700 Subject: powerpc: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. Signed-off-by: Joonsoo Kim Acked-by: David Rientjes Cc: Christian Borntraeger Cc: Benjamin Herrenschmidt Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/traps.c | 5 ++--- arch/powerpc/mm/hash_utils_64.c | 36 ++++++++++++++++++++---------------- arch/powerpc/mm/init_32.c | 8 ++++---- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index b6becc795bb5..33c47fcc455a 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -203,9 +203,8 @@ static int __kprobes __die(const char *str, struct pt_regs *regs, long err) #ifdef CONFIG_SMP printk("SMP NR_CPUS=%d ", NR_CPUS); #endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC "); -#endif + if (debug_pagealloc_enabled()) + printk("DEBUG_PAGEALLOC "); #ifdef CONFIG_NUMA printk("NUMA "); #endif diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index ba59d5977f34..1005281be9a6 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -255,8 +255,10 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, if (ret < 0) break; + #ifdef CONFIG_DEBUG_PAGEALLOC - if ((paddr >> PAGE_SHIFT) < linear_map_hash_count) + if (debug_pagealloc_enabled() && + (paddr >> PAGE_SHIFT) < linear_map_hash_count) linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; #endif /* CONFIG_DEBUG_PAGEALLOC */ } @@ -512,17 +514,17 @@ static void __init htab_init_page_sizes(void) if (mmu_has_feature(MMU_FTR_16M_PAGE)) memcpy(mmu_psize_defs, mmu_psize_defaults_gp, sizeof(mmu_psize_defaults_gp)); - found: -#ifndef CONFIG_DEBUG_PAGEALLOC - /* - * Pick a size for the linear mapping. Currently, we only support - * 16M, 1M and 4K which is the default - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - mmu_linear_psize = MMU_PAGE_16M; - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - mmu_linear_psize = MMU_PAGE_1M; -#endif /* CONFIG_DEBUG_PAGEALLOC */ +found: + if (!debug_pagealloc_enabled()) { + /* + * Pick a size for the linear mapping. Currently, we only + * support 16M, 1M and 4K which is the default + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + mmu_linear_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + mmu_linear_psize = MMU_PAGE_1M; + } #ifdef CONFIG_PPC_64K_PAGES /* @@ -721,10 +723,12 @@ static void __init htab_initialize(void) prot = pgprot_val(PAGE_KERNEL); #ifdef CONFIG_DEBUG_PAGEALLOC - linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; - linear_map_hash_slots = __va(memblock_alloc_base(linear_map_hash_count, - 1, ppc64_rma_size)); - memset(linear_map_hash_slots, 0, linear_map_hash_count); + if (debug_pagealloc_enabled()) { + linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = __va(memblock_alloc_base( + linear_map_hash_count, 1, ppc64_rma_size)); + memset(linear_map_hash_slots, 0, linear_map_hash_count); + } #endif /* CONFIG_DEBUG_PAGEALLOC */ /* On U3 based machines, we need to reserve the DART area and diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index a10be665b645..c2b771614d4f 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -112,10 +112,10 @@ void __init MMU_setup(void) if (strstr(boot_command_line, "noltlbs")) { __map_without_ltlbs = 1; } -#ifdef CONFIG_DEBUG_PAGEALLOC - __map_without_bats = 1; - __map_without_ltlbs = 1; -#endif + if (debug_pagealloc_enabled()) { + __map_without_bats = 1; + __map_without_ltlbs = 1; + } } /* -- cgit v1.2.3 From 21c647865a7d7b810aa94c32b40a4b9393ddfb85 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 17 Mar 2016 14:18:02 -0700 Subject: tile: query dynamic DEBUG_PAGEALLOC setting We can disable debug_pagealloc processing even if the code is compiled with CONFIG_DEBUG_PAGEALLOC. This patch changes the code to query whether it is enabled or not in runtime. Signed-off-by: Joonsoo Kim Cc: Benjamin Herrenschmidt Acked-by: Chris Metcalf Cc: Christian Borntraeger Cc: Christoph Lameter Cc: David Rientjes Cc: Pekka Enberg Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/mm/init.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index d4e1fc41d06d..a0582b7f41d3 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -896,17 +896,15 @@ void __init pgtable_cache_init(void) panic("pgtable_cache_init(): Cannot create pgd cache"); } -#ifdef CONFIG_DEBUG_PAGEALLOC -static long __write_once initfree; -#else static long __write_once initfree = 1; -#endif +static bool __write_once set_initfree_done; /* Select whether to free (1) or mark unusable (0) the __init pages. */ static int __init set_initfree(char *str) { long val; if (kstrtol(str, 0, &val) == 0) { + set_initfree_done = true; initfree = val; pr_info("initfree: %s free init pages\n", initfree ? "will" : "won't"); @@ -919,6 +917,11 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end) { unsigned long addr = (unsigned long) begin; + /* Prefer user request first */ + if (!set_initfree_done) { + if (debug_pagealloc_enabled()) + initfree = 0; + } if (kdata_huge && !initfree) { pr_warn("Warning: ignoring initfree=0: incompatible with kdata=huge\n"); initfree = 1; -- cgit v1.2.3 From 81c5857b279e6b18f6ff0d1975e80a07af542cd1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:05 -0700 Subject: mm, kswapd: remove bogus check of balance_classzone_idx During work on kcompactd integration I have spotted a confusing check of balance_classzone_idx, which I believe is bogus. The balanced_classzone_idx is filled by balance_pgdat() as the highest zone it attempted to balance. This was introduced by commit dc83edd941f4 ("mm: kswapd: use the classzone idx that kswapd was using for sleeping_prematurely()"). The intention is that (as expressed in today's function names), the value used for kswapd_shrink_zone() calls in balance_pgdat() is the same as for the decisions in kswapd_try_to_sleep(). An unwanted side-effect of that commit was breaking the checks in kswapd() whether there was another kswapd_wakeup with a tighter (=lower) classzone_idx. Commits 215ddd6664ce ("mm: vmscan: only read new_classzone_idx from pgdat when reclaiming successfully") and d2ebd0f6b895 ("kswapd: avoid unnecessary rebalance after an unsuccessful balancing") tried to fixed, but apparently introduced a bogus check that this patch removes. Consider zone indexes X < Y < Z, where: - Z is the value used for the first kswapd wakeup. - Y is returned as balanced_classzone_idx, which means zones with index higher than Y (including Z) were found to be unreclaimable. - X is the value used for the second kswapd wakeup The new wakeup with value X means that kswapd is now supposed to balance harder all zones with index <= X. But instead, due to Y < Z, it will go sleep and won't read the new value X. This is subtly wrong. The effect of this patch is that kswapd will react better in some situations, where e.g. the first wakeup is for ZONE_DMA32, the second is for ZONE_DMA, and due to unreclaimable ZONE_NORMAL. Before this patch, kswapd would go sleep instead of reclaiming ZONE_DMA harder. I expect these situations are very rare, and more value is in better maintainability due to the removal of confusing and bogus check. Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index dd984470248f..5dcc71140108 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3468,8 +3468,7 @@ static int kswapd(void *p) * new request of a similar or harder type will succeed soon * so consider going to sleep on the basis we reclaimed at */ - if (balanced_classzone_idx >= new_classzone_idx && - balanced_order == new_order) { + if (balanced_order == new_order) { new_order = pgdat->kswapd_max_order; new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; -- cgit v1.2.3 From 698b1b30642f1ff0ea10ef1de9745ab633031377 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 17 Mar 2016 14:18:08 -0700 Subject: mm, compaction: introduce kcompactd Memory compaction can be currently performed in several contexts: - kswapd balancing a zone after a high-order allocation failure - direct compaction to satisfy a high-order allocation, including THP page fault attemps - khugepaged trying to collapse a hugepage - manually from /proc The purpose of compaction is two-fold. The obvious purpose is to satisfy a (pending or future) high-order allocation, and is easy to evaluate. The other purpose is to keep overal memory fragmentation low and help the anti-fragmentation mechanism. The success wrt the latter purpose is more The current situation wrt the purposes has a few drawbacks: - compaction is invoked only when a high-order page or hugepage is not available (or manually). This might be too late for the purposes of keeping memory fragmentation low. - direct compaction increases latency of allocations. Again, it would be better if compaction was performed asynchronously to keep fragmentation low, before the allocation itself comes. - (a special case of the previous) the cost of compaction during THP page faults can easily offset the benefits of THP. - kswapd compaction appears to be complex, fragile and not working in some scenarios. It could also end up compacting for a high-order allocation request when it should be reclaiming memory for a later order-0 request. To improve the situation, we should be able to benefit from an equivalent of kswapd, but for compaction - i.e. a background thread which responds to fragmentation and the need for high-order allocations (including hugepages) somewhat proactively. One possibility is to extend the responsibilities of kswapd, which could however complicate its design too much. It should be better to let kswapd handle reclaim, as order-0 allocations are often more critical than high-order ones. Another possibility is to extend khugepaged, but this kthread is a single instance and tied to THP configs. This patch goes with the option of a new set of per-node kthreads called kcompactd, and lays the foundations, without introducing any new tunables. The lifecycle mimics kswapd kthreads, including the memory hotplug hooks. For compaction, kcompactd uses the standard compaction_suitable() and ompact_finished() criteria and the deferred compaction functionality. Unlike direct compaction, it uses only sync compaction, as there's no allocation latency to minimize. This patch doesn't yet add a call to wakeup_kcompactd. The kswapd compact/reclaim loop for high-order pages will be replaced by waking up kcompactd in the next patch with the description of what's wrong with the old approach. Waking up of the kcompactd threads is also tied to kswapd activity and follows these rules: - we don't want to affect any fastpaths, so wake up kcompactd only from the slowpath, as it's done for kswapd - if kswapd is doing reclaim, it's more important than compaction, so don't invoke kcompactd until kswapd goes to sleep - the target order used for kswapd is passed to kcompactd Future possible future uses for kcompactd include the ability to wake up kcompactd on demand in special situations, such as when hugepages are not available (currently not done due to __GFP_NO_KSWAPD) or when a fragmentation event (i.e. __rmqueue_fallback()) occurs. It's also possible to perform periodic compaction with kcompactd. [arnd@arndb.de: fix build errors with kcompactd] [paul.gortmaker@windriver.com: don't use modular references for non modular code] Signed-off-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Joonsoo Kim Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Arnd Bergmann Signed-off-by: Paul Gortmaker Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 16 +++ include/linux/mmzone.h | 6 ++ include/linux/vm_event_item.h | 1 + include/trace/events/compaction.h | 55 ++++++++++ mm/compaction.c | 222 ++++++++++++++++++++++++++++++++++++++ mm/memory_hotplug.c | 9 +- mm/page_alloc.c | 3 + mm/vmstat.c | 1 + 8 files changed, 311 insertions(+), 2 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4cd4ddf64cc7..d7c8de583a23 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -52,6 +52,10 @@ extern void compaction_defer_reset(struct zone *zone, int order, bool alloc_success); extern bool compaction_restarting(struct zone *zone, int order); +extern int kcompactd_run(int nid); +extern void kcompactd_stop(int nid); +extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); + #else static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, @@ -84,6 +88,18 @@ static inline bool compaction_deferred(struct zone *zone, int order) return true; } +static inline int kcompactd_run(int nid) +{ + return 0; +} +static inline void kcompactd_stop(int nid) +{ +} + +static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ +} + #endif /* CONFIG_COMPACTION */ #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6de02ac378a0..bdd9a270a813 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -668,6 +668,12 @@ typedef struct pglist_data { mem_hotplug_begin/end() */ int kswapd_max_order; enum zone_type classzone_idx; +#ifdef CONFIG_COMPACTION + int kcompactd_max_order; + enum zone_type kcompactd_classzone_idx; + wait_queue_head_t kcompactd_wait; + struct task_struct *kcompactd; +#endif #ifdef CONFIG_NUMA_BALANCING /* Lock serializing the migrate rate limiting window */ spinlock_t numabalancing_migrate_lock; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 67c1dbd19c6d..58ecc056ee45 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -53,6 +53,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, COMPACTISOLATED, COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, + KCOMPACTD_WAKE, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 111e5666e5eb..e215bf68f521 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -350,6 +350,61 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, ); #endif +TRACE_EVENT(mm_compaction_kcompactd_sleep, + + TP_PROTO(int nid), + + TP_ARGS(nid), + + TP_STRUCT__entry( + __field(int, nid) + ), + + TP_fast_assign( + __entry->nid = nid; + ), + + TP_printk("nid=%d", __entry->nid) +); + +DECLARE_EVENT_CLASS(kcompactd_wake_template, + + TP_PROTO(int nid, int order, enum zone_type classzone_idx), + + TP_ARGS(nid, order, classzone_idx), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, order) + __field(enum zone_type, classzone_idx) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->order = order; + __entry->classzone_idx = classzone_idx; + ), + + TP_printk("nid=%d order=%d classzone_idx=%-8s", + __entry->nid, + __entry->order, + __print_symbolic(__entry->classzone_idx, ZONE_TYPE)) +); + +DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, + + TP_PROTO(int nid, int order, enum zone_type classzone_idx), + + TP_ARGS(nid, order, classzone_idx) +); + +DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, + + TP_PROTO(int nid, int order, enum zone_type classzone_idx), + + TP_ARGS(nid, order, classzone_idx) +); + #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ diff --git a/mm/compaction.c b/mm/compaction.c index 93f71d968098..5b2bfbaa821a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -7,6 +7,7 @@ * * Copyright IBM Corp. 2007-2010 Mel Gorman */ +#include #include #include #include @@ -17,6 +18,8 @@ #include #include #include +#include +#include #include "internal.h" #ifdef CONFIG_COMPACTION @@ -1736,4 +1739,223 @@ void compaction_unregister_node(struct node *node) } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0; +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + + for (zoneid = 0; zoneid < classzone_idx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if (!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + classzone_idx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + + }; + bool success = false; + + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.classzone_idx); + count_vm_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + status = compact_zone(zone, &cc); + + if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), + cc.classzone_idx, 0)) { + success = true; + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_COMPLETE) { + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. But remember + * the requested order/classzone_idx in case it was higher/tighter than + * our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_classzone_idx > classzone_idx) + pgdat->kcompactd_classzone_idx = classzone_idx; + + if (!waitqueue_active(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + classzone_idx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. + */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + + kcompactd_do_work(pgdat); + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. + */ +int kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kcompactd) + return 0; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + ret = PTR_ERR(pgdat->kcompactd); + pgdat->kcompactd = NULL; + } + return ret; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings. + */ +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + } + return NOTIFY_OK; +} + +static int __init kcompactd_init(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} +subsys_initcall(kcompactd_init) + #endif /* CONFIG_COMPACTION */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 24ea06393816..d9bcb26fc4df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -1