From e04b742f74c236202b7a505c2688068969d00e65 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:27 -0800 Subject: kexec: export PG_offline to VMCOREINFO Right now, pages inflated as part of a balloon driver will be dumped by dump tools like makedumpfile. While XEN is able to check in the crash kernel whether a certain pfn is actuall backed by memory in the hypervisor (see xen_oldmem_pfn_is_ram) and optimize this case, dumps of other balloon inflated memory will essentially result in zero pages getting allocated by the hypervisor and the dump getting filled with this data. The allocation and reading of zero pages can directly be avoided if a dumping tool could know which pages only contain stale information not to be dumped. We now have PG_offline which can be (and already is by virtio-balloon) used for marking pages as logically offline. Follow up patches will make use of this flag also in other balloon implementations. Let's export PG_offline via PAGE_OFFLINE_MAPCOUNT_VALUE, so makedumpfile can directly skip pages that are logically offline and the content therefore stale. Please note that this is also helpful for a problem we were seeing under Hyper-V: Dumping logically offline memory (pages kept fake offline while onlining a section via online_page_callback) would under some condicions result in a kernel panic when dumping them. Link: http://lkml.kernel.org/r/20181119101616.8901-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michael S. Tsirkin Acked-by: Dave Young Cc: "Kirill A. Shutemov" Cc: Baoquan He Cc: Omar Sandoval Cc: Arnd Bergmann Cc: Matthew Wilcox Cc: Michal Hocko Cc: Lianbo Jiang Cc: Borislav Petkov Cc: Kazuhito Hagio Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Boris Ostrovsky Cc: Christian Hansen Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Michal Hocko Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Pankaj gupta Cc: Pavel Machek Cc: Pavel Tatashin Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 933cb3e45b98..093c9f917ed0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif arch_crash_save_vmcoreinfo(); -- cgit v1.2.3 From 5b56db37218e6503906c6057c177a84f0a0ba551 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:45 -0800 Subject: PM/Hibernate: use pfn_to_online_page() Let's use pfn_to_online_page() instead of pfn_to_page() when checking for saveable pages to not save/restore offline memory sections. Link: http://lkml.kernel.org/r/20181119101616.8901-8-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Cc: Len Brown Cc: Matthew Wilcox Cc: "Michael S. Tsirkin" Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Hansen Cc: Dave Young Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: Kazuhito Hagio Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Lianbo Jiang Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Omar Sandoval Cc: Pankaj gupta Cc: Pavel Tatashin Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 640b2034edd6..87e6dd57819f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1215,8 +1215,8 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(!PageHighMem(page)); @@ -1277,8 +1277,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(PageHighMem(page)); -- cgit v1.2.3 From abd02ac616e32d818a0478e68924beac8ba5e5d8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:50 -0800 Subject: PM/Hibernate: exclude all PageOffline() pages The content of pages that are marked PG_offline is not of interest (e.g. inflated by a balloon driver), let's skip these pages. In saveable_highmem_page(), move the PageReserved() check to a new check along with the PageOffline() check to separate it from the swsusp checks. [david@redhat.com: v2] Link: http://lkml.kernel.org/r/20181122100627.5189-9-david@redhat.com Link: http://lkml.kernel.org/r/20181119101616.8901-9-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Cc: Pavel Machek Cc: Len Brown Cc: Matthew Wilcox Cc: Michal Hocko Cc: "Michael S. Tsirkin" Cc: Alexander Duyck Cc: Alexey Dobriyan Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Hansen Cc: Dave Young Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Jonathan Corbet Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: Kazuhito Hagio Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: "K. Y. Srinivasan" Cc: Lianbo Jiang Cc: Michal Hocko Cc: Mike Rapoport Cc: Miles Chen Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Omar Sandoval Cc: Pankaj gupta Cc: Pavel Tatashin Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Stephen Rothwell Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 87e6dd57819f..4802b039b89f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1221,8 +1221,10 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) BUG_ON(!PageHighMem(page)); - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) return NULL; if (page_is_guard(page)) @@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; + if (PageOffline(page)) + return NULL; + if (PageReserved(page) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; -- cgit v1.2.3 From 98fa15f34cb379864757670b8e8743b21456a20e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 5 Mar 2019 15:42:58 -0800 Subject: mm: replace all open encodings for NUMA_NO_NODE Patch series "Replace all open encodings for NUMA_NO_NODE", v3. All these places for replacement were found by running the following grep patterns on the entire kernel code. Please let me know if this might have missed some instances. This might also have replaced some false positives. I will appreciate suggestions, inputs and review. 1. git grep "nid == -1" 2. git grep "node == -1" 3. git grep "nid = -1" 4. git grep "node = -1" This patch (of 2): At present there are multiple places where invalid node number is encoded as -1. Even though implicitly understood it is always better to have macros in there. Replace these open encodings for an invalid node number with the global macro NUMA_NO_NODE. This helps remove NUMA related assumptions like 'invalid node' from various places redirecting them to a common definition. Link: http://lkml.kernel.org/r/1545127933-10711-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: David Hildenbrand Acked-by: Jeff Kirsher [ixgbe] Acked-by: Jens Axboe [mtip32xx] Acked-by: Vinod Koul [dmaengine.c] Acked-by: Michael Ellerman [powerpc] Acked-by: Doug Ledford [drivers/infiniband] Cc: Joseph Qi Cc: Hans Verkuil Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 3 ++- kernel/sched/fair.c | 15 ++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 087d18d771b5..ebebbcf3c5de 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -20,6 +20,7 @@ #include #include #include +#include #include static DEFINE_SPINLOCK(kthread_create_lock); @@ -675,7 +676,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; - int node = -1; + int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 310d0637fe4b..0e6a0ef129c5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1160,7 +1160,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = -1; + p->numa_preferred_nid = NUMA_NO_NODE; return; } @@ -1180,13 +1180,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } @@ -1400,7 +1400,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * two full passes of the "multi-stage node selection" test that is * executed below. */ - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) return true; @@ -1848,7 +1848,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -2095,7 +2095,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; + int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; @@ -2638,7 +2638,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) * the preferred node. */ if (dst_nid == p->numa_preferred_nid || - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) return; } -- cgit v1.2.3 From 6b7e5cad651a2b1031a4c69a98f87e3532dd4cef Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 5 Mar 2019 15:43:41 -0800 Subject: mm: remove sysctl_extfrag_handler() sysctl_extfrag_handler() neglects to propagate the return value from proc_dointvec_minmax() to its caller. It's a wrapper that doesn't need to exist, so just use proc_dointvec_minmax() directly. Link: http://lkml.kernel.org/r/20190104032557.3056-1-willy@infradead.org Signed-off-by: Matthew Wilcox Reported-by: Aditya Pakki Acked-by: Mel Gorman Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7578e21a711b..9c78e06f7ba4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1460,7 +1460,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = sysctl_extfrag_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, -- cgit v1.2.3 From 5e1f0f098b4649fad53011246bcaeff011ffdf5d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 5 Mar 2019 15:45:41 -0800 Subject: mm, compaction: capture a page under direct compaction Compaction is inherently race-prone as a suitable page freed during compaction can be allocated by any parallel task. This patch uses a capture_control structure to isolate a page immediately when it is freed by a direct compactor in the slow path of the page allocator. The intent is to avoid redundant scanning. 5.0.0-rc1 5.0.0-rc1 selective-v3r17 capture-v3r19 Amean fault-both-1 0.00 ( 0.00%) 0.00 * 0.00%* Amean fault-both-3 2582.11 ( 0.00%) 2563.68 ( 0.71%) Amean fault-both-5 4500.26 ( 0.00%) 4233.52 ( 5.93%) Amean fault-both-7 5819.53 ( 0.00%) 6333.65 ( -8.83%) Amean fault-both-12 9321.18 ( 0.00%) 9759.38 ( -4.70%) Amean fault-both-18 9782.76 ( 0.00%) 10338.76 ( -5.68%) Amean fault-both-24 15272.81 ( 0.00%) 13379.55 * 12.40%* Amean fault-both-30 15121.34 ( 0.00%) 16158.25 ( -6.86%) Amean fault-both-32 18466.67 ( 0.00%) 18971.21 ( -2.73%) Latency is only moderately affected but the devil is in the details. A closer examination indicates that base page fault latency is reduced but latency of huge pages is increased as it takes creater care to succeed. Part of the "problem" is that allocation success rates are close to 100% even when under pressure and compaction gets harder 5.0.0-rc1 5.0.0-rc1 selective-v3r17 capture-v3r19 Percentage huge-3 96.70 ( 0.00%) 98.23 ( 1.58%) Percentage huge-5 96.99 ( 0.00%) 95.30 ( -1.75%) Percentage huge-7 94.19 ( 0.00%) 97.24 ( 3.24%) Percentage huge-12 94.95 ( 0.00%) 97.35 ( 2.53%) Percentage huge-18 96.74 ( 0.00%) 97.30 ( 0.58%) Percentage huge-24 97.07 ( 0.00%) 97.55 ( 0.50%) Percentage huge-30 95.69 ( 0.00%) 98.50 ( 2.95%) Percentage huge-32 96.70 ( 0.00%) 99.27 ( 2.65%) And scan rates are reduced as expected by 6% for the migration scanner and 29% for the free scanner indicating that there is less redundant work. Compaction migrate scanned 20815362 19573286 Compaction free scanned 16352612 11510663 [mgorman@techsingularity.net: remove redundant check] Link: http://lkml.kernel.org/r/20190201143853.GH9565@techsingularity.net Link: http://lkml.kernel.org/r/20190118175136.31341-23-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Dan Carpenter Cc: David Rientjes Cc: YueHaibing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7cbb5658be80..916e956e92be 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2190,6 +2190,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif init_numa_balancing(clone_flags, p); } -- cgit v1.2.3 From dc50537bdd1a0804fa2cbc990565ee9a944e66fa Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 5 Mar 2019 15:45:48 -0800 Subject: kernel: cgroup: add poll file operation Cgroup has a standardized poll/notification mechanism for waking all pollers on all fds when a filesystem node changes. To allow polling for custom events, add a .poll callback that can override the default. This is in preparation for pollable cgroup pressure files which have per-fd trigger configurations. Link: http://lkml.kernel.org/r/20190124211518.244221-3-surenb@google.com Signed-off-by: Johannes Weiner Signed-off-by: Suren Baghdasaryan Cc: Dennis Zhou Cc: Ingo Molnar Cc: Jens Axboe Cc: Li Zefan Cc: Peter Zijlstra Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cgroup.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cef98502b124..17828333f7c3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3534,6 +3534,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); @@ -3572,6 +3582,7 @@ static struct kernfs_ops cgroup_kf_single_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; @@ -3580,6 +3591,7 @@ static struct kernfs_ops cgroup_kf_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, -- cgit v1.2.3