From 00c54a80cd36de5a0089e7ffd0a690c3a93e65d0 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 20 Aug 2020 17:41:56 -0700 Subject: mailmap: add Andi Kleen I keep getting bounce back from the suse.de address. Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Cc: Andi Kleen Cc: Jonathan Corbet Cc: Kees Cook Cc: Quentin Perret Link: http://lkml.kernel.org/r/20200818203214.659955-1-ndesaulniers@google.com Signed-off-by: Linus Torvalds --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 97fd835bd2a4..21f965400a28 100644 --- a/.mailmap +++ b/.mailmap @@ -32,6 +32,7 @@ Alex Shi Alex Shi Al Viro Al Viro +Andi Kleen Andi Shyti Andreas Herrmann Andrew Morton -- cgit v1.2.3 From d5a1695977cf60d3f7b387f4d18bd2fb2a9a7a01 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Thu, 20 Aug 2020 17:41:59 -0700 Subject: hugetlb_cgroup: convert comma to semicolon Replace a comma between expression statements by a semicolon. Fixes: faced7e0806cf4 ("mm: hugetlb controller for cgroups v2") Signed-off-by: Xu Wang Signed-off-by: Andrew Morton Cc: Tejun Heo Cc: Giuseppe Scrivano Link: http://lkml.kernel.org/r/20200818064333.21759-1-vulab@iscas.ac.cn Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index aabf65d4d91b..1f87aec9ab5c 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -655,7 +655,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf); cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_events_show; - cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]), + cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the events.local file */ @@ -664,7 +664,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_events_local_show; cft->file_offset = offsetof(struct hugetlb_cgroup, - events_local_file[idx]), + events_local_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; /* NULL terminate the last cft */ -- cgit v1.2.3 From f3f99d63a8156c7a4a6b20aac22b53c5579c7dc1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 20 Aug 2020 17:42:02 -0700 Subject: khugepaged: adjust VM_BUG_ON_MM() in __khugepaged_enter() syzbot crashes on the VM_BUG_ON_MM(khugepaged_test_exit(mm), mm) in __khugepaged_enter(): yes, when one thread is about to dump core, has set core_state, and is waiting for others, another might do something calling __khugepaged_enter(), which now crashes because I lumped the core_state test (known as "mmget_still_valid") into khugepaged_test_exit(). I still think it's best to lump them together, so just in this exceptional case, check mm->mm_users directly instead of khugepaged_test_exit(). Fixes: bbe98f9cadff ("khugepaged: khugepaged_test_exit() check mmget_still_valid()") Reported-by: syzbot Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Acked-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Song Liu Cc: Mike Kravetz Cc: Eric Dumazet Cc: [4.8+] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008141503370.18085@eggly.anvils Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 15a9af791014..e749e568e1ea 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -466,7 +466,7 @@ int __khugepaged_enter(struct mm_struct *mm) return -ENOMEM; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return 0; -- cgit v1.2.3 From e47110e90584a22e9980510b00d0dfad3a83354e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 20 Aug 2020 17:42:05 -0700 Subject: mm/vunmap: add cond_resched() in vunmap_pmd_range Like zap_pte_range add cond_resched so that we can avoid softlockups as reported below. On non-preemptible kernel with large I/O map region (like the one we get when using persistent memory with sector mode), an unmap of the namespace can report below softlockups. 22724.027334] watchdog: BUG: soft lockup - CPU#49 stuck for 23s! [ndctl:50777] NIP [c0000000000dc224] plpar_hcall+0x38/0x58 LR [c0000000000d8898] pSeries_lpar_hpte_invalidate+0x68/0xb0 Call Trace: flush_hash_page+0x114/0x200 hpte_need_flush+0x2dc/0x540 vunmap_page_range+0x538/0x6f0 free_unmap_vmap_area+0x30/0x70 remove_vm_area+0xfc/0x140 __vunmap+0x68/0x270 __iounmap.part.0+0x34/0x60 memunmap+0x54/0x70 release_nodes+0x28c/0x300 device_release_driver_internal+0x16c/0x280 unbind_store+0x124/0x170 drv_attr_store+0x44/0x60 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1b0/0x290 __vfs_write+0x3c/0x70 vfs_write+0xd8/0x260 ksys_write+0xdc/0x130 system_call+0x5c/0x70 Reported-by: Harish Sriram Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Link: http://lkml.kernel.org/r/20200807075933.310240-1-aneesh.kumar@linux.ibm.com Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b482d240f9a2..be4724b916b3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -104,6 +104,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next, mask); + + cond_resched(); } while (pmd++, addr = next, addr != end); } -- cgit v1.2.3 From 86f54bb7e4ff22ad5dd0b1e179610cc8bbb3bd63 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Aug 2020 17:42:08 -0700 Subject: mm/rodata_test.c: fix missing function declaration The compilation with CONFIG_DEBUG_RODATA_TEST set produces the following warning due to the missing include. mm/rodata_test.c:15:6: warning: no previous prototype for 'rodata_test' [-Wmissing-prototypes] 15 | void rodata_test(void) | ^~~~~~~~~~~ Fixes: 2959a5f726f6 ("mm: add arch-independent testcases for RODATA") Signed-off-by: Leon Romanovsky Signed-off-by: Andrew Morton Reviewed-by: Anshuman Khandual Link: https://lkml.kernel.org/r/20200819080026.918134-1-leon@kernel.org Signed-off-by: Linus Torvalds --- mm/rodata_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 2a99df7beeb3..2613371945b7 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "rodata_test: " fmt +#include #include #include -- cgit v1.2.3 From bcf85fcedfdd17911982a3e3564fcfec7b01eebd Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 20 Aug 2020 17:42:11 -0700 Subject: romfs: fix uninitialized memory leak in romfs_dev_read() romfs has a superblock field that limits the size of the filesystem; data beyond that limit is never accessed. romfs_dev_read() fetches a caller-supplied number of bytes from the backing device. It returns 0 on success or an error code on failure; therefore, its API can't represent short reads, it's all-or-nothing. However, when romfs_dev_read() detects that the requested operation would cross the filesystem size limit, it currently silently truncates the requested number of bytes. This e.g. means that when the content of a file with size 0x1000 starts one byte before the filesystem size limit, ->readpage() will only fill a single byte of the supplied page while leaving the rest uninitialized, leaking that uninitialized memory to userspace. Fix it by returning an error code instead of truncating the read when the requested read operation would go beyond the end of the filesystem. Fixes: da4458bda237 ("NOMMU: Make it possible for RomFS to use MTD devices directly") Signed-off-by: Jann Horn Signed-off-by: Andrew Morton Reviewed-by: Greg Kroah-Hartman Cc: David Howells Cc: Link: http://lkml.kernel.org/r/20200818013202.2246365-1-jannh@google.com Signed-off-by: Linus Torvalds --- fs/romfs/storage.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index 6b2b4362089e..b57b3ffcbc32 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -217,10 +217,8 @@ int romfs_dev_read(struct super_block *sb, unsigned long pos, size_t limit; limit = romfs_maxsize(sb); - if (pos >= limit) + if (pos >= limit || buflen > limit - pos) return -EIO; - if (buflen > limit - pos) - buflen = limit - pos; #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) -- cgit v1.2.3 From 71e843295c680898959b22dc877ae3839cc22470 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 20 Aug 2020 17:42:14 -0700 Subject: kernel/relay.c: fix memleak on destroy relay channel kmemleak report memory leak as follows: unreferenced object 0x607ee4e5f948 (size 8): comm "syz-executor.1", pid 2098, jiffies 4295031601 (age 288.468s) hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace: relay_open kernel/relay.c:583 [inline] relay_open+0xb6/0x970 kernel/relay.c:563 do_blk_trace_setup+0x4a8/0xb20 kernel/trace/blktrace.c:557 __blk_trace_setup+0xb6/0x150 kernel/trace/blktrace.c:597 blk_trace_ioctl+0x146/0x280 kernel/trace/blktrace.c:738 blkdev_ioctl+0xb2/0x6a0 block/ioctl.c:613 block_ioctl+0xe5/0x120 fs/block_dev.c:1871 vfs_ioctl fs/ioctl.c:48 [inline] __do_sys_ioctl fs/ioctl.c:753 [inline] __se_sys_ioctl fs/ioctl.c:739 [inline] __x64_sys_ioctl+0x170/0x1ce fs/ioctl.c:739 do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 'chan->buf' is malloced in relay_open() by alloc_percpu() but not free while destroy the relay channel. Fix it by adding free_percpu() before return from relay_destroy_channel(). Fixes: 017c59c042d0 ("relay: Use per CPU constructs for the relay channel buffer pointers") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Reviewed-by: Chris Wilson Cc: Al Viro Cc: Michael Ellerman Cc: David Rientjes Cc: Michel Lespinasse Cc: Daniel Axtens Cc: Thomas Gleixner Cc: Akash Goel Cc: Link: http://lkml.kernel.org/r/20200817122826.48518-1-weiyongjun1@huawei.com Signed-off-by: Linus Torvalds --- kernel/relay.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/relay.c b/kernel/relay.c index 72fe443ea78f..fb4e0c530c08 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -197,6 +197,7 @@ free_buf: static void relay_destroy_channel(struct kref *kref) { struct rchan *chan = container_of(kref, struct rchan, kref); + free_percpu(chan->buf); kfree(chan); } -- cgit v1.2.3 From c17c3dc9d08b9aad9a55a1e53f205187972f448e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 20 Aug 2020 17:42:17 -0700 Subject: uprobes: __replace_page() avoid BUG in munlock_vma_page() syzbot crashed on the VM_BUG_ON_PAGE(PageTail) in munlock_vma_page(), when called from uprobes __replace_page(). Which of many ways to fix it? Settled on not calling when PageCompound (since Head and Tail are equals in this context, PageCompound the usual check in uprobes.c, and the prior use of FOLL_SPLIT_PMD will have cleared PageMlocked already). Fixes: 5a52c9df62b4 ("uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT") Reported-by: syzbot Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Reviewed-by: Srikar Dronamraju Acked-by: Song Liu Acked-by: Oleg Nesterov Cc: "Kirill A. Shutemov" Cc: [5.4+] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008161338360.20413@eggly.anvils Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 649fd53dc9ad..0e18aaf23a7b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -205,7 +205,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - if (vma->vm_flags & VM_LOCKED) + if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) munlock_vma_page(old_page); put_page(old_page); -- cgit v1.2.3 From f26044c83e6e473a61917f5db411d1417327d425 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 20 Aug 2020 17:42:21 -0700 Subject: squashfs: avoid bio_alloc() failure with 1Mbyte blocks This is a regression introduced by the patch "migrate from ll_rw_block usage to BIO". Bio_alloc() is limited to 256 pages (1 Mbyte). This can cause a failure when reading 1 Mbyte block filesystems. The problem is a datablock can be fully (or almost uncompressed), requiring 256 pages, but, because blocks are not aligned to page boundaries, it may require 257 pages to read. Bio_kmalloc() can handle 1024 pages, and so use this for the edge condition. Fixes: 93e72b3c612a ("squashfs: migrate from ll_rw_block usage to BIO") Reported-by: Nicolas Prochazka Reported-by: Tomoatsu Shimada Signed-off-by: Phillip Lougher Signed-off-by: Andrew Morton Reviewed-by: Guenter Roeck Cc: Philippe Liard Cc: Christoph Hellwig Cc: Adrien Schildknecht Cc: Daniel Rosenberg Cc: Link: http://lkml.kernel.org/r/20200815035637.15319-1-phillip@squashfs.org.uk Signed-off-by: Linus Torvalds --- fs/squashfs/block.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 76bb1c846845..8a19773b5a0b 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -87,7 +87,11 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, int error, i; struct bio *bio; - bio = bio_alloc(GFP_NOIO, page_count); + if (page_count <= BIO_MAX_PAGES) + bio = bio_alloc(GFP_NOIO, page_count); + else + bio = bio_kmalloc(GFP_NOIO, page_count); + if (!bio) return -ENOMEM; -- cgit v1.2.3 From e08d3fdfe2dafa0331843f70ce1ff6c1c4900bf4 Mon Sep 17 00:00:00 2001 From: Doug Berger Date: Thu, 20 Aug 2020 17:42:24 -0700 Subject: mm: include CMA pages in lowmem_reserve at boot The lowmem_reserve arrays provide a means of applying pressure against allocations from lower zones that were targeted at higher zones. Its values are a function of the number of pages managed by higher zones and are assigned by a call to the setup_per_zone_lowmem_reserve() function. The function is initially called at boot time by the function init_per_zone_wmark_min() and may be called later by accesses of the /proc/sys/vm/lowmem_reserve_ratio sysctl file. The function init_per_zone_wmark_min() was moved up from a module_init to a core_initcall to resolve a sequencing issue with khugepaged. Unfortunately this created a sequencing issue with CMA page accounting. The CMA pages are added to the managed page count of a zone when cma_init_reserved_areas() is called at boot also as a core_initcall. This makes it uncertain whether the CMA pages will be added to the managed page counts of their zones before or after the call to init_per_zone_wmark_min() as it becomes dependent on link order. With the current link order the pages are added to the managed count after the lowmem_reserve arrays are initialized at boot. This means the lowmem_reserve values at boot may be lower than the values used later if /proc/sys/vm/lowmem_reserve_ratio is accessed even if the ratio values are unchanged. In many cases the difference is not significant, but for example an ARM platform with 1GB of memory and the following memory layout cma: Reserved 256 MiB at 0x0000000030000000 Zone ranges: DMA [mem 0x0000000000000000-0x000000002fffffff] Normal empty HighMem [mem 0x0000000030000000-0x000000003fffffff] would result in 0 lowmem_reserve for the DMA zone. This would allow userspace to deplete the DMA zone easily. Funnily enough $ cat /proc/sys/vm/lowmem_reserve_ratio would fix up the situation because as a side effect it forces setup_per_zone_lowmem_reserve. This commit breaks the link order dependency by invoking init_per_zone_wmark_min() as a postcore_initcall so that the CMA pages have the chance to be properly accounted in their zone(s) and allowing the lowmem_reserve arrays to receive consistent values. Fixes: bc22af74f271 ("mm: update min_free_kbytes from khugepaged after core initialization") Signed-off-by: Doug Berger Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Jason Baron Cc: David Rientjes Cc: "Kirill A. Shutemov" Cc: Link: http://lkml.kernel.org/r/1597423766-27849-1-git-send-email-opendmb@gmail.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0e2bab486fea..f4b3b91bbe7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7888,7 +7888,7 @@ int __meminit init_per_zone_wmark_min(void) return 0; } -core_initcall(init_per_zone_wmark_min) +postcore_initcall(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so -- cgit v1.2.3 From 88e8ac11d2ea3acc003cf01bb5a38c8aa76c3cfd Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Thu, 20 Aug 2020 17:42:27 -0700 Subject: mm, page_alloc: fix core hung in free_pcppages_bulk() The following race is observed with the repeated online, offline and a delay between two successive online of memory blocks of movable zone. P1 P2 Online the first memory block in the movable zone. The pcp struct values are initialized to default values,i.e., pcp->high = 0 & pcp->batch = 1. Allocate the pages from the movable zone. Try to Online the second memory block in the movable zone thus it entered the online_pages() but yet to call zone_pcp_update(). This process is entered into the exit path thus it tries to release the order-0 pages to pcp lists through free_unref_page_commit(). As pcp->high = 0, pcp->count = 1 proceed to call the function free_pcppages_bulk(). Update the pcp values thus the new pcp values are like, say, pcp->high = 378, pcp->batch = 63. Read the pcp's batch value using READ_ONCE() and pass the same to free_pcppages_bulk(), pcp values passed here are, batch = 63, count = 1. Since num of pages in the pcp lists are less than ->batch, then it will stuck in while(list_empty(list)) loop with interrupts disabled thus a core hung. Avoid this by ensuring free_pcppages_bulk() is called with proper count of pcp list pages. The mentioned race is some what easily reproducible without [1] because pcp's are not updated for the first memory block online and thus there is a enough race window for P2 between alloc+free and pcp struct values update through onlining of second memory block. With [1], the race still exists but it is very narrow as we update the pcp struct values for the first memory block online itself. This is not limited to the movable zone, it could also happen in cases with the normal zone (e.g., hotplug to a node that only has DMA memory, or no other memory yet). [1]: https://patchwork.kernel.org/patch/11696389/ Fixes: 5f8dcc21211a ("page-allocator: split per-cpu list into one-list-per-migrate-type") Signed-off-by: Charan Teja Reddy Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Michal Hocko Cc: Vlastimil Babka Cc: Vinayak Menon Cc: [2.6+] Link: http://lkml.kernel.org/r/1597150703-19003-1-git-send-email-charante@codeaurora.org Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f4b3b91bbe7f..fab5e97dc9ca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1302,6 +1302,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct page *page, *tmp; LIST_HEAD(head); + /* + * Ensure proper count is passed which otherwise would stuck in the + * below while (list_empty(list)) loop. + */ + count = min(pcp->count, count); while (count) { struct list_head *list; -- cgit v1.2.3