From 46b1ee38b2ba1a9524c8e886ad078bd3ca40de2a Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Sun, 1 Nov 2020 17:07:23 -0800 Subject: mm/mremap_pages: fix static key devmap_managed_key updates commit 6f42193fd86e ("memremap: don't use a separate devm action for devmap_managed_enable_get") changed the static key updates such that we now call devmap_managed_enable_put() without doing the equivalent devmap_managed_enable_get(). devmap_managed_enable_get() is only called for MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_FS_DAX, But memunmap_pages() get called for other pgmap types too. This results in the below warning when switching between system-ram and devdax mode for devdax namespace. jump label: negative count! WARNING: CPU: 52 PID: 1335 at kernel/jump_label.c:235 static_key_slow_try_dec+0x88/0xa0 Modules linked in: .... NIP static_key_slow_try_dec+0x88/0xa0 LR static_key_slow_try_dec+0x84/0xa0 Call Trace: static_key_slow_try_dec+0x84/0xa0 __static_key_slow_dec_cpuslocked+0x34/0xd0 static_key_slow_dec+0x54/0xf0 memunmap_pages+0x36c/0x500 devm_action_release+0x30/0x50 release_nodes+0x2f4/0x3e0 device_release_driver_internal+0x17c/0x280 bus_remove_device+0x124/0x210 device_del+0x1d4/0x530 unregister_dev_dax+0x48/0xe0 devm_action_release+0x30/0x50 release_nodes+0x2f4/0x3e0 device_release_driver_internal+0x17c/0x280 unbind_store+0x130/0x170 drv_attr_store+0x40/0x60 sysfs_kf_write+0x6c/0xb0 kernfs_fop_write+0x118/0x280 vfs_write+0xe8/0x2a0 ksys_write+0x84/0x140 system_call_exception+0x120/0x270 system_call_common+0xf0/0x27c Reported-by: Aneesh Kumar K.V Signed-off-by: Ralph Campbell Signed-off-by: Andrew Morton Tested-by: Sachin Sant Reviewed-by: Aneesh Kumar K.V Reviewed-by: Ira Weiny Reviewed-by: Christoph Hellwig Cc: Dan Williams Cc: Jason Gunthorpe Link: https://lkml.kernel.org/r/20201023183222.13186-1-rcampbell@nvidia.com Signed-off-by: Linus Torvalds --- mm/memremap.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/mm/memremap.c b/mm/memremap.c index 73a206d0f645..16b2fb482da1 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -41,28 +41,24 @@ EXPORT_SYMBOL_GPL(memremap_compat_align); DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); -static void devmap_managed_enable_put(void) +static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { - static_branch_dec(&devmap_managed_key); + if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_FS_DAX) + static_branch_dec(&devmap_managed_key); } -static int devmap_managed_enable_get(struct dev_pagemap *pgmap) +static void devmap_managed_enable_get(struct dev_pagemap *pgmap) { - if (pgmap->type == MEMORY_DEVICE_PRIVATE && - (!pgmap->ops || !pgmap->ops->page_free)) { - WARN(1, "Missing page_free method\n"); - return -EINVAL; - } - - static_branch_inc(&devmap_managed_key); - return 0; + if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_FS_DAX) + static_branch_inc(&devmap_managed_key); } #else -static int devmap_managed_enable_get(struct dev_pagemap *pgmap) +static void devmap_managed_enable_get(struct dev_pagemap *pgmap) { - return -EINVAL; } -static void devmap_managed_enable_put(void) +static void devmap_managed_enable_put(struct dev_pagemap *pgmap) { } #endif /* CONFIG_DEV_PAGEMAP_OPS */ @@ -169,7 +165,7 @@ void memunmap_pages(struct dev_pagemap *pgmap) pageunmap_range(pgmap, i); WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); - devmap_managed_enable_put(); + devmap_managed_enable_put(pgmap); } EXPORT_SYMBOL_GPL(memunmap_pages); @@ -307,7 +303,6 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) .pgprot = PAGE_KERNEL, }; const int nr_range = pgmap->nr_range; - bool need_devmap_managed = true; int error, i; if (WARN_ONCE(!nr_range, "nr_range must be specified\n")) @@ -323,6 +318,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "Missing migrate_to_ram method\n"); return ERR_PTR(-EINVAL); } + if (!pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return ERR_PTR(-EINVAL); + } if (!pgmap->owner) { WARN(1, "Missing owner\n"); return ERR_PTR(-EINVAL); @@ -336,11 +335,9 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } break; case MEMORY_DEVICE_GENERIC: - need_devmap_managed = false; break; case MEMORY_DEVICE_PCI_P2PDMA: params.pgprot = pgprot_noncached(params.pgprot); - need_devmap_managed = false; break; default: WARN(1, "Invalid pgmap type %d\n", pgmap->type); @@ -364,11 +361,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } } - if (need_devmap_managed) { - error = devmap_managed_enable_get(pgmap); - if (error) - return ERR_PTR(error); - } + devmap_managed_enable_get(pgmap); /* * Clear the pgmap nr_range as it will be incremented for each -- cgit v1.2.3 From 79aa925bf239c234be8586780e482872dc4690dd Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sun, 1 Nov 2020 17:07:27 -0800 Subject: hugetlb_cgroup: fix reservation accounting Michal Privoznik was using "free page reporting" in QEMU/virtio-balloon with hugetlbfs and hit the warning below. QEMU with free page hinting uses fallocate(FALLOC_FL_PUNCH_HOLE) to discard pages that are reported as free by a VM. The reporting granularity is in pageblock granularity. So when the guest reports 2M chunks, we fallocate(FALLOC_FL_PUNCH_HOLE) one huge page in QEMU. WARNING: CPU: 7 PID: 6636 at mm/page_counter.c:57 page_counter_uncharge+0x4b/0x50 Modules linked in: ... CPU: 7 PID: 6636 Comm: qemu-system-x86 Not tainted 5.9.0 #137 Hardware name: Gigabyte Technology Co., Ltd. X570 AORUS PRO/X570 AORUS PRO, BIOS F21 07/31/2020 RIP: 0010:page_counter_uncharge+0x4b/0x50 ... Call Trace: hugetlb_cgroup_uncharge_file_region+0x4b/0x80 region_del+0x1d3/0x300 hugetlb_unreserve_pages+0x39/0xb0 remove_inode_hugepages+0x1a8/0x3d0 hugetlbfs_fallocate+0x3c4/0x5c0 vfs_fallocate+0x146/0x290 __x64_sys_fallocate+0x3e/0x70 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Investigation of the issue uncovered bugs in hugetlb cgroup reservation accounting. This patch addresses the found issues. Fixes: 075a61d07a8e ("hugetlb_cgroup: add accounting for shared mappings") Reported-by: Michal Privoznik Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Tested-by: Michal Privoznik Reviewed-by: Mina Almasry Acked-by: Michael S. Tsirkin Cc: Cc: David Hildenbrand Cc: Michal Hocko Cc: Muchun Song Cc: "Aneesh Kumar K . V" Cc: Tejun Heo Link: https://lkml.kernel.org/r/20201021204426.36069-1-mike.kravetz@oracle.com Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fe76f8fd5a73..5a620f690911 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -648,6 +648,8 @@ retry: } del += t - f; + hugetlb_cgroup_uncharge_file_region( + resv, rg, t - f); /* New entry for end of split region */ nrg->from = t; @@ -660,9 +662,6 @@ retry: /* Original entry is trimmed */ rg->to = f; - hugetlb_cgroup_uncharge_file_region( - resv, rg, nrg->to - nrg->from); - list_add(&nrg->link, &rg->link); nrg = NULL; break; @@ -678,17 +677,17 @@ retry: } if (f <= rg->from) { /* Trim beginning of region */ - del += t - rg->from; - rg->from = t; - hugetlb_cgroup_uncharge_file_region(resv, rg, t - rg->from); - } else { /* Trim end of region */ - del += rg->to - f; - rg->to = f; + del += t - rg->from; + rg->from = t; + } else { /* Trim end of region */ hugetlb_cgroup_uncharge_file_region(resv, rg, rg->to - f); + + del += rg->to - f; + rg->to = f; } } @@ -2443,6 +2442,9 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); + if (deferred_reserve) + hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), + pages_per_huge_page(h), page); } return page; -- cgit v1.2.3 From 7de2e9f195b9cb27583c5c64deaaf5e6afcc163e Mon Sep 17 00:00:00 2001 From: zhongjiang-ali Date: Sun, 1 Nov 2020 17:07:30 -0800 Subject: mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg memcg_page_state will get the specified number in hierarchical memcg, It should multiply by HPAGE_PMD_NR rather than an page if the item is NR_ANON_THPS. [akpm@linux-foundation.org: fix printk warning] [akpm@linux-foundation.org: use u64 cast, per Michal] Fixes: 468c398233da ("mm: memcontrol: switch to native NR_ANON_THPS counter") Signed-off-by: zhongjiang-ali Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/1603722395-72443-1-git-send-email-zhongjiang-ali@linux.alibaba.com Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3a24e3b619f5..c3b6dc7d5c94 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4110,11 +4110,17 @@ static int memcg_stat_show(struct seq_file *m, void *v) (u64)memsw * PAGE_SIZE); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + unsigned long nr; + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; + nr = memcg_page_state(memcg, memcg1_stats[i]); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (memcg1_stats[i] == NR_ANON_THPS) + nr *= HPAGE_PMD_NR; +#endif seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)memcg_page_state(memcg, memcg1_stats[i]) * - PAGE_SIZE); + (u64)nr * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) -- cgit v1.2.3 From 8de15e920dc85d1705ab9c202c95d56845bc2d48 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Sun, 1 Nov 2020 17:07:34 -0800 Subject: mm: memcg: link page counters to root if use_hierarchy is false MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard reported a warning which can be reproduced by running the LTP madvise6 test (cgroup v1 in the non-hierarchical mode should be used): WARNING: CPU: 0 PID: 12 at mm/page_counter.c:57 page_counter_uncharge (mm/page_counter.c:57 mm/page_counter.c:50 mm/page_counter.c:156) Modules linked in: CPU: 0 PID: 12 Comm: kworker/0:1 Not tainted 5.9.0-rc7-22-default #77 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-48-gd9c812d-rebuilt.opensuse.org 04/01/2014 Workqueue: events drain_local_stock RIP: 0010:page_counter_uncharge (mm/page_counter.c:57 mm/page_counter.c:50 mm/page_counter.c:156) Call Trace: __memcg_kmem_uncharge (mm/memcontrol.c:3022) drain_obj_stock (./include/linux/rcupdate.h:689 mm/memcontrol.c:3114) drain_local_stock (mm/memcontrol.c:2255) process_one_work (./arch/x86/include/asm/jump_label.h:25 ./include/linux/jump_label.h:200 ./include/trace/events/workqueue.h:108 kernel/workqueue.c:2274) worker_thread (./include/linux/list.h:282 kernel/workqueue.c:2416) kthread (kernel/kthread.c:292) ret_from_fork (arch/x86/entry/entry_64.S:300) The problem occurs because in the non-hierarchical mode non-root page counters are not linked to root page counters, so the charge is not propagated to the root memory cgroup. After the removal of the original memory cgroup and reparenting of the object cgroup, the root cgroup might be uncharged by draining a objcg stock, for example. It leads to an eventual underflow of the charge and triggers a warning. Fix it by linking all page counters to corresponding root page counters in the non-hierarchical mode. Please note, that in the non-hierarchical mode all objcgs are always reparented to the root memory cgroup, even if the hierarchy has more than 1 level. This patch doesn't change it. The patch also doesn't affect how the hierarchical mode is working, which is the only sane and truly supported mode now. Thanks to Richard for reporting, debugging and providing an alternative version of the fix! Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API") Reported-by: Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Reviewed-by: Michal Koutný Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Link: https://lkml.kernel.org/r/20201026231326.3212225-1-guro@fb.com Debugged-by: Richard Palethorpe Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3b6dc7d5c94..3dcbf24d2227 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5345,17 +5345,22 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; } - if (parent && parent->use_hierarchy) { + if (!parent) { + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->tcpmem, NULL); + } else if (parent->use_hierarchy) { memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { - page_counter_init(&memcg->memory, NULL); - page_counter_init(&memcg->swap, NULL); - page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcpmem, NULL); + page_counter_init(&memcg->memory, &root_mem_cgroup->memory); + page_counter_init(&memcg->swap, &root_mem_cgroup->swap); + page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); + page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this -- cgit v1.2.3 From 58b999d7a22c59313e1e84832607c7a61640f4e7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Sun, 1 Nov 2020 17:07:37 -0800 Subject: kasan: adopt KUNIT tests to SW_TAGS mode Now that we have KASAN-KUNIT tests integration, it's easy to see that some KASAN tests are not adopted to the SW_TAGS mode and are failing. Adjust the allocation size for kasan_memchr() and kasan_memcmp() by roung it up to OOB_TAG_OFF so the bad access ends up in a separate memory granule. Add a new kmalloc_uaf_16() tests that relies on UAF, and a new kasan_bitops_tags() test that is tailored to tag-based mode, as it's hard to adopt the existing kmalloc_oob_16() and kasan_bitops_generic() (renamed from kasan_bitops()) without losing the precision. Add new kmalloc_uaf_16() and kasan_bitops_uaf() tests that rely on UAFs, as it's hard to adopt the existing kmalloc_oob_16() and kasan_bitops_oob() (rename from kasan_bitops()) without losing the precision. Disable kasan_global_oob() and kasan_alloca_oob_left/right() as SW_TAGS mode doesn't instrument globals nor dynamic allocas. Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Tested-by: David Gow Link: https://lkml.kernel.org/r/76eee17b6531ca8b3ca92b240cb2fd23204aaff7.1603129942.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 149 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 107 insertions(+), 42 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 63c26171a791..662f862702fc 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -216,6 +216,12 @@ static void kmalloc_oob_16(struct kunit *test) u64 words[2]; } *ptr1, *ptr2; + /* This test is specifically crafted for the generic mode. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { + kunit_info(test, "CONFIG_KASAN_GENERIC required\n"); + return; + } + ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); @@ -227,6 +233,23 @@ static void kmalloc_oob_16(struct kunit *test) kfree(ptr2); } +static void kmalloc_uaf_16(struct kunit *test) +{ + struct { + u64 words[2]; + } *ptr1, *ptr2; + + ptr1 = kmalloc(sizeof(*ptr1), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + kfree(ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2); + kfree(ptr1); +} + static void kmalloc_oob_memset_2(struct kunit *test) { char *ptr; @@ -429,6 +452,12 @@ static void kasan_global_oob(struct kunit *test) volatile int i = 3; char *p = &global_array[ARRAY_SIZE(global_array) + i]; + /* Only generic mode instruments globals. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { + kunit_info(test, "CONFIG_KASAN_GENERIC required"); + return; + } + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } @@ -467,6 +496,12 @@ static void kasan_alloca_oob_left(struct kunit *test) char alloca_array[i]; char *p = alloca_array - 1; + /* Only generic mode instruments dynamic allocas. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { + kunit_info(test, "CONFIG_KASAN_GENERIC required"); + return; + } + if (!IS_ENABLED(CONFIG_KASAN_STACK)) { kunit_info(test, "CONFIG_KASAN_STACK is not enabled"); return; @@ -481,6 +516,12 @@ static void kasan_alloca_oob_right(struct kunit *test) char alloca_array[i]; char *p = alloca_array + i; + /* Only generic mode instruments dynamic allocas. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { + kunit_info(test, "CONFIG_KASAN_GENERIC required"); + return; + } + if (!IS_ENABLED(CONFIG_KASAN_STACK)) { kunit_info(test, "CONFIG_KASAN_STACK is not enabled"); return; @@ -551,6 +592,9 @@ static void kasan_memchr(struct kunit *test) return; } + if (OOB_TAG_OFF) + size = round_up(size, OOB_TAG_OFF); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -573,6 +617,9 @@ static void kasan_memcmp(struct kunit *test) return; } + if (OOB_TAG_OFF) + size = round_up(size, OOB_TAG_OFF); + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset(arr, 0, sizeof(arr)); @@ -619,13 +666,50 @@ static void kasan_strings(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1)); } -static void kasan_bitops(struct kunit *test) +static void kasan_bitops_modify(struct kunit *test, int nr, void *addr) +{ + KUNIT_EXPECT_KASAN_FAIL(test, set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, clear_bit_unlock(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit_unlock(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, change_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __change_bit(nr, addr)); +} + +static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) +{ + KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); + +#if defined(clear_bit_unlock_is_negative_byte) + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = + clear_bit_unlock_is_negative_byte(nr, addr)); +#endif +} + +static void kasan_bitops_generic(struct kunit *test) { + long *bits; + + /* This test is specifically crafted for the generic mode. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { + kunit_info(test, "CONFIG_KASAN_GENERIC required\n"); + return; + } + /* * Allocate 1 more byte, which causes kzalloc to round up to 16-bytes; * this way we do not actually corrupt other memory. */ - long *bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL); + bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits); /* @@ -633,55 +717,34 @@ static void kasan_bitops(struct kunit *test) * below accesses are still out-of-bounds, since bitops are defined to * operate on the whole long the bit is in. */ - KUNIT_EXPECT_KASAN_FAIL(test, set_bit(BITS_PER_LONG, bits)); - - KUNIT_EXPECT_KASAN_FAIL(test, __set_bit(BITS_PER_LONG, bits)); - - KUNIT_EXPECT_KASAN_FAIL(test, clear_bit(BITS_PER_LONG, bits)); - - KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit(BITS_PER_LONG, bits)); - - KUNIT_EXPECT_KASAN_FAIL(test, clear_bit_unlock(BITS_PER_LONG, bits)); - - KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit_unlock(BITS_PER_LONG, bits)); - - KUNIT_EXPECT_KASAN_FAIL(test, change_bit(BITS_PER_LONG, bits)); - - KUNIT_EXPECT_KASAN_FAIL(test, __change_bit(BITS_PER_LONG, bits)); + kasan_bitops_modify(test, BITS_PER_LONG, bits); /* * Below calls try to access bit beyond allocated memory. */ - KUNIT_EXPECT_KASAN_FAIL(test, - test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); - - KUNIT_EXPECT_KASAN_FAIL(test, - __test_and_set_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); - - KUNIT_EXPECT_KASAN_FAIL(test, - test_and_set_bit_lock(BITS_PER_LONG + BITS_PER_BYTE, bits)); + kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, bits); - KUNIT_EXPECT_KASAN_FAIL(test, - test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); + kfree(bits); +} - KUNIT_EXPECT_KASAN_FAIL(test, - __test_and_clear_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); +static void kasan_bitops_tags(struct kunit *test) +{ + long *bits; - KUNIT_EXPECT_KASAN_FAIL(test, - test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); + /* This test is specifically crafted for the tag-based mode. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) { + kunit_info(test, "CONFIG_KASAN_SW_TAGS required\n"); + return; + } - KUNIT_EXPECT_KASAN_FAIL(test, - __test_and_change_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); + /* Allocation size will be rounded to up granule size, which is 16. */ + bits = kzalloc(sizeof(*bits), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits); - KUNIT_EXPECT_KASAN_FAIL(test, - kasan_int_result = - test_bit(BITS_PER_LONG + BITS_PER_BYTE, bits)); + /* Do the accesses past the 16 allocated bytes. */ + kasan_bitops_modify(test, BITS_PER_LONG, &bits[1]); + kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, &bits[1]); -#if defined(clear_bit_unlock_is_negative_byte) - KUNIT_EXPECT_KASAN_FAIL(test, - kasan_int_result = clear_bit_unlock_is_negative_byte( - BITS_PER_LONG + BITS_PER_BYTE, bits)); -#endif kfree(bits); } @@ -728,6 +791,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_oob_krealloc_more), KUNIT_CASE(kmalloc_oob_krealloc_less), KUNIT_CASE(kmalloc_oob_16), + KUNIT_CASE(kmalloc_uaf_16), KUNIT_CASE(kmalloc_oob_in_memset), KUNIT_CASE(kmalloc_oob_memset_2), KUNIT_CASE(kmalloc_oob_memset_4), @@ -751,7 +815,8 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_memchr), KUNIT_CASE(kasan_memcmp), KUNIT_CASE(kasan_strings), - KUNIT_CASE(kasan_bitops), + KUNIT_CASE(kasan_bitops_generic), + KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), KUNIT_CASE(vmalloc_oob), {} -- cgit v1.2.3 From 3f08842098e842c51e3b97d0dcdebf810b32558e Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Sun, 1 Nov 2020 17:07:40 -0800 Subject: mm: mempolicy: fix potential pte_unmap_unlock pte error When flags in queue_pages_pte_range don't have MPOL_MF_MOVE or MPOL_MF_MOVE_ALL bits, code breaks and passing origin pte - 1 to pte_unmap_unlock seems like not a good idea. queue_pages_pte_range can run in MPOL_MF_MOVE_ALL mode which doesn't migrate misplaced pages but returns with EIO when encountering such a page. Since commit a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified") and early break on the first pte in the range results in pte_unmap_unlock on an underflow pte. This can lead to lockups later on when somebody tries to lock the pte resp. page_table_lock again.. Fixes: a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified") Signed-off-by: Shijie Luo Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Miaohe Lin Cc: Feilong Lin Cc: Shijie Luo Cc: Link: https://lkml.kernel.org/r/20201019074853.50856-1-luoshijie1@huawei.com Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3fde772ef5ef..3ca4898f3f24 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -525,7 +525,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long flags = qp->flags; int ret; bool has_unmovable = false; - pte_t *pte; + pte_t *pte, *mapped_pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); @@ -539,7 +539,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; @@ -571,7 +571,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, } else break; } - pte_unmap_unlock(pte - 1, ptl); + pte_unmap_unlock(mapped_pte, ptl); cond_resched(); if (has_unmovable) -- cgit v1.2.3 From 7b3c36fc4c231ca532120bbc0df67a12f09c1d96 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 1 Nov 2020 17:07:44 -0800 Subject: ptrace: fix task_join_group_stop() for the case when current is traced This testcase #include #include #include #include #include #include #include void *tf(void *arg) { return NULL; } int main(void) { int pid = fork(); if (!pid) { kill(getpid(), SIGSTOP); pthread_t th; pthread_create(&th, NULL, tf, NULL); return 0; } waitpid(pid, NULL, WSTOPPED); ptrace(PTRACE_SEIZE, pid, 0, PTRACE_O_TRACECLONE); waitpid(pid, NULL, 0); ptrace(PTRACE_CONT, pid, 0,0); waitpid(pid, NULL, 0); int status; int thread = waitpid(-1, &status, 0); assert(thread > 0 && thread != pid); assert(status == 0x80137f); return 0; } fails and triggers WARN_ON_ONCE(!signr) in do_jobctl_trap(). This is because task_join_group_stop() has 2 problems when current is traced: 1. We can't rely on the "JOBCTL_STOP_PENDING" check, a stopped tracee can be woken up by debugger and it can clone another thread which should join the group-stop. We need to check group_stop_count || SIGNAL_STOP_STOPPED. 2. If SIGNAL_STOP_STOPPED is already set, we should not increment sig->group_stop_count and add JOBCTL_STOP_CONSUME. The new thread should stop without another do_notify_parent_cldstop() report. To clarify, the problem is very old and we should blame ptrace_init_task(). But now that we have task_join_group_stop() it makes more sense to fix this helper to avoid the code duplication. Reported-by: syzbot+3485e3773f7da290eecc@syzkaller.appspotmail.com Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Cc: Jens Axboe Cc: Christian Brauner Cc: "Eric W . Biederman" Cc: Zhiqiang Liu Cc: Tejun Heo Cc: Link: https://lkml.kernel.org/r/20201019134237.GA18810@redhat.com Signed-off-by: Linus Torvalds --- kernel/signal.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index a38b3edc6851..ef8f2a28d37c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -391,16 +391,17 @@ static bool task_participate_group_stop(struct task_struct *task) void task_join_group_stop(struct task_struct *task) { + unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; + struct signal_struct *sig = current->signal; + + if (sig->group_stop_count) { + sig->group_stop_count++; + mask |= JOBCTL_STOP_CONSUME; + } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) + return; + /* Have the new thread join an on-going signal group stop */ - unsigned long jobctl = current->jobctl; - if (jobctl & JOBCTL_STOP_PENDING) { - struct signal_struct *sig = current->signal; - unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; - unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; - if (task_set_jobctl_pending(task, signr | gstop)) { - sig->group_stop_count++; - } - } + task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } /* -- cgit v1.2.3 From aa4e460f0976351fddd2f5ac6e08b74320c277a1 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 1 Nov 2020 17:07:47 -0800 Subject: lib/crc32test: remove extra local_irq_disable/enable Commit 4d004099a668 ("lockdep: Fix lockdep recursion") uncovered the following issue in lib/crc32test reported on s390: BUG: using __this_cpu_read() in preemptible [00000000] code: swapper/0/1 caller is lockdep_hardirqs_on_prepare+0x48/0x270 CPU: 6 PID: 1 Comm: swapper/0 Not tainted 5.9.0-next-20201015-15164-g03d992bd2de6 #19 Hardware name: IBM 3906 M04 704 (LPAR) Call Trace: lockdep_hardirqs_on_prepare+0x48/0x270 trace_hardirqs_on+0x9c/0x1b8 crc32_test.isra.0+0x170/0x1c0 crc32test_init+0x1c/0x40 do_one_initcall+0x40/0x130 do_initcalls+0x126/0x150 kernel_init_freeable+0x1f6/0x230 kernel_init+0x22/0x150 ret_from_fork+0x24/0x2c no locks held by swapper/0/1. Remove extra local_irq_disable/local_irq_enable helpers calls. Fixes: 5fb7f87408f1 ("lib: add module support to crc32 tests") Signed-off-by: Vasily Gorbik Signed-off-by: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/patch.git-4369da00c06e.your-ad-here.call-01602859837-ext-1679@work.hours Signed-off-by: Linus Torvalds --- lib/crc32test.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/crc32test.c b/lib/crc32test.c index 97d6a57cefcc..61ddce2cff77 100644 --- a/lib/crc32test.c +++ b/lib/crc32test.c @@ -683,7 +683,6 @@ static int __init crc32c_test(void) /* reduce OS noise */ local_irq_save(flags); - local_irq_disable(); nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { @@ -694,7 +693,6 @@ static int __init crc32c_test(void) nsec = ktime_get_ns() - nsec; local_irq_restore(flags); - local_irq_enable(); pr_info("crc32c: CRC_LE_BITS = %d\n", CRC_LE_BITS); @@ -768,7 +766,6 @@ static int __init crc32_test(void) /* reduce OS noise */ local_irq_save(flags); - local_irq_disable(); nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { @@ -783,7 +780,6 @@ static int __init crc32_test(void) nsec = ktime_get_ns() - nsec; local_irq_restore(flags); - local_irq_enable(); pr_info("crc32: CRC_LE_BITS = %d, CRC_BE BITS = %d\n", CRC_LE_BITS, CRC_BE_BITS); -- cgit v1.2.3 From a77eedbc871ee3b435bffc30b123b60eecca402c Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Sun, 1 Nov 2020 17:07:50 -0800 Subject: mm/truncate.c: make __invalidate_mapping_pages() static Fix the following sparse warning: mm/truncate.c:531:15: warning: symbol '__invalidate_mapping_pages' was not declared. Should it be static? Fixes: eb1d7a65f08a ("mm, fadvise: improve the expensive remote LRU cache draining after FADV_DONTNEED") Signed-off-by: Jason Yan Signed-off-by: Andrew Morton Reviewed-by: Yafang Shao Link: https://lkml.kernel.org/r/20201015054808.2445904-1-yanaijie@huawei.com Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/truncate.c b/mm/truncate.c index 18cec39a9f53..960edf5803ca 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -528,7 +528,7 @@ void truncate_inode_pages_final(struct address_space *mapping) } EXPORT_SYMBOL(truncate_inode_pages_final); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, +static unsigned long __invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; -- cgit v1.2.3 From 6993d0fdbee0eb38bfac350aa016f65ad11ed3b1 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sun, 1 Nov 2020 17:07:53 -0800 Subject: kthread_worker: prevent queuing delayed work from timer_fn when it is being canceled There is a small race window when a delayed work is being canceled and the work still might be queued from the timer_fn: CPU0 CPU1 kthread_cancel_delayed_work_sync() __kthread_cancel_work_sync() __kthread_cancel_work() work->canceling++; kthread_delayed_work_timer_fn() kthread_insert_work(); BUG: kthread_insert_work() should not get called when work->canceling is set. Signed-off-by: Zqiang Signed-off-by: Andrew Morton Reviewed-by: Petr Mladek Acked-by: Tejun Heo Cc: Link: https://lkml.kernel.org/r/20201014083030.16895-1-qiang.zhang@windriver.com Signed-off-by: Linus Torvalds --- kernel/kthread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index e29773c82b70..933a625621b8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -897,7 +897,8 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); - kthread_insert_work(worker, work, &worker->work_list); + if (!work->canceling) + kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } -- cgit v1.2.3 From 66606567dedf395e0857f531976efad4cbbd39ea Mon Sep 17 00:00:00 2001 From: Charles Haithcock Date: Sun, 1 Nov 2020 17:07:56 -0800 Subject: mm, oom: keep oom_adj under or at upper limit when printing For oom_score_adj values in the range [942,999], the current calculations will print 16 for oom_adj. This patch simply limits the output so output is inline with docs. Signed-off-by: Charles Haithcock Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Alexey Dobriyan Link: https://lkml.kernel.org/r/20201020165130.33927-1-chaithco@redhat.com Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 0f707003dda5..b362523a9829 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1049,6 +1049,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX; put_task_struct(task); + if (oom_adj > OOM_ADJUST_MAX) + oom_adj = OOM_ADJUST_MAX; len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } -- cgit v1.2.3 From f8f6ae5d077a9bdaf5cbf2ac960a5d1a04b47482 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 1 Nov 2020 17:08:00 -0800 Subject: mm: always have io_remap_pfn_range() set pgprot_decrypted() The purpose of io_remap_pfn_range() is to map IO memory, such as a memory mapped IO exposed through a PCI BAR. IO devices do not understand encryption, so this memory must always be decrypted. Automatically call pgprot_decrypted() as part of the generic implementation. This fixes a bug where enabling AMD SME causes subsystems, such as RDMA, using io_remap_pfn_range() to expose BAR pages to user space to fail. The CPU will encrypt access to those BAR pages instead of passing unencrypted IO directly to the device. Places not mapping IO should use remap_pfn_range(). Fixes: aca20d546214 ("x86/mm: Add support to make use of Secure Memory Encryption") Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Tom Lendacky Cc: Thomas Gleixner Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Brijesh Singh Cc: Jonathan Corbet Cc: Dmitry Vyukov Cc: "Dave Young" Cc: Alexander Potapenko Cc: Konrad Rzeszutek Wilk Cc: Andy Lutomirski Cc: Larry Woodman Cc: Matt Fleming Cc: Ingo Molnar Cc: "Michael S. Tsirkin" Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Toshimitsu Kani Cc: Link: https://lkml.kernel.org/r/0-v1-025d64bdf6c4+e-amd_sme_fix_jgg@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ include/linux/pgtable.h | 4 ---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ef360fe70aaf..db6ae4d3fb4e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2759,6 +2759,15 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +#ifndef io_remap_pfn_range +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); +} +#endif + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 38c33eabea89..71125a4676c4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1427,10 +1427,6 @@ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ -#ifndef io_remap_pfn_range -#define io_remap_pfn_range remap_pfn_range -#endif - #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 -- cgit v1.2.3 From afabdf3338728c3aaa9f55d127e903dcd5f4acc7 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Sun, 1 Nov 2020 17:08:07 -0800 Subject: epoll: add a selftest for epoll timeout race Add a test case to ensure an event is observed by at least one poller when an epoll timeout is used. Signed-off-by: Guantao Liu Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Andrew Morton Reviewed-by: Eric Dumazet Reviewed-by: Khazhismel Kumykov Acked-by: Willem de Bruijn Cc: Al Viro Cc: Davidlohr Bueso Link: https://lkml.kernel.org/r/20201028180202.952079-2-soheil.kdev@gmail.com Signed-off-by: Linus Torvalds --- .../filesystems/epoll/epoll_wakeup_test.c | 95 ++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c index d979ff14775a..8f82f99f7748 100644 --- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c @@ -3282,4 +3282,99 @@ TEST(epoll60) close(ctx.epfd); } +struct epoll61_ctx { + int epfd; + int evfd; +}; + +static void *epoll61_write_eventfd(void *ctx_) +{ + struct epoll61_ctx *ctx = ctx_; + int64_t l = 1; + + usleep(10950); + write(ctx->evfd, &l, sizeof(l)); + return NULL; +} + +static void *epoll61_epoll_with_timeout(void *ctx_) +{ + struct epoll61_ctx *ctx = ctx_; + struct epoll_event events[1]; + int n; + + n = epoll_wait(ctx->epfd, events, 1, 11); + /* + * If epoll returned the eventfd, write on the eventfd to wake up the + * blocking poller. + */ + if (n == 1) { + int64_t l = 1; + + write(ctx->evfd, &l, sizeof(l)); + } + return NULL; +} + +static void *epoll61_blocking_epoll(void *ctx_) +{ + struct epoll61_ctx *ctx = ctx_; + struct epoll_event events[1]; + + epoll_wait(ctx->epfd, events, 1, -1); + return NULL; +} + +TEST(epoll61) +{ + struct epoll61_ctx ctx; + struct epoll_event ev; + int i, r; + + ctx.epfd = epoll_create1(0); + ASSERT_GE(ctx.epfd, 0); + ctx.evfd = eventfd(0, EFD_NONBLOCK); + ASSERT_GE(ctx.evfd, 0); + + ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLHUP; + ev.data.ptr = NULL; + r = epoll_ctl(ctx.epfd, EPOLL_CTL_ADD, ctx.evfd, &ev); + ASSERT_EQ(r, 0); + + /* + * We are testing a race. Repeat the test case 1000 times to make it + * more likely to fail in case of a bug. + */ + for (i = 0; i < 1000; i++) { + pthread_t threads[3]; + int n; + + /* + * Start 3 threads: + * Thread 1 sleeps for 10.9ms and writes to the evenfd. + * Thread 2 calls epoll with a timeout of 11ms. + * Thread 3 calls epoll with a timeout of -1. + * + * The eventfd write by Thread 1 should either wakeup Thread 2 + * or Thread 3. If it wakes up Thread 2, Thread 2 writes on the + * eventfd to wake up Thread 3. + * + * If no events are missed, all three threads should eventually + * be joinable. + */ + ASSERT_EQ(pthread_create(&threads[0], NULL, + epoll61_write_eventfd, &ctx), 0); + ASSERT_EQ(pthread_create(&threads[1], NULL, + epoll61_epoll_with_timeout, &ctx), 0); + ASSERT_EQ(pthread_create(&threads[2], NULL, + epoll61_blocking_epoll, &ctx), 0); + + for (n = 0; n < ARRAY_SIZE(threads); ++n) + ASSERT_EQ(pthread_join(threads[n], NULL), 0); + } + + close(ctx.epfd); + close(ctx.evfd); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From 3b70ae4f5c4e050bdebeeefe0c369524f37917cf Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sun, 1 Nov 2020 17:08:10 -0800 Subject: kernel/hung_task.c: make type annotations consistent Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") removed various __user annotations from function signatures as part of its refactoring. It also removed the __user annotation for proc_dohung_task_timeout_secs() at its declaration in sched/sysctl.h, but not at its definition in kernel/hung_task.c. Hence, sparse complains: kernel/hung_task.c:271:5: error: symbol 'proc_dohung_task_timeout_secs' redeclared with different type (incompatible argument 3 (different address spaces)) Adjust the annotation at the definition fitting to that refactoring to make sparse happy again, which also resolves this warning from sparse: kernel/hung_task.c:277:52: warning: incorrect type in argument 3 (different address spaces) kernel/hung_task.c:277:52: expected void * kernel/hung_task.c:277:52: got void [noderef] __user *buffer No functional change. No change in object code. Signed-off-by: Lukas Bulwahn Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Tetsuo Handa Cc: Al Viro Cc: Andrey Ignatov Link: https://lkml.kernel.org/r/20201028130541.20320-1-lukas.bulwahn@gmail.com Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ce76f490126c..396ebaebea3f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -225,8 +225,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; -- cgit v1.2.3