From c1a06df6ebf6ca98fb7a672fe447c7469d6c1968 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 6 Aug 2020 23:17:09 -0700 Subject: mm/migrate: fix migrate_pgmap_owner w/o CONFIG_MMU_NOTIFIER On x86_64, when CONFIG_MMU_NOTIFIER is not set/enabled, there is a compiler error: mm/migrate.c: In function 'migrate_vma_collect': mm/migrate.c:2481:7: error: 'struct mmu_notifier_range' has no member named 'migrate_pgmap_owner' range.migrate_pgmap_owner = migrate->pgmap_owner; ^ Fixes: 998427b3ad2c ("mm/notifier: add migration invalidation type") Reported-by: Randy Dunlap Signed-off-by: Ralph Campbell Signed-off-by: Andrew Morton Tested-by: Randy Dunlap Acked-by: Randy Dunlap Cc: Jerome Glisse Cc: John Hubbard Cc: Christoph Hellwig Cc: "Jason Gunthorpe" Link: http://lkml.kernel.org/r/20200806193353.7124-1-rcampbell@nvidia.com Signed-off-by: Linus Torvalds --- mm/migrate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 4fcc465736ff..d179657f8685 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2386,9 +2386,9 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * that the registered device driver can skip invalidating device * private page mappings that won't be migrated. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_MIGRATE, 0, migrate->vma, - migrate->vma->vm_mm, migrate->start, migrate->end); - range.migrate_pgmap_owner = migrate->pgmap_owner; + mmu_notifier_range_init_migrate(&range, 0, migrate->vma, + migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, -- cgit v1.2.3 From 4a93025cbe4a0b19d1a25a2d763a3d2018bad0d9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 Aug 2020 23:17:13 -0700 Subject: mm/shuffle: don't move pages between zones and don't read garbage memmaps Especially with memory hotplug, we can have offline sections (with a garbage memmap) and overlapping zones. We have to make sure to only touch initialized memmaps (online sections managed by the buddy) and that the zone matches, to not move pages between zones. To test if this can actually happen, I added a simple BUG_ON(page_zone(page_i) != page_zone(page_j)); right before the swap. When hotplugging a 256M DIMM to a 4G x86-64 VM and onlining the first memory block "online_movable" and the second memory block "online_kernel", it will trigger the BUG, as both zones (NORMAL and MOVABLE) overlap. This might result in all kinds of weird situations (e.g., double allocations, list corruptions, unmovable allocations ending up in the movable zone). 
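A minimal sketch of the validation pattern the fix below relies on (names taken from the diff that follows; illustrative, not the verbatim kernel code):

    struct page *page = pfn_to_online_page(pfn); /* NULL for offline sections and holes */
    if (!page)
            return NULL;    /* memmap may be garbage, don't touch it */
    if (page_zone(page) != zone)
            return NULL;    /* zones can overlap, never move pages across them */
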
Fixes: e900a918b098 ("mm: shuffle initial free memory to improve memory-side-cache utilization") Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Acked-by: Michal Hocko Acked-by: Dan Williams Cc: Andrew Morton Cc: Johannes Weiner Cc: Michal Hocko Cc: Minchan Kim Cc: Huang Ying Cc: Wei Yang Cc: Mel Gorman Cc: [5.2+] Link: http://lkml.kernel.org/r/20200624094741.9918-2-david@redhat.com Signed-off-by: Linus Torvalds --- mm/shuffle.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/shuffle.c b/mm/shuffle.c index 44406d9977c7..dd13ab851b3e 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -58,25 +58,25 @@ module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400); * For two pages to be swapped in the shuffle, they must be free (on a * 'free_area' lru), have the same order, and have the same migratetype. */ -static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order) +static struct page * __meminit shuffle_valid_page(struct zone *zone, + unsigned long pfn, int order) { - struct page *page; + struct page *page = pfn_to_online_page(pfn); /* * Given we're dealing with randomly selected pfns in a zone we * need to ask questions like... */ - /* ...is the pfn even in the memmap? */ - if (!pfn_valid_within(pfn)) + /* ... is the page managed by the buddy? */ + if (!page) return NULL; - /* ...is the pfn in a present section or a hole? */ - if (!pfn_in_present_section(pfn)) + /* ... is the page assigned to the same zone? */ + if (page_zone(page) != zone) return NULL; /* ...is the page free and currently on a free_area list? */ - page = pfn_to_page(pfn); if (!PageBuddy(page)) return NULL; @@ -123,7 +123,7 @@ void __meminit __shuffle_zone(struct zone *z) * page_j randomly selected in the span @zone_start_pfn to * @spanned_pages. */ - page_i = shuffle_valid_page(i, order); + page_i = shuffle_valid_page(z, i, order); if (!page_i) continue; @@ -137,7 +137,7 @@ void __meminit __shuffle_zone(struct zone *z) j = z->zone_start_pfn + ALIGN_DOWN(get_random_long() % z->spanned_pages, order_pages); - page_j = shuffle_valid_page(j, order); + page_j = shuffle_valid_page(z, j, order); if (page_j && page_j != page_i) break; } -- cgit v1.2.3 From 453431a54934d917153c65211b2dabf45562ca88 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 6 Aug 2020 23:18:13 -0700 Subject: mm, treewide: rename kzfree() to kfree_sensitive() As said by Linus: A symmetric naming is only helpful if it implies symmetries in use. Otherwise it's actively misleading. In "kzalloc()", the z is meaningful and an important part of what the caller wants. In "kzfree()", the z is actively detrimental, because maybe in the future we really _might_ want to use that "memfill(0xdeadbeef)" or something. The "zero" part of the interface isn't even _relevant_. The main reason that kzfree() exists is to clear sensitive information that should not be leaked to other future users of the same memory objects. Rename kzfree() to kfree_sensitive() to follow the example of the recently added kvfree_sensitive() and make the intention of the API more explicit. In addition, memzero_explicit() is used to clear the memory to make sure that it won't get optimized away by the compiler. The renaming is done by using the command sequence: git grep -w --name-only kzfree |\ xargs sed -i 's/kzfree/kfree_sensitive/' followed by some editing of the kfree_sensitive() kerneldoc and adding a kzfree backward compatibility macro in slab.h. 
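For reference, a hedged sketch of the backward-compatibility macro mentioned above, assuming it lives in slab.h as described (the macro itself is outside this 'mm'-limited diff):

    /* assumed shim so existing kzfree() callers keep compiling */
    #define kzfree(x)	kfree_sensitive(x)
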
[akpm@linux-foundation.org: fs/crypto/inline_crypt.c needs linux/slab.h] [akpm@linux-foundation.org: fix fs/crypto/inline_crypt.c some more] Suggested-by: Joe Perches Signed-off-by: Waiman Long Signed-off-by: Andrew Morton Acked-by: David Howells Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Jarkko Sakkinen Cc: James Morris Cc: "Serge E. Hallyn" Cc: Joe Perches Cc: Matthew Wilcox Cc: David Rientjes Cc: Dan Carpenter Cc: "Jason A . Donenfeld" Link: http://lkml.kernel.org/r/20200616154311.12314-3-longman@redhat.com Signed-off-by: Linus Torvalds --- mm/slab_common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index fe8b68482670..f47a097bb4b8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1729,17 +1729,17 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) EXPORT_SYMBOL(krealloc); /** - * kzfree - like kfree but zero memory + * kfree_sensitive - Clear sensitive information in memory before freeing * @p: object to free memory of * * The memory of the object @p points to is zeroed before freed. - * If @p is %NULL, kzfree() does nothing. + * If @p is %NULL, kfree_sensitive() does nothing. * * Note: this function zeroes the whole allocated buffer which can be a good * deal bigger than the requested buffer size passed to kmalloc(). So be * careful when using this function in performance sensitive code. */ -void kzfree(const void *p) +void kfree_sensitive(const void *p) { size_t ks; void *mem = (void *)p; @@ -1750,7 +1750,7 @@ void kzfree(const void *p) memzero_explicit(mem, ks); kfree(mem); } -EXPORT_SYMBOL(kzfree); +EXPORT_SYMBOL(kfree_sensitive); /** * ksize - get the actual amount of memory allocated for a given object -- cgit v1.2.3 From fa9ba3aa89f9f1c003b5f5cde893bbbc140c7223 Mon Sep 17 00:00:00 2001 From: William Kucharski Date: Thu, 6 Aug 2020 23:18:17 -0700 Subject: mm: ksize() should silently accept a NULL pointer Other mm routines such as kfree() and kzfree() silently do the right thing if passed a NULL pointer, so ksize() should do the same. Signed-off-by: William Kucharski Signed-off-by: Andrew Morton Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200616225409.4670-1-william.kucharski@oracle.com Signed-off-by: Linus Torvalds --- mm/slab_common.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index f47a097bb4b8..e493203b5002 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1681,10 +1681,9 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; - size_t ks = 0; + size_t ks; - if (p) - ks = ksize(p); + ks = ksize(p); if (ks >= new_size) { p = kasan_krealloc((void *)p, new_size, flags); @@ -1744,10 +1743,9 @@ void kfree_sensitive(const void *p) size_t ks; void *mem = (void *)p; - if (unlikely(ZERO_OR_NULL_PTR(mem))) - return; ks = ksize(mem); - memzero_explicit(mem, ks); + if (ks) + memzero_explicit(mem, ks); kfree(mem); } EXPORT_SYMBOL(kfree_sensitive); @@ -1770,8 +1768,6 @@ size_t ksize(const void *objp) { size_t size; - if (WARN_ON_ONCE(!objp)) - return 0; /* * We need to check that the pointed to object is valid, and only then * unpoison the shadow memory below. 
We use __kasan_check_read(), to @@ -1785,7 +1781,7 @@ size_t ksize(const void *objp) * We want to perform the check before __ksize(), to avoid potentially * crashing in __ksize() due to accessing invalid metadata. */ - if (unlikely(objp == ZERO_SIZE_PTR) || !__kasan_check_read(objp, 1)) + if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1)) return 0; size = __ksize(objp); -- cgit v1.2.3 From dabc3e291d56e3125113101bc6d53d1a1738294d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 6 Aug 2020 23:18:24 -0700 Subject: mm/slab: add naive detection of double free Similar to commit ce6fa91b9363 ("mm/slub.c: add a naive detection of double free or corruption"), add a very cheap double-free check for SLAB under CONFIG_SLAB_FREELIST_HARDENED. With this added, the "SLAB_FREE_DOUBLE" LKDTM test passes under SLAB: lkdtm: Performing direct entry SLAB_FREE_DOUBLE lkdtm: Attempting double slab free ... ------------[ cut here ]------------ WARNING: CPU: 2 PID: 2193 at mm/slab.c:757 ___cache_free+0x325/0x390 [keescook@chromium.org: fix misplaced __free_one()] Link: http://lkml.kernel.org/r/202006261306.0D82A2B@keescook Link: https://lore.kernel.org/lkml/7ff248c7-d447-340c-a8e2-8c02972aca70@infradead.org Signed-off-by: Kees Cook Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Randy Dunlap [build tested] Cc: Roman Gushchin Cc: Christoph Lameter Cc: Alexander Popov Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vinayak Menon Cc: Matthew Garrett Cc: Jann Horn Cc: Vijayanand Jitta Link: http://lkml.kernel.org/r/20200625215548.389774-3-keescook@chromium.org Signed-off-by: Linus Torvalds --- mm/slab.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 9350062ffc1a..77e90f9de9c0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -588,6 +588,16 @@ static int transfer_objects(struct array_cache *to, return nr; } +/* &alien->lock must be held by alien callers. */ +static __always_inline void __free_one(struct array_cache *ac, void *objp) +{ + /* Avoid trivial double-free. */ + if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp)) + return; + ac->entry[ac->avail++] = objp; +} + #ifndef CONFIG_NUMA #define drain_alien_cache(cachep, alien) do { } while (0) @@ -767,7 +777,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, ac, page_node, &list); } - ac->entry[ac->avail++] = objp; + __free_one(ac, objp); spin_unlock(&alien->lock); slabs_destroy(cachep, &list); } else { @@ -3466,7 +3476,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, } } - ac->entry[ac->avail++] = objp; + __free_one(ac, objp); } /** -- cgit v1.2.3 From 444050990db4abdf0c85ca6050542b888ba316a1 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 6 Aug 2020 23:18:28 -0700 Subject: mm, slab: check GFP_SLAB_BUG_MASK before alloc_pages in kmalloc_order kmalloc cannot allocate memory from HIGHMEM. Allocating large amounts of memory currently bypasses the check and will simply leak the memory when page_address() returns NULL. To fix this, factor the GFP_SLAB_BUG_MASK check out of slab & slub, and call it from kmalloc_order() as well. In order to make the code clear, the warning message is put in one place.
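A hedged illustration of the failure mode being fixed (hypothetical caller; any bit in GFP_SLAB_BUG_MASK, such as __GFP_HIGHMEM, triggers it):

    /* hypothetical: a large request with an invalid gfp bit skipped the check */
    void *p = kmalloc(1 << 20, GFP_KERNEL | __GFP_HIGHMEM);
    /* the pages are allocated, but page_address() can return NULL for
       HIGHMEM pages, so the caller sees NULL and the pages are never freed */
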
Signed-off-by: Long Li Signed-off-by: Andrew Morton Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Pekka Enberg Acked-by: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200704035027.GA62481@lilong Signed-off-by: Linus Torvalds --- mm/slab.c | 10 +++------- mm/slab.h | 1 + mm/slab_common.c | 17 +++++++++++++++++ mm/slub.c | 9 ++------- 4 files changed, 23 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 77e90f9de9c0..0781a2384441 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2589,13 +2589,9 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, * Be lazy and only check for valid flags here, keeping it out of the * critical path in kmem_cache_alloc(). */ - if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; - flags &= ~GFP_SLAB_BUG_MASK; - pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", - invalid_mask, &invalid_mask, flags, &flags); - dump_stack(); - } + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); diff --git a/mm/slab.h b/mm/slab.h index 74f7e09a7cfd..58ed025a0b23 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -152,6 +152,7 @@ void create_kmalloc_caches(slab_flags_t); struct kmem_cache *kmalloc_slab(size_t, gfp_t); #endif +gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); diff --git a/mm/slab_common.c b/mm/slab_common.c index e493203b5002..616ec8a0d91a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -26,6 +26,8 @@ #define CREATE_TRACE_POINTS #include +#include "internal.h" + #include "slab.h" enum slab_state slab_state; @@ -1332,6 +1334,18 @@ void __init create_kmalloc_caches(slab_flags_t flags) } #endif /* !CONFIG_SLOB */ +gfp_t kmalloc_fix_flags(gfp_t flags) +{ + gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; + + flags &= ~GFP_SLAB_BUG_MASK; + pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", + invalid_mask, &invalid_mask, flags, &flags); + dump_stack(); + + return flags; +} + /* * To avoid unnecessary overhead, we pass through large allocation requests * directly to the page allocator. We use __GFP_COMP, because we will need to @@ -1342,6 +1356,9 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) void *ret = NULL; struct page *page; + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + flags |= __GFP_COMP; page = alloc_pages(flags, order); if (likely(page)) { diff --git a/mm/slub.c b/mm/slub.c index f226d66408ee..7f0764b6a3df 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1745,13 +1745,8 @@ out: static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { - if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; - flags &= ~GFP_SLAB_BUG_MASK; - pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). 
Fix your code!\n", - invalid_mask, &invalid_mask, flags, &flags); - dump_stack(); - } + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); return allocate_slab(s, flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); -- cgit v1.2.3 From 221503e1281f8a7d67eacbb6dc99db95acdf806c Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 6 Aug 2020 23:18:31 -0700 Subject: mm/slab.c: update outdated kmem_list3 in a comment kmem_list3 has been renamed to kmem_cache_node long long ago so update it. References: 6744f087ba2a ("slab: Common name for the per node structures") ce8eb6c424c7 ("slab: Rename list3/l3 to node") Signed-off-by: Xiao Yang Signed-off-by: Andrew Morton Reviewed-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200722033355.26908-1-yangx.jy@cn.fujitsu.com Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 0781a2384441..390e249aaa8d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1060,7 +1060,7 @@ int slab_prepare_cpu(unsigned int cpu) * offline. * * Even if all the cpus of a node are down, we don't free the - * kmem_list3 of any cache. This to avoid a race between cpu_down, and + * kmem_cache_node of any cache. This to avoid a race between cpu_down, and * a kmalloc allocation from another cpu for memory from the node of * the cpu going down. The list3 structure is usually allocated from * kmem_cache_create() and gets destroyed at kmem_cache_destroy(). -- cgit v1.2.3 From e17f1dfba37b84b574ae91e809ae3804fe5b29b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Aug 2020 23:18:35 -0700 Subject: mm, slub: extend slub_debug syntax for multiple blocks Patch series "slub_debug fixes and improvements". The slub_debug kernel boot parameter can either apply a single set of options to all caches or a list of caches. There is a use case where debugging is applied for all caches and then disabled at runtime for specific caches, for performance and memory consumption reasons [1]. As runtime changes are dangerous, extend the boot parameter syntax so that multiple blocks of either global or slab-specific options can be specified, with blocks delimited by ';'. This will also support the use case of [1] without runtime changes. 
For details see the updated Documentation/vm/slub.rst [1] https://lore.kernel.org/r/1383cd32-1ddc-4dac-b5f8-9c42282fa81c@codeaurora.org [weiyongjun1@huawei.com: make parse_slub_debug_flags() static] Link: http://lkml.kernel.org/r/20200702150522.4940-1-weiyongjun1@huawei.com Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Jann Horn Cc: Roman Gushchin Cc: Vijayanand Jitta Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200610163135.17364-2-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/slub.c | 177 ++++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 126 insertions(+), 51 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7f0764b6a3df..829985d7c7c5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -499,7 +499,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; static slab_flags_t slub_debug; #endif -static char *slub_debug_slabs; +static char *slub_debug_string; static int disable_higher_order_debug; /* @@ -1262,68 +1262,132 @@ out: return ret; } -static int __init setup_slub_debug(char *str) +/* + * Parse a block of slub_debug options. Blocks are delimited by ';' + * + * @str: start of block + * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified + * @slabs: return start of list of slabs, or NULL when there's no list + * @init: assume this is initial parsing and not per-kmem-create parsing + * + * returns the start of next block if there's any, or NULL + */ +static char * +parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) { - slub_debug = DEBUG_DEFAULT_FLAGS; - if (*str++ != '=' || !*str) - /* - * No options specified. Switch on full debugging. - */ - goto out; + bool higher_order_disable = false; - if (*str == ',') + /* Skip any completely empty blocks */ + while (*str && *str == ';') + str++; + + if (*str == ',') { /* * No options but restriction on slabs. This means full * debugging for slabs matching a pattern. */ + *flags = DEBUG_DEFAULT_FLAGS; goto check_slabs; + } + *flags = 0; - slub_debug = 0; - if (*str == '-') - /* - * Switch off all debugging measures. - */ - goto out; - - /* - * Determine which debug features should be switched on - */ - for (; *str && *str != ','; str++) { + /* Determine which debug features should be switched on */ + for (; *str && *str != ',' && *str != ';'; str++) { switch (tolower(*str)) { + case '-': + *flags = 0; + break; case 'f': - slub_debug |= SLAB_CONSISTENCY_CHECKS; + *flags |= SLAB_CONSISTENCY_CHECKS; break; case 'z': - slub_debug |= SLAB_RED_ZONE; + *flags |= SLAB_RED_ZONE; break; case 'p': - slub_debug |= SLAB_POISON; + *flags |= SLAB_POISON; break; case 'u': - slub_debug |= SLAB_STORE_USER; + *flags |= SLAB_STORE_USER; break; case 't': - slub_debug |= SLAB_TRACE; + *flags |= SLAB_TRACE; break; case 'a': - slub_debug |= SLAB_FAILSLAB; + *flags |= SLAB_FAILSLAB; break; case 'o': /* * Avoid enabling debugging on caches if its minimum * order would increase as a result. */ - disable_higher_order_debug = 1; + higher_order_disable = true; break; default: - pr_err("slub_debug option '%c' unknown. skipped\n", - *str); + if (init) + pr_err("slub_debug option '%c' unknown. 
skipped\n", *str); } } - check_slabs: if (*str == ',') - slub_debug_slabs = str + 1; + *slabs = ++str; + else + *slabs = NULL; + + /* Skip over the slab list */ + while (*str && *str != ';') + str++; + + /* Skip any completely empty blocks */ + while (*str && *str == ';') + str++; + + if (init && higher_order_disable) + disable_higher_order_debug = 1; + + if (*str) + return str; + else + return NULL; +} + +static int __init setup_slub_debug(char *str) +{ + slab_flags_t flags; + char *saved_str; + char *slab_list; + bool global_slub_debug_changed = false; + bool slab_list_specified = false; + + slub_debug = DEBUG_DEFAULT_FLAGS; + if (*str++ != '=' || !*str) + /* + * No options specified. Switch on full debugging. + */ + goto out; + + saved_str = str; + while (str) { + str = parse_slub_debug_flags(str, &flags, &slab_list, true); + + if (!slab_list) { + slub_debug = flags; + global_slub_debug_changed = true; + } else { + slab_list_specified = true; + } + } + + /* + * For backwards compatibility, a single list of flags with list of + * slabs means debugging is only enabled for those slabs, so the global + * slub_debug should be 0. We can extended that to multiple lists as + * long as there is no option specifying flags without a slab list. + */ + if (slab_list_specified) { + if (!global_slub_debug_changed) + slub_debug = 0; + slub_debug_string = saved_str; + } out: if ((static_branch_unlikely(&init_on_alloc) || static_branch_unlikely(&init_on_free)) && @@ -1352,36 +1416,47 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, { char *iter; size_t len; + char *next_block; + slab_flags_t block_flags; /* If slub_debug = 0, it folds into the if conditional. */ - if (!slub_debug_slabs) + if (!slub_debug_string) return flags | slub_debug; len = strlen(name); - iter = slub_debug_slabs; - while (*iter) { - char *end, *glob; - size_t cmplen; - - end = strchrnul(iter, ','); + next_block = slub_debug_string; + /* Go through all blocks of debug options, see if any matches our slab's name */ + while (next_block) { + next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false); + if (!iter) + continue; + /* Found a block that has a slab list, search it */ + while (*iter) { + char *end, *glob; + size_t cmplen; + + end = strchrnul(iter, ','); + if (next_block && next_block < end) + end = next_block - 1; + + glob = strnchr(iter, end - iter, '*'); + if (glob) + cmplen = glob - iter; + else + cmplen = max_t(size_t, len, (end - iter)); - glob = strnchr(iter, end - iter, '*'); - if (glob) - cmplen = glob - iter; - else - cmplen = max_t(size_t, len, (end - iter)); + if (!strncmp(name, iter, cmplen)) { + flags |= block_flags; + return flags; + } - if (!strncmp(name, iter, cmplen)) { - flags |= slub_debug; - break; + if (!*end || *end == ';') + break; + iter = end + 1; } - - if (!*end) - break; - iter = end + 1; } - return flags; + return slub_debug; } #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, -- cgit v1.2.3 From ad38b5b1131e2a0e5c46724847da2e1eba31fb68 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Aug 2020 23:18:38 -0700 Subject: mm, slub: make some slub_debug related attributes read-only SLUB_DEBUG creates several files under /sys/kernel/slab// that can be read to check if the respective debugging options are enabled for given cache. The options can be also toggled at runtime by writing into the files. Some of those, namely red_zone, poison, and store_user can be toggled only when no objects yet exist in the cache. 
Vijayanand reports [1] that there is a problem with freelist randomization if changing the debugging option's state results in a different number of objects per page, and the random sequence cache thus needs to be recomputed. However, another problem is that the check for "no objects yet exist in the cache" is racy, as noted by Jann [2] and fixing that would add overhead or otherwise complicate the allocation/freeing paths. Thus it would be much simpler just to remove the runtime toggling support. The documentation describes it as "In case you forgot to enable debugging on the kernel command line", but the necessity of having no objects limits its usefulness anyway for many caches. Vijayanand describes a use case [3] where debugging is enabled for all but zram caches for memory overhead reasons, and using the runtime toggles was the only way to achieve such a configuration. After the previous patch it's now possible to do that directly from the kernel boot option, so we can remove the dangerous runtime toggles by making the /sys attribute files read-only. While updating it, also improve the documentation of the debugging /sys files. [1] https://lkml.kernel.org/r/1580379523-32272-1-git-send-email-vjitta@codeaurora.org [2] https://lore.kernel.org/r/CAG48ez31PP--h6_FzVyfJ4H86QYczAFPdxtJHUEEan+7VJETAQ@mail.gmail.com [3] https://lore.kernel.org/r/1383cd32-1ddc-4dac-b5f8-9c42282fa81c@codeaurora.org Reported-by: Vijayanand Jitta Reported-by: Jann Horn Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Link: http://lkml.kernel.org/r/20200610163135.17364-3-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/slub.c | 46 +++------------------------------------------- 1 file changed, 3 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 829985d7c7c5..fd0b196fdaaf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5335,61 +5335,21 @@ static ssize_t red_zone_show(struct kmem_cache *s, char *buf) return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); } -static ssize_t red_zone_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - if (any_slab_objects(s)) - return -EBUSY; - - s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == '1') { - s->flags |= SLAB_RED_ZONE; - } - calculate_sizes(s, -1); - return length; -} -SLAB_ATTR(red_zone); +SLAB_ATTR_RO(red_zone); static ssize_t poison_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); } -static ssize_t poison_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - if (any_slab_objects(s)) - return -EBUSY; - - s->flags &= ~SLAB_POISON; - if (buf[0] == '1') { - s->flags |= SLAB_POISON; - } - calculate_sizes(s, -1); - return length; -} -SLAB_ATTR(poison); +SLAB_ATTR_RO(poison); static ssize_t store_user_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); } -static ssize_t store_user_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - if (any_slab_objects(s)) - return -EBUSY; - - s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_STORE_USER; - } - calculate_sizes(s, -1); - return length; -} -SLAB_ATTR(store_user); +SLAB_ATTR_RO(store_user); static ssize_t validate_show(struct kmem_cache *s, char *buf) { -- cgit v1.2.3 From 32a6f409b6935d175464f26d31250ea13fda6b66 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 
Aug 2020 23:18:41 -0700 Subject: mm, slub: remove runtime allocation order changes SLUB allows runtime changing of page allocation order by writing into the /sys/kernel/slab/<cache>/order file. Jann has reported [1] that this interface allows the order to be set too small, leading to crashes. While it's possible to fix the immediate issue, closer inspection reveals potential races. Storing the new order calls calculate_sizes() which non-atomically updates a lot of kmem_cache fields while the cache is still in use. Unexpected behavior might occur even if the fields are set to the same value as they were. This could be fixed by splitting out the part of calculate_sizes() that depends on forced_order, so that we only update the kmem_cache.oo field. This could still race with init_cache_random_seq(), shuffle_freelist(), allocate_slab(). Perhaps it's possible to audit and e.g. add some READ_ONCE/WRITE_ONCE accesses, but it might be easier just to remove the runtime order changes, which is what this patch does. If there are valid use cases for per-cache order setting, we could e.g. extend the boot parameters to do that. [1] https://lore.kernel.org/r/CAG48ez31PP--h6_FzVyfJ4H86QYczAFPdxtJHUEEan+7VJETAQ@mail.gmail.com Reported-by: Jann Horn Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Acked-by: Christoph Lameter Acked-by: Roman Gushchin Cc: Vijayanand Jitta Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Link: http://lkml.kernel.org/r/20200610163135.17364-4-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/slub.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index fd0b196fdaaf..f03b1073bdcf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5095,28 +5095,11 @@ static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objs_per_slab); -static ssize_t order_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - unsigned int order; - int err; - - err = kstrtouint(buf, 10, &order); - if (err) - return err; - - if (order > slub_max_order || order < slub_min_order) - return -EINVAL; - - calculate_sizes(s, order); - return length; -} - static ssize_t order_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%u\n", oo_order(s->oo)); } -SLAB_ATTR(order); +SLAB_ATTR_RO(order); static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { -- cgit v1.2.3 From 060807f841ac94d3826ce6fa3b4f3831cd0c015b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Aug 2020 23:18:45 -0700 Subject: mm, slub: make remaining slub_debug related attributes read-only SLUB_DEBUG creates several files under /sys/kernel/slab/<cache>/ that can be read to check if the respective debugging options are enabled for a given cache. Some options, namely sanity_checks, trace, and failslab, can also be enabled and disabled at runtime by writing into the files. The runtime toggling is racy. Some options disable __CMPXCHG_DOUBLE when enabled, which means that in case of concurrent allocations, some can still use __CMPXCHG_DOUBLE and some not, leading to potential corruption. The s->flags field is also not updated or checked atomically. The simplest solution is to remove the runtime toggling. The extended slub_debug boot parameter syntax introduced by the earlier patch should allow fine-tuning the debugging configuration during boot with the same granularity.
Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: Jann Horn Cc: Vijayanand Jitta Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Link: http://lkml.kernel.org/r/20200610163135.17364-5-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/slub.c | 62 +++----------------------------------------------------------- 1 file changed, 3 insertions(+), 59 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index f03b1073bdcf..5848b7c3575a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5040,20 +5040,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, return x + sprintf(buf + x, "\n"); } -#ifdef CONFIG_SLUB_DEBUG -static int any_slab_objects(struct kmem_cache *s) -{ - int node; - struct kmem_cache_node *n; - - for_each_kmem_cache_node(s, node, n) - if (atomic_long_read(&n->total_objects)) - return 1; - - return 0; -} -#endif - #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) #define to_slab(n) container_of(n, struct kmem_cache, kobj) @@ -5275,43 +5261,13 @@ static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } - -static ssize_t sanity_checks_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - s->flags &= ~SLAB_CONSISTENCY_CHECKS; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_CONSISTENCY_CHECKS; - } - return length; -} -SLAB_ATTR(sanity_checks); +SLAB_ATTR_RO(sanity_checks); static ssize_t trace_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); } - -static ssize_t trace_store(struct kmem_cache *s, const char *buf, - size_t length) -{ - /* - * Tracing a merged cache is going to give confusing results - * as well as cause other issues like converting a mergeable - * cache into an umergeable one. - */ - if (s->refcount > 1) - return -EINVAL; - - s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_TRACE; - } - return length; -} -SLAB_ATTR(trace); +SLAB_ATTR_RO(trace); static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { @@ -5375,19 +5331,7 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } - -static ssize_t failslab_store(struct kmem_cache *s, const char *buf, - size_t length) -{ - if (s->refcount > 1) - return -EINVAL; - - s->flags &= ~SLAB_FAILSLAB; - if (buf[0] == '1') - s->flags |= SLAB_FAILSLAB; - return length; -} -SLAB_ATTR(failslab); +SLAB_ATTR_RO(failslab); #endif static ssize_t shrink_show(struct kmem_cache *s, char *buf) -- cgit v1.2.3 From 8f58119ac49c03c5a0879353661c586b0436c250 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Aug 2020 23:18:48 -0700 Subject: mm, slub: make reclaim_account attribute read-only The attribute reflects the SLAB_RECLAIM_ACCOUNT cache flag. It's not clear why this attribute was writable in the first place, as it's tied to how the cache is used by its creator; it's not a user tunable. Furthermore: - it affects slab merging, but that's not being checked while toggled - it affects whether the __GFP_RECLAIMABLE flag is used to allocate pages, but the runtime toggle doesn't update allocflags - it affects cache_vmstat_idx() so runtime toggling might lead to inconsistency of NR_SLAB_RECLAIMABLE and NR_SLAB_UNRECLAIMABLE Thus make it read-only.
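Schematically, these conversions drop the _store() handler and register the attribute read-only; a hedged sketch of what SLAB_ATTR_RO is assumed to expand to (the real macro lives in mm/slub.c and its mode bits may differ):

    /* sketch: sysfs attribute with a show() method only, no store() */
    #define SLAB_ATTR_RO(_name) \
            static struct slab_attribute _name##_attr = \
                    __ATTR(_name, 0400, _name##_show, NULL)
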
Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: Jann Horn Cc: Vijayanand Jitta Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Link: http://lkml.kernel.org/r/20200610163135.17364-6-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/slub.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5848b7c3575a..617cf1fff128 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5207,16 +5207,7 @@ static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); } - -static ssize_t reclaim_account_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - s->flags &= ~SLAB_RECLAIM_ACCOUNT; - if (buf[0] == '1') - s->flags |= SLAB_RECLAIM_ACCOUNT; - return length; -} -SLAB_ATTR(reclaim_account); +SLAB_ATTR_RO(reclaim_account); static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) { -- cgit v1.2.3 From ca0cab65ea2b8c1527dc48c8dfd38ae055f5f241 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Aug 2020 23:18:51 -0700 Subject: mm, slub: introduce static key for slub_debug() One advantage of CONFIG_SLUB_DEBUG is that a generic distro kernel can be built with the option enabled, but it's inactive until simply enabled on boot, without rebuilding the kernel. With a static key, we can further eliminate the overhead of checking whether a cache has a particular debug flag enabled if we know that there are no such caches (slub_debug was not enabled during boot). We use the same mechanism also for e.g. page_owner, debug_pagealloc or kmemcg functionality. This patch introduces the static key and makes the general check for per-cache debug flags kmem_cache_debug() use it. This benefits several call sites, including (slow path but still rather frequent) __slab_free(). The next patches will add more uses. Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Acked-by: Roman Gushchin Acked-by: Christoph Lameter Cc: Jann Horn Cc: Vijayanand Jitta Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Link: http://lkml.kernel.org/r/20200610163135.17364-7-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/slub.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 617cf1fff128..8adab4c5296d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -114,13 +114,21 @@ * the fast path and disables lockless freelists. 
*/ +#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SLUB_DEBUG_ON +DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); +#else +DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); +#endif +#endif + static inline int kmem_cache_debug(struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG - return unlikely(s->flags & SLAB_DEBUG_FLAGS); -#else - return 0; + if (static_branch_unlikely(&slub_debug_enabled)) + return s->flags & SLAB_DEBUG_FLAGS; #endif + return 0; } void *fixup_red_left(struct kmem_cache *s, void *p) @@ -1389,6 +1397,8 @@ static int __init setup_slub_debug(char *str) slub_debug_string = saved_str; } out: + if (slub_debug != 0 || slub_debug_string) + static_branch_enable(&slub_debug_enabled); if ((static_branch_unlikely(&init_on_alloc) || static_branch_unlikely(&init_on_free)) && (slub_debug & SLAB_POISON)) -- cgit v1.2.3 From 59052e89fc89e3e6bef0151052e093566e446851 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Aug 2020 23:18:55 -0700 Subject: mm, slub: introduce kmem_cache_debug_flags() There are few places that call kmem_cache_debug(s) (which tests if any of debug flags are enabled for a cache) immediately followed by a test for a specific flag. The compiler can probably eliminate the extra check, but we can make the code nicer by introducing kmem_cache_debug_flags() that works like kmem_cache_debug() (including the static key check) but tests for specific flag(s). The next patches will add more users. [vbabka@suse.cz: change return from int to bool, per Kees. Add VM_WARN_ON_ONCE() for invalid flags, per Roman] Link: http://lkml.kernel.org/r/949b90ed-e0f0-07d7-4d21-e30ec0958a7c@suse.cz Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Acked-by: Roman Gushchin Acked-by: Christoph Lameter Acked-by: Kees Cook Cc: Jann Horn Cc: Vijayanand Jitta Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Link: http://lkml.kernel.org/r/20200610163135.17364-8-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/slub.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 8adab4c5296d..97074631a2d1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -122,18 +122,29 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif -static inline int kmem_cache_debug(struct kmem_cache *s) +/* + * Returns true if any of the specified slub_debug flags is enabled for the + * cache. Use only for flags parsed by setup_slub_debug() as it also enables + * the static key. + */ +static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) { + VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); #ifdef CONFIG_SLUB_DEBUG if (static_branch_unlikely(&slub_debug_enabled)) - return s->flags & SLAB_DEBUG_FLAGS; + return s->flags & flags; #endif - return 0; + return false; +} + +static inline bool kmem_cache_debug(struct kmem_cache *s) +{ + return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } void *fixup_red_left(struct kmem_cache *s, void *p) { - if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) p += s->red_left_pad; return p; @@ -4060,7 +4071,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, offset = (ptr - page_address(page)) % s->size; /* Adjust for redzone and reject if within the redzone. 
*/ - if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) { + if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { if (offset < s->red_left_pad) usercopy_abort("SLUB object in left red zone", s->name, to_user, offset, n); -- cgit v1.2.3 From 8fc8d6664247a6e65cba000789e2e85e2288f6f7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Aug 2020 23:18:58 -0700 Subject: mm, slub: extend checks guarded by slub_debug static key There are few more places in SLUB that could benefit from reduced overhead of the static key introduced by a previous patch: - setup_object_debug() called on each object in newly allocated slab page - setup_page_debug() called on newly allocated slab page - __free_slab() called on freed slab page Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Acked-by: Roman Gushchin Acked-by: Christoph Lameter Cc: Jann Horn Cc: Kees Cook Cc: Vijayanand Jitta Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Link: http://lkml.kernel.org/r/20200610163135.17364-9-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 97074631a2d1..0b80a8409836 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1131,7 +1131,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) static void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) { - if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) + if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)) return; init_object(s, object, SLUB_RED_INACTIVE); @@ -1141,7 +1141,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, static void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) { - if (!(s->flags & SLAB_POISON)) + if (!kmem_cache_debug_flags(s, SLAB_POISON)) return; metadata_access_enable(); @@ -1853,7 +1853,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { void *p; slab_pad_check(s, page); -- cgit v1.2.3 From d3c58f24be1bf10fa9e11977080a2398ddcd8361 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Aug 2020 23:19:01 -0700 Subject: mm, slab/slub: move and improve cache_from_obj() The function cache_from_obj() was added by commit b9ce5ef49f00 ("sl[au]b: always get the cache from its page in kmem_cache_free()") to support kmemcg, where per-memcg cache can be different from the root one, so we can't use the kmem_cache pointer given to kmem_cache_free(). Prior to that commit, SLUB already had debugging check+warning that could be enabled to compare the given kmem_cache pointer to one referenced by the slab page where the object-to-be-freed resides. This check was moved to cache_from_obj(). Later the check was also enabled for SLAB_FREELIST_HARDENED configs by commit 598a0717a816 ("mm/slab: validate cache membership under freelist hardening"). These checks and warnings can be useful especially for the debugging, which can be improved. Commit 598a0717a816 changed the pr_err() with WARN_ON_ONCE() to WARN_ONCE() so only the first hit is now reported, others are silent. This patch changes it to WARN() so that all errors are reported. It's also useful to print SLUB allocation/free tracking info for the offending object, if tracking is enabled. 
We could export the SLUB print_tracking() function and provide an empty one for SLAB, or realize that both the debugging and hardening cases in cache_from_obj() are only supported by SLUB anyway. So this patch moves cache_from_obj() from slab.h to separate instances in slab.c and slub.c, where the SLAB version only does the kmemcg lookup and even could be completely removed once the kmemcg rework [1] is merged. The SLUB version can thus easily use the print_tracking() function. It can also use the kmem_cache_debug_flags() static key check for improved performance in kernels without the hardening and with debugging not enabled on boot. [1] https://lore.kernel.org/r/20200608230654.828134-18-guro@fb.com Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Cc: Christoph Lameter Cc: Jann Horn Cc: Kees Cook Cc: Vijayanand Jitta Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Link: http://lkml.kernel.org/r/20200610163135.17364-10-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/slab.c | 8 ++++++++ mm/slab.h | 23 ----------------------- mm/slub.c | 21 +++++++++++++++++++++ 3 files changed, 29 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 390e249aaa8d..af3d3887b9b8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3678,6 +3678,14 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) } EXPORT_SYMBOL(__kmalloc_track_caller); +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + if (memcg_kmem_enabled()) + return virt_to_cache(x); + else + return s; +} + /** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. diff --git a/mm/slab.h b/mm/slab.h index 58ed025a0b23..a21c78eccee7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -504,29 +504,6 @@ static __always_inline void uncharge_slab_page(struct page *page, int order, memcg_uncharge_slab(page, order, s); } -static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) -{ - struct kmem_cache *cachep; - - /* - * When kmemcg is not being used, both assignments should return the - * same value. but we don't want to pay the assignment price in that - * case. If it is not compiled in, the compiler should be smart enough - * to not do even the assignment. In that case, slab_equal_or_root - * will also be a constant. - */ - if (!memcg_kmem_enabled() && - !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && - !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) - return s; - - cachep = virt_to_cache(x); - WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), - "%s: Wrong slab cache. 
%s but object is from %s\n", - __func__, s->name, cachep->name); - return cachep; -} - static inline size_t slab_ksize(const struct kmem_cache *s) { #ifndef CONFIG_SLUB diff --git a/mm/slub.c b/mm/slub.c index 0b80a8409836..6a99a3c945cb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1525,6 +1525,10 @@ static bool freelist_corrupted(struct kmem_cache *s, struct page *page, { return false; } + +static void print_tracking(struct kmem_cache *s, void *object) +{ +} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -3171,6 +3175,23 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !memcg_kmem_enabled() && + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + if (WARN(cachep && !slab_equal_or_root(cachep, s), + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name)) + print_tracking(cachep, x); + return cachep; +} + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); -- cgit v1.2.3 From e42f174e43e47b623d9dbf814521c4961000c962 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Aug 2020 23:19:05 -0700 Subject: mm, slab/slub: improve error reporting and overhead of cache_from_obj() cache_from_obj() was added by commit b9ce5ef49f00 ("sl[au]b: always get the cache from its page in kmem_cache_free()") to support kmemcg, where per-memcg cache can be different from the root one, so we can't use the kmem_cache pointer given to kmem_cache_free(). Prior to that commit, SLUB already had debugging check+warning that could be enabled to compare the given kmem_cache pointer to one referenced by the slab page where the object-to-be-freed resides. This check was moved to cache_from_obj(). Later the check was also enabled for SLAB_FREELIST_HARDENED configs by commit 598a0717a816 ("mm/slab: validate cache membership under freelist hardening"). These checks and warnings can be useful especially for the debugging, which can be improved. Commit 598a0717a816 changed the pr_err() with WARN_ON_ONCE() to WARN_ONCE() so only the first hit is now reported, others are silent. This patch changes it to WARN() so that all errors are reported. It's also useful to print SLUB allocation/free tracking info for the offending object, if tracking is enabled. Thus, export the SLUB print_tracking() function and provide an empty one for SLAB. For SLUB we can also benefit from the static key check in kmem_cache_debug_flags(), but we need to move this function to slab.h and declare the static key there. 
[1] https://lore.kernel.org/r/20200608230654.828134-18-guro@fb.com [vbabka@suse.cz: avoid bogus WARN()] Link: https://lore.kernel.org/r/20200623090213.GW5535@shao2-debian Link: http://lkml.kernel.org/r/b33e0fa7-cd28-4788-9e54-5927846329ef@suse.cz Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Acked-by: Kees Cook Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Garrett Cc: Jann Horn Cc: Vijayanand Jitta Cc: Vinayak Menon Link: http://lkml.kernel.org/r/afeda7ac-748b-33d8-a905-56b708148ad5@suse.cz Signed-off-by: Linus Torvalds --- mm/slab.c | 8 -------- mm/slab.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ mm/slub.c | 38 +------------------------------------- 3 files changed, 46 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index af3d3887b9b8..390e249aaa8d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3678,14 +3678,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) } EXPORT_SYMBOL(__kmalloc_track_caller); -static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) -{ - if (memcg_kmem_enabled()) - return virt_to_cache(x); - else - return s; -} - /** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. diff --git a/mm/slab.h b/mm/slab.h index a21c78eccee7..fceb4341ba91 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -276,6 +276,34 @@ static inline int cache_vmstat_idx(struct kmem_cache *s) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE; } +#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SLUB_DEBUG_ON +DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); +#else +DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); +#endif +extern void print_tracking(struct kmem_cache *s, void *object); +#else +static inline void print_tracking(struct kmem_cache *s, void *object) +{ +} +#endif + +/* + * Returns true if any of the specified slub_debug flags is enabled for the + * cache. Use only for flags parsed by setup_slub_debug() as it also enables + * the static key. + */ +static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) +{ +#ifdef CONFIG_SLUB_DEBUG + VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); + if (static_branch_unlikely(&slub_debug_enabled)) + return s->flags & flags; +#endif + return false; +} + #ifdef CONFIG_MEMCG_KMEM /* List of all root caches. */ @@ -504,6 +532,23 @@ static __always_inline void uncharge_slab_page(struct page *page, int order, memcg_uncharge_slab(page, order, s); } +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !memcg_kmem_enabled() && + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + if (WARN(cachep && !slab_equal_or_root(cachep, s), + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name)) + print_tracking(cachep, x); + return cachep; +} + static inline size_t slab_ksize(const struct kmem_cache *s) { #ifndef CONFIG_SLUB diff --git a/mm/slub.c b/mm/slub.c index 6a99a3c945cb..629a58b56728 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -122,21 +122,6 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif -/* - * Returns true if any of the specified slub_debug flags is enabled for the - * cache. Use only for flags parsed by setup_slub_debug() as it also enables - * the static key. 
- */ -static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) -{ - VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); -#ifdef CONFIG_SLUB_DEBUG - if (static_branch_unlikely(&slub_debug_enabled)) - return s->flags & flags; -#endif - return false; -} - static inline bool kmem_cache_debug(struct kmem_cache *s) { return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); @@ -653,7 +638,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) #endif } -static void print_tracking(struct kmem_cache *s, void *object) +void print_tracking(struct kmem_cache *s, void *object) { unsigned long pr_time = jiffies; if (!(s->flags & SLAB_STORE_USER)) @@ -1525,10 +1510,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct page *page, { return false; } - -static void print_tracking(struct kmem_cache *s, void *object) -{ -} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -3175,23 +3156,6 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif -static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) -{ - struct kmem_cache *cachep; - - if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && - !memcg_kmem_enabled() && - !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) - return s; - - cachep = virt_to_cache(x); - if (WARN(cachep && !slab_equal_or_root(cachep, s), - "%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name)) - print_tracking(cachep, x); - return cachep; -} - void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); -- cgit v1.2.3 From b3cb9fc3aeaf9d52967c87d54a98f035e96279dc Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 6 Aug 2020 23:19:09 -0700 Subject: mm/slub.c: drop lockdep_assert_held() from put_map() There is no point in using lockdep_assert_held() on a lock that is about to be unlocked. It works only with lockdep, and lockdep will complain if spin_unlock() is used on a lock that has not been locked. Remove superfluous lockdep_assert_held(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Cc: Yu Zhao Cc: Christopher Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200618201234.795692-2-bigeasy@linutronix.de Signed-off-by: Linus Torvalds --- mm/slub.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 629a58b56728..e5fc31c37b37 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -473,8 +473,6 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) static void put_map(unsigned long *map) __releases(&object_map_lock) { VM_BUG_ON(map != object_map); - lockdep_assert_held(&object_map_lock); - spin_unlock(&object_map_lock); } -- cgit v1.2.3 From cfbe1636c3585c1e032bfac512fb8be903fbc913 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 6 Aug 2020 23:19:12 -0700 Subject: mm, kcsan: instrument SLAB/SLUB free with "ASSERT_EXCLUSIVE_ACCESS" Provide the necessary KCSAN checks to assist with debugging racy use-after-frees. While KASAN is more reliable at generally catching such use-after-frees (due to its use of a quarantine), it can be difficult to debug racy use-after-frees. If a reliable reproducer exists, KCSAN can assist in debugging such issues. Note: ASSERT_EXCLUSIVE_ACCESS is a convenience wrapper if the size is simply sizeof(var). Instead, here we just use __kcsan_check_access() explicitly to pass the correct size.
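For contrast, a hedged sketch of the convenience macro versus the explicit call used in the diff below (var is a hypothetical fixed-size object; objp/cachep as in the SLAB hunk):

    ASSERT_EXCLUSIVE_ACCESS(var);   /* convenience form, checks sizeof(var) bytes */
    /* slab objects have a runtime-determined size, so pass it explicitly: */
    __kcsan_check_access(objp, cachep->object_size,
                         KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
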
Signed-off-by: Marco Elver Signed-off-by: Andrew Morton Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200623072653.114563-1-elver@google.com Signed-off-by: Linus Torvalds --- mm/slab.c | 5 +++++ mm/slub.c | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 390e249aaa8d..fa31cbb76124 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3432,6 +3432,11 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, if (kasan_slab_free(cachep, objp, _RET_IP_)) return; + /* Use KCSAN to help debug racy use-after-free. */ + if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU)) + __kcsan_check_access(objp, cachep->object_size, + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); + ___cache_free(cachep, objp, caller); } diff --git a/mm/slub.c b/mm/slub.c index e5fc31c37b37..ae39eb392396 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1549,6 +1549,11 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); + /* Use KCSAN to help debug racy use-after-free. */ + if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) + __kcsan_check_access(x, s->object_size, + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); +