diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-19 12:15:32 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-05-19 12:15:32 -0700 |
commit | cb6f8739fbf98203d0fb0bc2c2dbbec0ddfe978a (patch) | |
tree | bdb69f5b9fb5a3e7b6810319468706bad2b6e1ac | |
parent | ff8583d6e4e33fe3856a609095c683d5dbe39120 (diff) | |
parent | de6da1e8bcf0dd2058b950b127491821207679dc (diff) |
Merge branch 'akpm' (patches from Andrew)
Merge yet more updates from Andrew Morton:
"A few final bits:
- large changes to vmalloc, yielding large performance benefits
- tweak the console-flush-on-panic code
- a few fixes"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
panic: add an option to replay all the printk message in buffer
initramfs: don't free a non-existent initrd
fs/writeback.c: use rcu_barrier() to wait for inflight wb switches going into workqueue when umount
mm/compaction.c: correct zone boundary handling when isolating pages from a pageblock
mm/vmap: add DEBUG_AUGMENT_LOWEST_MATCH_CHECK macro
mm/vmap: add DEBUG_AUGMENT_PROPAGATE_CHECK macro
mm/vmalloc.c: keep track of free blocks for vmap allocation
-rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 1 | ||||
-rw-r--r-- | arch/powerpc/kernel/traps.c | 2 | ||||
-rw-r--r-- | fs/fs-writeback.c | 11 | ||||
-rw-r--r-- | include/linux/console.h | 7 | ||||
-rw-r--r-- | include/linux/vmalloc.h | 6 | ||||
-rw-r--r-- | init/initramfs.c | 2 | ||||
-rw-r--r-- | kernel/panic.c | 6 | ||||
-rw-r--r-- | kernel/printk/printk.c | 12 | ||||
-rw-r--r-- | mm/compaction.c | 4 | ||||
-rw-r--r-- | mm/vmalloc.c | 1095 |
10 files changed, 889 insertions, 257 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 52e6fbb042cc..138f6664b2e2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3212,6 +3212,7 @@ bit 2: print timer info bit 3: print locks info if CONFIG_LOCKDEP is on bit 4: print ftrace buffer + bit 5: print all printk messages in buffer panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 665f294725cb..83e59fdaa62d 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -179,7 +179,7 @@ extern void panic_flush_kmsg_end(void) kmsg_dump(KMSG_DUMP_PANIC); bust_spinlocks(0); debug_locks_off(); - console_flush_on_panic(); + console_flush_on_panic(CONSOLE_FLUSH_PENDING); } static unsigned long oops_begin(struct pt_regs *regs) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 36855c1f8daf..b16645b417d9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -523,8 +523,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) isw->inode = inode; - atomic_inc(&isw_nr_in_flight); - /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page @@ -532,6 +530,9 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); + + atomic_inc(&isw_nr_in_flight); + goto out_unlock; out_free: @@ -901,7 +902,11 @@ restart: void cgroup_writeback_umount(void) { if (atomic_read(&isw_nr_in_flight)) { - synchronize_rcu(); + /* + * Use rcu_barrier() to wait for all pending callbacks to + * ensure that all in-flight wb switches are in the workqueue. + */ + rcu_barrier(); flush_workqueue(isw_wq); } } diff --git a/include/linux/console.h b/include/linux/console.h index ec9bdb3d7bab..d09951d5a94e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -166,6 +166,11 @@ struct console { extern int console_set_on_cmdline; extern struct console *early_console; +enum con_flush_mode { + CONSOLE_FLUSH_PENDING, + CONSOLE_REPLAY_ALL, +}; + extern int add_preferred_console(char *name, int idx, char *options); extern void register_console(struct console *); extern int unregister_console(struct console *); @@ -175,7 +180,7 @@ extern int console_trylock(void); extern void console_unlock(void); extern void console_conditional_schedule(void); extern void console_unblank(void); -extern void console_flush_on_panic(void); +extern void console_flush_on_panic(enum con_flush_mode mode); extern struct tty_driver *console_device(int *); extern void console_stop(struct console *); extern void console_start(struct console *); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c6eebb839552..51e131245379 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -50,12 +50,16 @@ struct vm_struct { struct vmap_area { unsigned long va_start; unsigned long va_end; + + /* + * Largest available free size in subtree. + */ + unsigned long subtree_max_size; unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ struct llist_node purge_list; /* "lazy purge" list */ struct vm_struct *vm; - struct rcu_head rcu_head; }; /* diff --git a/init/initramfs.c b/init/initramfs.c index 435a428c2af1..178130fd61c2 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -669,7 +669,7 @@ done: * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ - if (!do_retain_initrd && !kexec_free_initrd()) + if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) free_initrd_mem(initrd_start, initrd_end); initrd_start = 0; initrd_end = 0; diff --git a/kernel/panic.c b/kernel/panic.c index 8779d64bace0..b4543a31a495 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 +#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -134,6 +135,9 @@ EXPORT_SYMBOL(nmi_panic); static void panic_print_sys_info(void) { + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) + console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -277,7 +281,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_flush_on_panic(); + console_flush_on_panic(CONSOLE_FLUSH_PENDING); panic_print_sys_info(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 17102fd4c136..a6e06fe38e41 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2535,10 +2535,11 @@ void console_unblank(void) /** * console_flush_on_panic - flush console content on panic + * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ -void console_flush_on_panic(void) +void console_flush_on_panic(enum con_flush_mode mode) { /* * If someone else is holding the console lock, trylock will fail @@ -2549,6 +2550,15 @@ void console_flush_on_panic(void) */ console_trylock(); console_may_schedule = 0; + + if (mode == CONSOLE_REPLAY_ALL) { + unsigned long flags; + + logbuf_lock_irqsave(flags); + console_seq = log_first_seq; + console_idx = log_first_idx; + logbuf_unlock_irqrestore(flags); + } console_unlock(); } diff --git a/mm/compaction.c b/mm/compaction.c index cbac7277978a..9febc8cc84e7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1230,7 +1230,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long /* Pageblock boundaries */ start_pfn = pageblock_start_pfn(pfn); - end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone)); + end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1; /* Scan before */ if (start_pfn != pfn) { @@ -1241,7 +1241,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long /* Scan after */ start_pfn = pfn + nr_isolated; - if (start_pfn != end_pfn) + if (start_pfn < end_pfn) isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 67bbb8d2a0a8..c42872ed82ac 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -32,6 +32,7 @@ #include <linux/compiler.h> #include <linux/llist.h> #include <linux/bitops.h> +#include <linux/rbtree_augmented.h> #include <linux/uaccess.h> #include <asm/tlbflush.h> @@ -324,6 +325,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ +#define DEBUG_AUGMENT_PROPAGATE_CHECK 0 +#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 + #define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 @@ -332,14 +336,67 @@ static DEFINE_SPINLOCK(vmap_area_lock); LIST_HEAD(vmap_area_list); static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; +static bool vmap_initialized __read_mostly; + +/* + * This kmem_cache is used for vmap_area objects. Instead of + * allocating from slab we reuse an object from this cache to + * make things faster. Especially in "no edge" splitting of + * free block. + */ +static struct kmem_cache *vmap_area_cachep; + +/* + * This linked list is used in pair with free_vmap_area_root. + * It gives O(1) access to prev/next to perform fast coalescing. + */ +static LIST_HEAD(free_vmap_area_list); + +/* + * This augment red-black tree represents the free vmap space. + * All vmap_area objects in this tree are sorted by va->va_start + * address. It is used for allocation and merging when a vmap + * object is released. + * + * Each vmap_area node contains a maximum available free block + * of its sub-tree, right or left. Therefore it is possible to + * find a lowest match of free area. + */ +static struct rb_root free_vmap_area_root = RB_ROOT; + +static __always_inline unsigned long +va_size(struct vmap_area *va) +{ + return (va->va_end - va->va_start); +} + +static __always_inline unsigned long +get_subtree_max_size(struct rb_node *node) +{ + struct vmap_area *va; + + va = rb_entry_safe(node, struct vmap_area, rb_node); + return va ? va->subtree_max_size : 0; +} + +/* + * Gets called when remove the node and rotate. + */ +static __always_inline unsigned long +compute_subtree_max_size(struct vmap_area *va) +{ + return max3(va_size(va), + get_subtree_max_size(va->rb_node.rb_left), + get_subtree_max_size(va->rb_node.rb_right)); +} -/* The vmap cache globals are protected by vmap_area_lock */ -static struct rb_node *free_vmap_cache; -static unsigned long cached_hole_size; -static unsigned long cached_vstart; -static unsigned long cached_align; +RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, + compute_subtree_max_size) -static unsigned long vmap_area_pcpu_hole; +static void purge_vmap_area_lazy(void); +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); +static unsigned long lazy_max_pages(void); static struct vmap_area *__find_vmap_area(unsigned long addr) { @@ -360,41 +417,610 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) return NULL; } -static void __insert_vmap_area(struct vmap_area *va) -{ - struct rb_node **p = &vmap_area_root.rb_node; - struct rb_node *parent = NULL; - struct rb_node *tmp; +/* + * This function returns back addresses of parent node + * and its left or right link for further processing. + */ +static __always_inline struct rb_node ** +find_va_links(struct vmap_area *va, + struct rb_root *root, struct rb_node *from, + struct rb_node **parent) +{ + struct vmap_area *tmp_va; + struct rb_node **link; + + if (root) { + link = &root->rb_node; + if (unlikely(!*link)) { + *parent = NULL; + return link; + } + } else { + link = &from; + } - while (*p) { - struct vmap_area *tmp_va; + /* + * Go to the bottom of the tree. When we hit the last point + * we end up with parent rb_node and correct direction, i name + * it link, where the new va->rb_node will be attached to. + */ + do { + tmp_va = rb_entry(*link, struct vmap_area, rb_node); - parent = *p; - tmp_va = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp_va->va_end) - p = &(*p)->rb_left; - else if (va->va_end > tmp_va->va_start) - p = &(*p)->rb_right; + /* + * During the traversal we also do some sanity check. + * Trigger the BUG() if there are sides(left/right) + * or full overlaps. + */ + if (va->va_start < tmp_va->va_end && + va->va_end <= tmp_va->va_start) + link = &(*link)->rb_left; + else if (va->va_end > tmp_va->va_start && + va->va_start >= tmp_va->va_end) + link = &(*link)->rb_right; else BUG(); + } while (*link); + + *parent = &tmp_va->rb_node; + return link; +} + +static __always_inline struct list_head * +get_va_next_sibling(struct rb_node *parent, struct rb_node **link) +{ + struct list_head *list; + + if (unlikely(!parent)) + /* + * The red-black tree where we try to find VA neighbors + * before merging or inserting is empty, i.e. it means + * there is no free vmap space. Normally it does not + * happen but we handle this case anyway. + */ + return NULL; + + list = &rb_entry(parent, struct vmap_area, rb_node)->list; + return (&parent->rb_right == link ? list->next : list); +} + +static __always_inline void +link_va(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, struct list_head *head) +{ + /* + * VA is still not in the list, but we can + * identify its future previous list_head node. + */ + if (likely(parent)) { + head = &rb_entry(parent, struct vmap_area, rb_node)->list; + if (&parent->rb_right != link) + head = head->prev; } - rb_link_node(&va->rb_node, parent, p); - rb_insert_color(&va->rb_node, &vmap_area_root); + /* Insert to the rb-tree */ + rb_link_node(&va->rb_node, parent, link); + if (root == &free_vmap_area_root) { + /* + * Some explanation here. Just perform simple insertion + * to the tree. We do not set va->subtree_max_size to + * its current size before calling rb_insert_augmented(). + * It is because of we populate the tree from the bottom + * to parent levels when the node _is_ in the tree. + * + * Therefore we set subtree_max_size to zero after insertion, + * to let __augment_tree_propagate_from() puts everything to + * the correct order later on. + */ + rb_insert_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + va->subtree_max_size = 0; + } else { + rb_insert_color(&va->rb_node, root); + } - /* address-sort this list */ - tmp = rb_prev(&va->rb_node); - if (tmp) { - struct vmap_area *prev; - prev = rb_entry(tmp, struct vmap_area, rb_node); - list_add_rcu(&va->list, &prev->list); - } else - list_add_rcu(&va->list, &vmap_area_list); + /* Address-sort this list */ + list_add(&va->list, head); } -static void purge_vmap_area_lazy(void); +static __always_inline void +unlink_va(struct vmap_area *va, struct rb_root *root) +{ + /* + * During merging a VA node can be empty, therefore + * not linked with the tree nor list. Just check it. + */ + if (!RB_EMPTY_NODE(&va->rb_node)) { + if (root == &free_vmap_area_root) + rb_erase_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + else + rb_erase(&va->rb_node, root); -static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); + list_del(&va->list); + RB_CLEAR_NODE(&va->rb_node); + } +} + +#if DEBUG_AUGMENT_PROPAGATE_CHECK +static void +augment_tree_propagate_check(struct rb_node *n) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long size; + bool found = false; + + if (n == NULL) + return; + + va = rb_entry(n, struct vmap_area, rb_node); + size = va->subtree_max_size; + node = n; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) == size) { + node = node->rb_left; + } else { + if (va_size(va) == size) { + found = true; + break; + } + + node = node->rb_right; + } + } + + if (!found) { + va = rb_entry(n, struct vmap_area, rb_node); + pr_emerg("tree is corrupted: %lu, %lu\n", + va_size(va), va->subtree_max_size); + } + + augment_tree_propagate_check(n->rb_left); + augment_tree_propagate_check(n->rb_right); +} +#endif + +/* + * This function populates subtree_max_size from bottom to upper + * levels starting from VA point. The propagation must be done + * when VA size is modified by changing its va_start/va_end. Or + * in case of newly inserting of VA to the tree. + * + * It means that __augment_tree_propagate_from() must be called: + * - After VA has been inserted to the tree(free path); + * - After VA has been shrunk(allocation path); + * - After VA has been increased(merging path). + * + * Please note that, it does not mean that upper parent nodes + * and their subtree_max_size are recalculated all the time up + * to the root node. + * + * 4--8 + * /\ + * / \ + * / \ + * 2--2 8--8 + * + * For example if we modify the node 4, shrinking it to 2, then + * no any modification is required. If we shrink the node 2 to 1 + * its subtree_max_size is updated only, and set to 1. If we shrink + * the node 8 to 6, then its subtree_max_size is set to 6 and parent + * node becomes 4--6. + */ +static __always_inline void +augment_tree_propagate_from(struct vmap_area *va) +{ + struct rb_node *node = &va->rb_node; + unsigned long new_va_sub_max_size; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + new_va_sub_max_size = compute_subtree_max_size(va); + + /* + * If the newly calculated maximum available size of the + * subtree is equal to the current one, then it means that + * the tree is propagated correctly. So we have to stop at + * this point to save cycles. + */ + if (va->subtree_max_size == new_va_sub_max_size) + break; + + va->subtree_max_size = new_va_sub_max_size; + node = rb_parent(&va->rb_node); + } + +#if DEBUG_AUGMENT_PROPAGATE_CHECK + augment_tree_propagate_check(free_vmap_area_root.rb_node); +#endif +} + +static void +insert_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + link = find_va_links(va, root, NULL, &parent); + link_va(va, root, parent, link, head); +} + +static void +insert_vmap_area_augment(struct vmap_area *va, + struct rb_node *from, struct rb_root *root, + struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + if (from) + link = find_va_links(va, NULL, from, &parent); + else + link = find_va_links(va, root, NULL, &parent); + + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); +} + +/* + * Merge de-allocated chunk of VA memory with previous + * and next free blocks. If coalesce is not done a new + * free area is inserted. If VA has been merged, it is + * freed. + */ +static __always_inline void +merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct vmap_area *sibling; + struct list_head *next; + struct rb_node **link; + struct rb_node *parent; + bool merged = false; + + /* + * Find a place in the tree where VA potentially will be + * inserted, unless it is merged with its sibling/siblings. + */ + link = find_va_links(va, root, NULL, &parent); + + /* + * Get next node of VA to check if merging can be done. + */ + next = get_va_next_sibling(parent, link); + if (unlikely(next == NULL)) + goto insert; + + /* + * start end + * | | + * |<------VA------>|<-----Next----->| + * | | + * start end + */ + if (next != head) { + sibling = list_entry(next, struct vmap_area, list); + if (sibling->va_start == va->va_end) { + sibling->va_start = va->va_start; + + /* Check and update the tree if needed. */ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + /* Point to the new merged area. */ + va = sibling; + merged = true; + } + } + + /* + * start end + * | | + * |<-----Prev----->|<------VA------>| + * | | + * start end + */ + if (next->prev != head) { + sibling = list_entry(next->prev, struct vmap_area, list); + if (sibling->va_end == va->va_start) { + sibling->va_end = va->va_end; + + /* Check and update the tree if needed. */ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + return; + } + } + +insert: + if (!merged) { + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); + } +} + +static __always_inline bool +is_within_this_va(struct vmap_area *va, unsigned long size, + unsigned long align, unsigned long vstart) +{ + unsigned long nva_start_addr; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Can be overflowed due to big size or alignment. */ + if (nva_start_addr + size < nva_start_addr || + nva_start_addr < vstart) + return false; + + return (nva_start_addr + size <= va->va_end); +} + +/* + * Find the first free block(lowest start address) in the tree, + * that will accomplish the request corresponding to passing + * parameters. + */ +static __always_inline struct vmap_area * +find_vmap_lowest_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long length; + + /* Start from the root. */ + node = free_vmap_area_root.rb_node; + + /* Adjust the search size for alignment overhead. */ + length = size + align - 1; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) >= length && + vstart < va->va_start) { + node = node->rb_left; + } else { + if (is_within_this_va(va, size, align, vstart)) + return va; + + /* + * Does not make sense to go deeper towards the right + * sub-tree if it does not have a free block that is + * equal or bigger to the requested search length. + */ + if (get_subtree_max_size(node->rb_right) >= length) { + node = node->rb_right; + continue; + } + + /* + * OK. We roll back and find the fist right sub-tree, + * that will satisfy the search criteria. It can happen + * only once due to "vstart" restriction. + */ + while ((node = rb_parent(node))) { + va = rb_entry(node, struct vmap_area, rb_node); + if (is_within_this_va(va, size, align, vstart)) + return va; + + if (get_subtree_max_size(node->rb_right) >= length && + vstart <= va->va_start) { + node = node->rb_right; + break; + } + } + } + } + + return NULL; +} + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK +#include <linux/random.h> + +static struct vmap_area * +find_vmap_lowest_linear_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + + list_for_each_entry(va, &free_vmap_area_list, list) { + if (!is_within_this_va(va, size, align, vstart)) + continue; + + return va; + } + + return NULL; +} + +static void +find_vmap_lowest_match_check(unsigned long size) +{ + struct vmap_area *va_1, *va_2; + unsigned long vstart; + unsigned int rnd; + + get_random_bytes(&rnd, sizeof(rnd)); + vstart = VMALLOC_START + rnd; + + va_1 = find_vmap_lowest_match(size, 1, vstart); + va_2 = find_vmap_lowest_linear_match(size, 1, vstart); + + if (va_1 != va_2) + pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", + va_1, va_2, vstart); +} +#endif + +enum fit_type { + NOTHING_FIT = 0, + FL_FIT_TYPE = 1, /* full fit */ + LE_FIT_TYPE = 2, /* left edge fit */ + RE_FIT_TYPE = 3, /* right edge fit */ + NE_FIT_TYPE = 4 /* no edge fit */ +}; + +static __always_inline enum fit_type +classify_va_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size) +{ + enum fit_type type; + + /* Check if it is within VA. */ + if (nva_start_addr < va->va_start || + nva_start_addr + size > va->va_end) + return NOTHING_FIT; + + /* Now classify. */ + if (va->va_start == nva_start_addr) { + if (va->va_end == nva_start_addr + size) + type = FL_FIT_TYPE; + else + type = LE_FIT_TYPE; + } else if (va->va_end == nva_start_addr + size) { + type = RE_FIT_TYPE; + } else { + type = NE_FIT_TYPE; + } + + return type; +} + +static __always_inline int +adjust_va_to_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size, + enum fit_type type) +{ + struct vmap_area *lva; + + if (type == FL_FIT_TYPE) { + /* + * No need to split VA, it fully fits. + * + * | | + * V NVA V + * |---------------| + */ + unlink_va(va, &free_vmap_area_root); + kmem_cache_free(vmap_area_cachep, va); + } else if (type == LE_FIT_TYPE) { + /* + * Split left edge of fit VA. + * + * | | + * V NVA V R + * |-------|-------| + */ + va->va_start += size; + } else if (type == RE_FIT_TYPE) { + /* + * Split right edge of fit VA. + * + * | | + * L V NVA V + * |-------|-------| + */ + va->va_end = nva_start_addr; + } else if (type == NE_FIT_TYPE) { + /* + * Split no edge of fit VA. + * + * | | + * L V NVA V R + * |---|-------|---| + */ + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); + if (unlikely(!lva)) + return -1; + + /* + * Build the remainder. + */ + lva->va_start = va->va_start; + lva->va_end = nva_start_addr; + + /* + * Shrink this VA to remaining size. + */ + va->va_start = nva_start_addr + size; + } else { + return -1; + } + + if (type != FL_FIT_TYPE) { + augment_tree_propagate_from(va); + + if (type == NE_FIT_TYPE) + insert_vmap_area_augment(lva, &va->rb_node, + &free_vmap_area_root, &free_vmap_area_list); + } + + return 0; +} + +/* + * Returns a start address of the newly allocated area, if success. + * Otherwise a vend is returned that indicates failure. + */ +static __always_inline unsigned long +__alloc_vmap_area(unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, int node) +{ + unsigned long nva_start_addr; + struct vmap_area *va; + enum fit_type type; + int ret; + + va = find_vmap_lowest_match(size, align, vstart); + if (unlikely(!va)) + return vend; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Check the "vend" restriction. */ + if (nva_start_addr + size > vend) + return vend; + + /* Classify what we have found. */ + type = classify_va_fit_type(va, nva_start_addr, size); + if (WARN_ON_ONCE(type == NOTHING_FIT)) + return vend; + + /* Update the free vmap_area. */ + ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); + if (ret) + return vend; + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK + find_vmap_lowest_match_check(size); +#endif + + return nva_start_addr; +} /* * Allocate a region of KVA of the specified size and alignment, within the @@ -406,18 +1032,19 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask) { struct vmap_area *va; - struct rb_node *n; unsigned long addr; int purged = 0; - struct vmap_area *first; BUG_ON(!size); BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); + if (unlikely(!vmap_initialized)) + return ERR_PTR(-EBUSY); + might_sleep(); - va = kmalloc_node(sizeof(struct vmap_area), + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -430,87 +1057,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, retry: spin_lock(&vmap_area_lock); - /* - * Invalidate cache if we have more permissive parameters. - * cached_hole_size notes the largest hole noticed _below_ - * the vmap_area cached in free_vmap_cache: if size fits - * into that hole, we want to scan from vstart to reuse - * the hole instead of allocating above free_vmap_cache. - * Note that __free_vmap_area may update free_vmap_cache - * without updating cached_hole_size or cached_align. - */ - if (!free_vmap_cache || - size < cached_hole_size || - vstart < cached_vstart || - align < cached_align) { -nocache: - cached_hole_size = 0; - free_vmap_cache = NULL; - } - /* record if we encounter less permissive parameters */ - cached_vstart = vstart; - cached_align = align; - - /* find starting point for our search */ - if (free_vmap_cache) { - first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); - addr = ALIGN(first->va_end, align); - if (addr < vstart) - goto nocache; - if (addr + size < addr) - goto overflow; - - } else { - addr = ALIGN(vstart, align); - if (addr + size < addr) - goto overflow; - - n = vmap_area_root.rb_node; - first = NULL; - - while (n) { - struct vmap_area *tmp; - tmp = rb_entry(n, struct vmap_area, rb_node); - if (tmp->va_end >= addr) { - first = tmp; - if (tmp->va_start <= addr) - break; - n = n->rb_left; - } else - n = n->rb_right; - } - - if (!first) - goto found; - } - - /* from the starting point, walk areas until a suitable hole is found */ - while (addr + size > first->va_start && addr + size <= vend) { - if (addr + cached_hole_size < first->va_start) - cached_hole_size = first->va_start - addr; - addr = ALIGN(first->va_end, align); - if (addr + size < addr) - goto overflow; - - if (list_is_last(&first->list, &vmap_area_list)) - goto found; - first = list_next_entry(first, list); - } - -found: /* - * Check also calculated address against the vstart, - * because it can be 0 because of big align request. + * If an allocation fails, the "vend" address is + * returned. Therefore trigger the overflow path. */ - if (addr + size > vend || addr < vstart) + addr = __alloc_vmap_area(size, align, vstart, vend, node); + if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->flags = 0; - __insert_vmap_ |