summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt1
-rw-r--r--arch/powerpc/kernel/traps.c2
-rw-r--r--fs/fs-writeback.c11
-rw-r--r--include/linux/console.h7
-rw-r--r--include/linux/vmalloc.h6
-rw-r--r--init/initramfs.c2
-rw-r--r--kernel/panic.c6
-rw-r--r--kernel/printk/printk.c12
-rw-r--r--mm/compaction.c4
-rw-r--r--mm/vmalloc.c1095
10 files changed, 889 insertions, 257 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 52e6fbb042cc..138f6664b2e2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3212,6 +3212,7 @@
bit 2: print timer info
bit 3: print locks info if CONFIG_LOCKDEP is on
bit 4: print ftrace buffer
+ bit 5: print all printk messages in buffer
panic_on_warn panic() instead of WARN(). Useful to cause kdump
on a WARN().
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 665f294725cb..83e59fdaa62d 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -179,7 +179,7 @@ extern void panic_flush_kmsg_end(void)
kmsg_dump(KMSG_DUMP_PANIC);
bust_spinlocks(0);
debug_locks_off();
- console_flush_on_panic();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
}
static unsigned long oops_begin(struct pt_regs *regs)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 36855c1f8daf..b16645b417d9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -523,8 +523,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
isw->inode = inode;
- atomic_inc(&isw_nr_in_flight);
-
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the i_page
@@ -532,6 +530,9 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+
+ atomic_inc(&isw_nr_in_flight);
+
goto out_unlock;
out_free:
@@ -901,7 +902,11 @@ restart:
void cgroup_writeback_umount(void)
{
if (atomic_read(&isw_nr_in_flight)) {
- synchronize_rcu();
+ /*
+ * Use rcu_barrier() to wait for all pending callbacks to
+ * ensure that all in-flight wb switches are in the workqueue.
+ */
+ rcu_barrier();
flush_workqueue(isw_wq);
}
}
diff --git a/include/linux/console.h b/include/linux/console.h
index ec9bdb3d7bab..d09951d5a94e 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -166,6 +166,11 @@ struct console {
extern int console_set_on_cmdline;
extern struct console *early_console;
+enum con_flush_mode {
+ CONSOLE_FLUSH_PENDING,
+ CONSOLE_REPLAY_ALL,
+};
+
extern int add_preferred_console(char *name, int idx, char *options);
extern void register_console(struct console *);
extern int unregister_console(struct console *);
@@ -175,7 +180,7 @@ extern int console_trylock(void);
extern void console_unlock(void);
extern void console_conditional_schedule(void);
extern void console_unblank(void);
-extern void console_flush_on_panic(void);
+extern void console_flush_on_panic(enum con_flush_mode mode);
extern struct tty_driver *console_device(int *);
extern void console_stop(struct console *);
extern void console_start(struct console *);
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c6eebb839552..51e131245379 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -50,12 +50,16 @@ struct vm_struct {
struct vmap_area {
unsigned long va_start;
unsigned long va_end;
+
+ /*
+ * Largest available free size in subtree.
+ */
+ unsigned long subtree_max_size;
unsigned long flags;
struct rb_node rb_node; /* address sorted rbtree */
struct list_head list; /* address sorted list */
struct llist_node purge_list; /* "lazy purge" list */
struct vm_struct *vm;
- struct rcu_head rcu_head;
};
/*
diff --git a/init/initramfs.c b/init/initramfs.c
index 435a428c2af1..178130fd61c2 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -669,7 +669,7 @@ done:
* If the initrd region is overlapped with crashkernel reserved region,
* free only memory that is not part of crashkernel region.
*/
- if (!do_retain_initrd && !kexec_free_initrd())
+ if (!do_retain_initrd && initrd_start && !kexec_free_initrd())
free_initrd_mem(initrd_start, initrd_end);
initrd_start = 0;
initrd_end = 0;
diff --git a/kernel/panic.c b/kernel/panic.c
index 8779d64bace0..b4543a31a495 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -51,6 +51,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_TIMER_INFO 0x00000004
#define PANIC_PRINT_LOCK_INFO 0x00000008
#define PANIC_PRINT_FTRACE_INFO 0x00000010
+#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -134,6 +135,9 @@ EXPORT_SYMBOL(nmi_panic);
static void panic_print_sys_info(void)
{
+ if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
+
if (panic_print & PANIC_PRINT_TASK_INFO)
show_state();
@@ -277,7 +281,7 @@ void panic(const char *fmt, ...)
* panic() is not being callled from OOPS.
*/
debug_locks_off();
- console_flush_on_panic();
+ console_flush_on_panic(CONSOLE_FLUSH_PENDING);
panic_print_sys_info();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 17102fd4c136..a6e06fe38e41 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2535,10 +2535,11 @@ void console_unblank(void)
/**
* console_flush_on_panic - flush console content on panic
+ * @mode: flush all messages in buffer or just the pending ones
*
* Immediately output all pending messages no matter what.
*/
-void console_flush_on_panic(void)
+void console_flush_on_panic(enum con_flush_mode mode)
{
/*
* If someone else is holding the console lock, trylock will fail
@@ -2549,6 +2550,15 @@ void console_flush_on_panic(void)
*/
console_trylock();
console_may_schedule = 0;
+
+ if (mode == CONSOLE_REPLAY_ALL) {
+ unsigned long flags;
+
+ logbuf_lock_irqsave(flags);
+ console_seq = log_first_seq;
+ console_idx = log_first_idx;
+ logbuf_unlock_irqrestore(flags);
+ }
console_unlock();
}
diff --git a/mm/compaction.c b/mm/compaction.c
index cbac7277978a..9febc8cc84e7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1230,7 +1230,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
/* Pageblock boundaries */
start_pfn = pageblock_start_pfn(pfn);
- end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone));
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
/* Scan before */
if (start_pfn != pfn) {
@@ -1241,7 +1241,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
/* Scan after */
start_pfn = pfn + nr_isolated;
- if (start_pfn != end_pfn)
+ if (start_pfn < end_pfn)
isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
/* Skip this pageblock in the future as it's full or nearly full */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 67bbb8d2a0a8..c42872ed82ac 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -32,6 +32,7 @@
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
+#include <linux/rbtree_augmented.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
@@ -324,6 +325,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
/*** Global kva allocator ***/
+#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
+#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
+
#define VM_LAZY_FREE 0x02
#define VM_VM_AREA 0x04
@@ -332,14 +336,67 @@ static DEFINE_SPINLOCK(vmap_area_lock);
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
+static bool vmap_initialized __read_mostly;
+
+/*
+ * This kmem_cache is used for vmap_area objects. Instead of
+ * allocating from slab we reuse an object from this cache to
+ * make things faster. Especially in "no edge" splitting of
+ * free block.
+ */
+static struct kmem_cache *vmap_area_cachep;
+
+/*
+ * This linked list is used in pair with free_vmap_area_root.
+ * It gives O(1) access to prev/next to perform fast coalescing.
+ */
+static LIST_HEAD(free_vmap_area_list);
+
+/*
+ * This augment red-black tree represents the free vmap space.
+ * All vmap_area objects in this tree are sorted by va->va_start
+ * address. It is used for allocation and merging when a vmap
+ * object is released.
+ *
+ * Each vmap_area node contains a maximum available free block
+ * of its sub-tree, right or left. Therefore it is possible to
+ * find a lowest match of free area.
+ */
+static struct rb_root free_vmap_area_root = RB_ROOT;
+
+static __always_inline unsigned long
+va_size(struct vmap_area *va)
+{
+ return (va->va_end - va->va_start);
+}
+
+static __always_inline unsigned long
+get_subtree_max_size(struct rb_node *node)
+{
+ struct vmap_area *va;
+
+ va = rb_entry_safe(node, struct vmap_area, rb_node);
+ return va ? va->subtree_max_size : 0;
+}
+
+/*
+ * Gets called when remove the node and rotate.
+ */
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+ return max3(va_size(va),
+ get_subtree_max_size(va->rb_node.rb_left),
+ get_subtree_max_size(va->rb_node.rb_right));
+}
-/* The vmap cache globals are protected by vmap_area_lock */
-static struct rb_node *free_vmap_cache;
-static unsigned long cached_hole_size;
-static unsigned long cached_vstart;
-static unsigned long cached_align;
+RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb,
+ struct vmap_area, rb_node, unsigned long, subtree_max_size,
+ compute_subtree_max_size)
-static unsigned long vmap_area_pcpu_hole;
+static void purge_vmap_area_lazy(void);
+static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+static unsigned long lazy_max_pages(void);
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
@@ -360,41 +417,610 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
return NULL;
}
-static void __insert_vmap_area(struct vmap_area *va)
-{
- struct rb_node **p = &vmap_area_root.rb_node;
- struct rb_node *parent = NULL;
- struct rb_node *tmp;
+/*
+ * This function returns back addresses of parent node
+ * and its left or right link for further processing.
+ */
+static __always_inline struct rb_node **
+find_va_links(struct vmap_area *va,
+ struct rb_root *root, struct rb_node *from,
+ struct rb_node **parent)
+{
+ struct vmap_area *tmp_va;
+ struct rb_node **link;
+
+ if (root) {
+ link = &root->rb_node;
+ if (unlikely(!*link)) {
+ *parent = NULL;
+ return link;
+ }
+ } else {
+ link = &from;
+ }
- while (*p) {
- struct vmap_area *tmp_va;
+ /*
+ * Go to the bottom of the tree. When we hit the last point
+ * we end up with parent rb_node and correct direction, i name
+ * it link, where the new va->rb_node will be attached to.
+ */
+ do {
+ tmp_va = rb_entry(*link, struct vmap_area, rb_node);
- parent = *p;
- tmp_va = rb_entry(parent, struct vmap_area, rb_node);
- if (va->va_start < tmp_va->va_end)
- p = &(*p)->rb_left;
- else if (va->va_end > tmp_va->va_start)
- p = &(*p)->rb_right;
+ /*
+ * During the traversal we also do some sanity check.
+ * Trigger the BUG() if there are sides(left/right)
+ * or full overlaps.
+ */
+ if (va->va_start < tmp_va->va_end &&
+ va->va_end <= tmp_va->va_start)
+ link = &(*link)->rb_left;
+ else if (va->va_end > tmp_va->va_start &&
+ va->va_start >= tmp_va->va_end)
+ link = &(*link)->rb_right;
else
BUG();
+ } while (*link);
+
+ *parent = &tmp_va->rb_node;
+ return link;
+}
+
+static __always_inline struct list_head *
+get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
+{
+ struct list_head *list;
+
+ if (unlikely(!parent))
+ /*
+ * The red-black tree where we try to find VA neighbors
+ * before merging or inserting is empty, i.e. it means
+ * there is no free vmap space. Normally it does not
+ * happen but we handle this case anyway.
+ */
+ return NULL;
+
+ list = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ return (&parent->rb_right == link ? list->next : list);
+}
+
+static __always_inline void
+link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link, struct list_head *head)
+{
+ /*
+ * VA is still not in the list, but we can
+ * identify its future previous list_head node.
+ */
+ if (likely(parent)) {
+ head = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ if (&parent->rb_right != link)
+ head = head->prev;
}
- rb_link_node(&va->rb_node, parent, p);
- rb_insert_color(&va->rb_node, &vmap_area_root);
+ /* Insert to the rb-tree */
+ rb_link_node(&va->rb_node, parent, link);
+ if (root == &free_vmap_area_root) {
+ /*
+ * Some explanation here. Just perform simple insertion
+ * to the tree. We do not set va->subtree_max_size to
+ * its current size before calling rb_insert_augmented().
+ * It is because of we populate the tree from the bottom
+ * to parent levels when the node _is_ in the tree.
+ *
+ * Therefore we set subtree_max_size to zero after insertion,
+ * to let __augment_tree_propagate_from() puts everything to
+ * the correct order later on.
+ */
+ rb_insert_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ va->subtree_max_size = 0;
+ } else {
+ rb_insert_color(&va->rb_node, root);
+ }
- /* address-sort this list */
- tmp = rb_prev(&va->rb_node);
- if (tmp) {
- struct vmap_area *prev;
- prev = rb_entry(tmp, struct vmap_area, rb_node);
- list_add_rcu(&va->list, &prev->list);
- } else
- list_add_rcu(&va->list, &vmap_area_list);
+ /* Address-sort this list */
+ list_add(&va->list, head);
}
-static void purge_vmap_area_lazy(void);
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+ /*
+ * During merging a VA node can be empty, therefore
+ * not linked with the tree nor list. Just check it.
+ */
+ if (!RB_EMPTY_NODE(&va->rb_node)) {
+ if (root == &free_vmap_area_root)
+ rb_erase_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ else
+ rb_erase(&va->rb_node, root);
-static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+ list_del(&va->list);
+ RB_CLEAR_NODE(&va->rb_node);
+ }
+}
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+static void
+augment_tree_propagate_check(struct rb_node *n)
+{
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long size;
+ bool found = false;
+
+ if (n == NULL)
+ return;
+
+ va = rb_entry(n, struct vmap_area, rb_node);
+ size = va->subtree_max_size;
+ node = n;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+
+ if (get_subtree_max_size(node->rb_left) == size) {
+ node = node->rb_left;
+ } else {
+ if (va_size(va) == size) {
+ found = true;
+ break;
+ }
+
+ node = node->rb_right;
+ }
+ }
+
+ if (!found) {
+ va = rb_entry(n, struct vmap_area, rb_node);
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
+ }
+
+ augment_tree_propagate_check(n->rb_left);
+ augment_tree_propagate_check(n->rb_right);
+}
+#endif
+
+/*
+ * This function populates subtree_max_size from bottom to upper
+ * levels starting from VA point. The propagation must be done
+ * when VA size is modified by changing its va_start/va_end. Or
+ * in case of newly inserting of VA to the tree.
+ *
+ * It means that __augment_tree_propagate_from() must be called:
+ * - After VA has been inserted to the tree(free path);
+ * - After VA has been shrunk(allocation path);
+ * - After VA has been increased(merging path).
+ *
+ * Please note that, it does not mean that upper parent nodes
+ * and their subtree_max_size are recalculated all the time up
+ * to the root node.
+ *
+ * 4--8
+ * /\
+ * / \
+ * / \
+ * 2--2 8--8
+ *
+ * For example if we modify the node 4, shrinking it to 2, then
+ * no any modification is required. If we shrink the node 2 to 1
+ * its subtree_max_size is updated only, and set to 1. If we shrink
+ * the node 8 to 6, then its subtree_max_size is set to 6 and parent
+ * node becomes 4--6.
+ */
+static __always_inline void
+augment_tree_propagate_from(struct vmap_area *va)
+{
+ struct rb_node *node = &va->rb_node;
+ unsigned long new_va_sub_max_size;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ new_va_sub_max_size = compute_subtree_max_size(va);
+
+ /*
+ * If the newly calculated maximum available size of the
+ * subtree is equal to the current one, then it means that
+ * the tree is propagated correctly. So we have to stop at
+ * this point to save cycles.
+ */
+ if (va->subtree_max_size == new_va_sub_max_size)
+ break;
+
+ va->subtree_max_size = new_va_sub_max_size;
+ node = rb_parent(&va->rb_node);
+ }
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+ augment_tree_propagate_check(free_vmap_area_root.rb_node);
+#endif
+}
+
+static void
+insert_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ link = find_va_links(va, root, NULL, &parent);
+ link_va(va, root, parent, link, head);
+}
+
+static void
+insert_vmap_area_augment(struct vmap_area *va,
+ struct rb_node *from, struct rb_root *root,
+ struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ if (from)
+ link = find_va_links(va, NULL, from, &parent);
+ else
+ link = find_va_links(va, root, NULL, &parent);
+
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+}
+
+/*
+ * Merge de-allocated chunk of VA memory with previous
+ * and next free blocks. If coalesce is not done a new
+ * free area is inserted. If VA has been merged, it is
+ * freed.
+ */
+static __always_inline void
+merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct vmap_area *sibling;
+ struct list_head *next;
+ struct rb_node **link;
+ struct rb_node *parent;
+ bool merged = false;
+
+ /*
+ * Find a place in the tree where VA potentially will be
+ * inserted, unless it is merged with its sibling/siblings.
+ */
+ link = find_va_links(va, root, NULL, &parent);
+
+ /*
+ * Get next node of VA to check if merging can be done.
+ */
+ next = get_va_next_sibling(parent, link);
+ if (unlikely(next == NULL))
+ goto insert;
+
+ /*
+ * start end
+ * | |
+ * |<------VA------>|<-----Next----->|
+ * | |
+ * start end
+ */
+ if (next != head) {
+ sibling = list_entry(next, struct vmap_area, list);
+ if (sibling->va_start == va->va_end) {
+ sibling->va_start = va->va_start;
+
+ /* Check and update the tree if needed. */
+ augment_tree_propagate_from(sibling);
+
+ /* Remove this VA, it has been merged. */
+ unlink_va(va, root);
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
+ }
+
+ /*
+ * start end
+ * | |
+ * |<-----Prev----->|<------VA------>|
+ * | |
+ * start end
+ */
+ if (next->prev != head) {
+ sibling = list_entry(next->prev, struct vmap_area, list);
+ if (sibling->va_end == va->va_start) {
+ sibling->va_end = va->va_end;
+
+ /* Check and update the tree if needed. */
+ augment_tree_propagate_from(sibling);
+
+ /* Remove this VA, it has been merged. */
+ unlink_va(va, root);
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+
+ return;
+ }
+ }
+
+insert:
+ if (!merged) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
+}
+
+static __always_inline bool
+is_within_this_va(struct vmap_area *va, unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ unsigned long nva_start_addr;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Can be overflowed due to big size or alignment. */
+ if (nva_start_addr + size < nva_start_addr ||
+ nva_start_addr < vstart)
+ return false;
+
+ return (nva_start_addr + size <= va->va_end);
+}
+
+/*
+ * Find the first free block(lowest start address) in the tree,
+ * that will accomplish the request corresponding to passing
+ * parameters.
+ */
+static __always_inline struct vmap_area *
+find_vmap_lowest_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long length;
+
+ /* Start from the root. */
+ node = free_vmap_area_root.rb_node;
+
+ /* Adjust the search size for alignment overhead. */
+ length = size + align - 1;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+
+ if (get_subtree_max_size(node->rb_left) >= length &&
+ vstart < va->va_start) {
+ node = node->rb_left;
+ } else {
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ /*
+ * Does not make sense to go deeper towards the right
+ * sub-tree if it does not have a free block that is
+ * equal or bigger to the requested search length.
+ */
+ if (get_subtree_max_size(node->rb_right) >= length) {
+ node = node->rb_right;
+ continue;
+ }
+
+ /*
+ * OK. We roll back and find the fist right sub-tree,
+ * that will satisfy the search criteria. It can happen
+ * only once due to "vstart" restriction.
+ */
+ while ((node = rb_parent(node))) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ if (get_subtree_max_size(node->rb_right) >= length &&
+ vstart <= va->va_start) {
+ node = node->rb_right;
+ break;
+ }
+ }
+ }
+ }
+
+ return NULL;
+}
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+#include <linux/random.h>
+
+static struct vmap_area *
+find_vmap_lowest_linear_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ if (!is_within_this_va(va, size, align, vstart))
+ continue;
+
+ return va;
+ }
+
+ return NULL;
+}
+
+static void
+find_vmap_lowest_match_check(unsigned long size)
+{
+ struct vmap_area *va_1, *va_2;
+ unsigned long vstart;
+ unsigned int rnd;
+
+ get_random_bytes(&rnd, sizeof(rnd));
+ vstart = VMALLOC_START + rnd;
+
+ va_1 = find_vmap_lowest_match(size, 1, vstart);
+ va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
+
+ if (va_1 != va_2)
+ pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
+ va_1, va_2, vstart);
+}
+#endif
+
+enum fit_type {
+ NOTHING_FIT = 0,
+ FL_FIT_TYPE = 1, /* full fit */
+ LE_FIT_TYPE = 2, /* left edge fit */
+ RE_FIT_TYPE = 3, /* right edge fit */
+ NE_FIT_TYPE = 4 /* no edge fit */
+};
+
+static __always_inline enum fit_type
+classify_va_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size)
+{
+ enum fit_type type;
+
+ /* Check if it is within VA. */
+ if (nva_start_addr < va->va_start ||
+ nva_start_addr + size > va->va_end)
+ return NOTHING_FIT;
+
+ /* Now classify. */
+ if (va->va_start == nva_start_addr) {
+ if (va->va_end == nva_start_addr + size)
+ type = FL_FIT_TYPE;
+ else
+ type = LE_FIT_TYPE;
+ } else if (va->va_end == nva_start_addr + size) {
+ type = RE_FIT_TYPE;
+ } else {
+ type = NE_FIT_TYPE;
+ }
+
+ return type;
+}
+
+static __always_inline int
+adjust_va_to_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size,
+ enum fit_type type)
+{
+ struct vmap_area *lva;
+
+ if (type == FL_FIT_TYPE) {
+ /*
+ * No need to split VA, it fully fits.
+ *
+ * | |
+ * V NVA V
+ * |---------------|
+ */
+ unlink_va(va, &free_vmap_area_root);
+ kmem_cache_free(vmap_area_cachep, va);
+ } else if (type == LE_FIT_TYPE) {
+ /*
+ * Split left edge of fit VA.
+ *
+ * | |
+ * V NVA V R
+ * |-------|-------|
+ */
+ va->va_start += size;
+ } else if (type == RE_FIT_TYPE) {
+ /*
+ * Split right edge of fit VA.
+ *
+ * | |
+ * L V NVA V
+ * |-------|-------|
+ */
+ va->va_end = nva_start_addr;
+ } else if (type == NE_FIT_TYPE) {
+ /*
+ * Split no edge of fit VA.
+ *
+ * | |
+ * L V NVA V R
+ * |---|-------|---|
+ */
+ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
+ if (unlikely(!lva))
+ return -1;
+
+ /*
+ * Build the remainder.
+ */
+ lva->va_start = va->va_start;
+ lva->va_end = nva_start_addr;
+
+ /*
+ * Shrink this VA to remaining size.
+ */
+ va->va_start = nva_start_addr + size;
+ } else {
+ return -1;
+ }
+
+ if (type != FL_FIT_TYPE) {
+ augment_tree_propagate_from(va);
+
+ if (type == NE_FIT_TYPE)
+ insert_vmap_area_augment(lva, &va->rb_node,
+ &free_vmap_area_root, &free_vmap_area_list);
+ }
+
+ return 0;
+}
+
+/*
+ * Returns a start address of the newly allocated area, if success.
+ * Otherwise a vend is returned that indicates failure.
+ */
+static __always_inline unsigned long
+__alloc_vmap_area(unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend, int node)
+{
+ unsigned long nva_start_addr;
+ struct vmap_area *va;
+ enum fit_type type;
+ int ret;
+
+ va = find_vmap_lowest_match(size, align, vstart);
+ if (unlikely(!va))
+ return vend;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Check the "vend" restriction. */
+ if (nva_start_addr + size > vend)
+ return vend;
+
+ /* Classify what we have found. */
+ type = classify_va_fit_type(va, nva_start_addr, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ return vend;
+
+ /* Update the free vmap_area. */
+ ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+ if (ret)
+ return vend;
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+ find_vmap_lowest_match_check(size);
+#endif
+
+ return nva_start_addr;
+}
/*
* Allocate a region of KVA of the specified size and alignment, within the
@@ -406,18 +1032,19 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int node, gfp_t gfp_mask)
{
struct vmap_area *va;
- struct rb_node *n;
unsigned long addr;
int purged = 0;
- struct vmap_area *first;
BUG_ON(!size);
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
+ if (unlikely(!vmap_initialized))
+ return ERR_PTR(-EBUSY);
+
might_sleep();
- va = kmalloc_node(sizeof(struct vmap_area),
+ va = kmem_cache_alloc_node(vmap_area_cachep,
gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -430,87 +1057,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
retry:
spin_lock(&vmap_area_lock);
- /*
- * Invalidate cache if we have more permissive parameters.
- * cached_hole_size notes the largest hole noticed _below_
- * the vmap_area cached in free_vmap_cache: if size fits
- * into that hole, we want to scan from vstart to reuse
- * the hole instead of allocating above free_vmap_cache.
- * Note that __free_vmap_area may update free_vmap_cache
- * without updating cached_hole_size or cached_align.
- */
- if (!free_vmap_cache ||
- size < cached_hole_size ||
- vstart < cached_vstart ||
- align < cached_align) {
-nocache:
- cached_hole_size = 0;
- free_vmap_cache = NULL;
- }
- /* record if we encounter less permissive parameters */
- cached_vstart = vstart;
- cached_align = align;
-
- /* find starting point for our search */
- if (free_vmap_cache) {
- first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- addr = ALIGN(first->va_end, align);
- if (addr < vstart)
- goto nocache;
- if (addr + size < addr)
- goto overflow;
-
- } else {
- addr = ALIGN(vstart, align);
- if (addr + size < addr)
- goto overflow;
-
- n = vmap_area_root.rb_node;
- first = NULL;
-
- while (n) {
- struct vmap_area *tmp;
- tmp = rb_entry(n, struct vmap_area, rb_node);
- if (tmp->va_end >= addr) {
- first = tmp;
- if (tmp->va_start <= addr)
- break;
- n = n->rb_left;
- } else
- n = n->rb_right;
- }
-
- if (!first)
- goto found;
- }
-
- /* from the starting point, walk areas until a suitable hole is found */
- while (addr + size > first->va_start && addr + size <= vend) {
- if (addr + cached_hole_size < first->va_start)
- cached_hole_size = first->va_start - addr;
- addr = ALIGN(first->va_end, align);
- if (addr + size < addr)
- goto overflow;
-
- if (list_is_last(&first->list, &vmap_area_list))
- goto found;
- first = list_next_entry(first, list);
- }
-
-found:
/*
- * Check also calculated address against the vstart,
- * because it can be 0 because of big align request.
+ * If an allocation fails, the "vend" address is
+ * returned. Therefore trigger the overflow path.
*/
- if (addr + size > vend || addr < vstart)
+ addr = __alloc_vmap_area(size, align, vstart, vend, node);
+ if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->flags = 0;
- __insert_vmap_area(va);
- free_vmap_cache = &va->rb_node;
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -539,7 +1099,8 @@ overflow:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
size);
- kfree(va);
+
+ kmem_cache_free(vmap_area_cachep, va);
return ERR_PTR(-EBUSY);
}
@@ -559,35 +1120,16 @@ static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
- if (free_vmap_cache) {
- if (va->va_end < cached_vstart) {
- free_vmap_cache = NULL;
- } else {
- struct vmap_area *cache;
- cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- if (va->va_start <= cache->va_start) {
- free_vmap_cache = rb_prev(&va->rb_node);
- /*
- * We don't try to update cached_hole_size or
- * cached_align, but it won't go very wrong.
- */
- }
- }
- }
- rb_erase(&va->rb_node, &vmap_area_root);
- RB_CLEAR_NODE(&va->rb_node);
- list_del_rcu(&va->list);
-
/*
- * Track the highest possible candidate for pcpu area
- * allocation. Areas outside of vmalloc area can be returned
- * here too, consider only end addresses which fall inside
- * vmalloc area proper.
+ * Remove from the busy tree/list.
*/
- if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
- vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
+ unlink_va(va, &vmap_area_root);
- kfree_rcu(va, rcu_head);
+ /*
+ * Merge VA with its neighbors, otherwise just add it.
+ */
+ merge_or_add_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
}
/*
@@ -794,8 +1336,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)