summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-03-13 17:08:30 +0100
committerIngo Molnar <mingo@elte.hu>2009-03-13 17:08:30 +0100
commit063402356280a7b262952d6351d21315336f657b (patch)
treee6fae7db15d0f4cbb9b9b7aa2b5bc176b4bf60f5 /mm
parentf9a36fa5413f1b2694841c410a6fdb4666e78f16 (diff)
parenta98fe7f3425c6b4e90de16f8da63b0429a8fed08 (diff)
Merge branch 'x86/core' into x86/kconfig
Diffstat (limited to 'mm')
-rw-r--r--mm/Makefile4
-rw-r--r--mm/allocpercpu.c32
-rw-r--r--mm/bootmem.c35
-rw-r--r--mm/filemap.c7
-rw-r--r--mm/percpu.c1226
-rw-r--r--mm/vmalloc.c97
6 files changed, 1375 insertions, 26 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 72255be57f89..818569b68f46 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
+ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+obj-$(CONFIG_SMP) += percpu.o
+else
obj-$(CONFIG_SMP) += allocpercpu.o
+endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 4297bc41bfd2..3653c570232b 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
/**
- * percpu_alloc_mask - initial setup of per-cpu data
+ * alloc_percpu - initial setup of per-cpu data
* @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-data for cpu's selected through mask bits
+ * @align: alignment
*
- * Populating per-cpu data for all online cpu's would be a typical use case,
- * which is simplified by the percpu_alloc() wrapper.
- * Per-cpu objects are populated with zeroed buffers.
+ * Allocate dynamic percpu area. Percpu objects are populated with
+ * zeroed buffers.
*/
-void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+void *__alloc_percpu(size_t size, size_t align)
{
/*
* We allocate whole cache lines to avoid false sharing
*/
size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
- void *pdata = kzalloc(sz, gfp);
+ void *pdata = kzalloc(sz, GFP_KERNEL);
void *__pdata = __percpu_disguise(pdata);
+ /*
+ * Can't easily make larger alignment work with kmalloc. WARN
+ * on it. Larger alignment should only be used for module
+ * percpu sections on SMP for which this path isn't used.
+ */
+ WARN_ON_ONCE(align > __alignof__(unsigned long long));
+
if (unlikely(!pdata))
return NULL;
- if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
+ if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
+ &cpu_possible_map)))
return __pdata;
kfree(pdata);
return NULL;
}
-EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
+EXPORT_SYMBOL_GPL(__alloc_percpu);
/**
- * percpu_free - final cleanup of per-cpu data
+ * free_percpu - final cleanup of per-cpu data
* @__pdata: object to clean up
*
* We simply clean up any per-cpu object left. No need for the client to
* track and specify through a bis mask which per-cpu objects are to free.
*/
-void percpu_free(void *__pdata)
+void free_percpu(void *__pdata)
{
if (unlikely(!__pdata))
return;
__percpu_depopulate_mask(__pdata, &cpu_possible_map);
kfree(__percpu_disguise(__pdata));
}
-EXPORT_SYMBOL_GPL(percpu_free);
+EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 51a0ccf61e0e..daf92713f7de 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -382,7 +382,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
}
-#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
/**
* reserve_bootmem - mark a page range as usable
* @addr: starting address of the range
@@ -403,7 +402,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
return mark_bootmem(start, end, 1, flags);
}
-#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
unsigned long step)
@@ -429,8 +427,8 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
}
static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
- unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
{
unsigned long fallback = 0;
unsigned long min, max, start, sidx, midx, step;
@@ -530,17 +528,34 @@ find_block:
return NULL;
}
+static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
+ unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+#ifdef CONFIG_HAVE_ARCH_BOOTMEM
+ bootmem_data_t *p_bdata;
+
+ p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit);
+ if (p_bdata)
+ return alloc_bootmem_core(p_bdata, size, align, goal, limit);
+#endif
+ return NULL;
+}
+
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
unsigned long goal,
unsigned long limit)
{
bootmem_data_t *bdata;
+ void *region;
restart:
- list_for_each_entry(bdata, &bdata_list, list) {
- void *region;
+ region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
+ if (region)
+ return region;
+ list_for_each_entry(bdata, &bdata_list, list) {
if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
continue;
if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
@@ -618,6 +633,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
{
void *ptr;
+ ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit);
+ if (ptr)
+ return ptr;
+
ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
if (ptr)
return ptr;
@@ -674,6 +693,10 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
{
void *ptr;
+ ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
+ if (ptr)
+ return ptr;
+
ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
diff --git a/mm/filemap.c b/mm/filemap.c
index 23acefe51808..126d3973b3d1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1823,7 +1823,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
int copy = min(bytes, iov->iov_len - base);
base = 0;
- left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
+ left = __copy_from_user_inatomic(vaddr, buf, copy);
copied += copy;
bytes -= copy;
vaddr += copy;
@@ -1851,8 +1851,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
if (likely(i->nr_segs == 1)) {
int left;
char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user_inatomic_nocache(kaddr + offset,
- buf, bytes);
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
copied = bytes - left;
} else {
copied = __iovec_copy_from_user_inatomic(kaddr + offset,
@@ -1880,7 +1879,7 @@ size_t iov_iter_copy_from_user(struct page *page,
if (likely(i->nr_segs == 1)) {
int left;
char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
+ left = __copy_from_user(kaddr + offset, buf, bytes);
copied = bytes - left;
} else {
copied = __iovec_copy_from_user_inatomic(kaddr + offset,
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 000000000000..bfe6a3afaf45
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,1226 @@
+/*
+ * linux/mm/percpu.c - percpu memory allocator
+ *
+ * Copyright (C) 2009 SUSE Linux Products GmbH
+ * Copyright (C) 2009 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This is percpu allocator which can handle both static and dynamic
+ * areas. Percpu areas are allocated in chunks in vmalloc area. Each
+ * chunk is consisted of num_possible_cpus() units and the first chunk
+ * is used for static percpu variables in the kernel image (special
+ * boot time alloc/init handling necessary as these areas need to be
+ * brought up before allocation services are running). Unit grows as
+ * necessary and all units grow or shrink in unison. When a chunk is
+ * filled up, another chunk is allocated. ie. in vmalloc area
+ *
+ * c0 c1 c2
+ * ------------------- ------------------- ------------
+ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
+ * ------------------- ...... ------------------- .... ------------
+ *
+ * Allocation is done in offset-size areas of single unit space. Ie,
+ * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
+ * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
+ * percpu base registers UNIT_SIZE apart.
+ *
+ * There are usually many small percpu allocations many of them as
+ * small as 4 bytes. The allocator organizes chunks into lists
+ * according to free size and tries to allocate from the fullest one.
+ * Each chunk keeps the maximum contiguous area size hint which is
+ * guaranteed to be eqaul to or larger than the maximum contiguous
+ * area in the chunk. This helps the allocator not to iterate the
+ * chunk maps unnecessarily.
+ *
+ * Allocation state in each chunk is kept using an array of integers
+ * on chunk->map. A positive value in the map represents a free
+ * region and negative allocated. Allocation inside a chunk is done
+ * by scanning this map sequentially and serving the first matching
+ * entry. This is mostly copied from the percpu_modalloc() allocator.
+ * Chunks are also linked into a rb tree to ease address to chunk
+ * mapping during free.
+ *
+ * To use this allocator, arch code should do the followings.
+ *
+ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ *
+ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
+ * regular address to percpu pointer and back
+ *
+ * - use pcpu_setup_first_chunk() during percpu area initialization to
+ * setup the first chunk containing the kernel static percpu area
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bootmem.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/pfn.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/workqueue.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
+#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
+
+struct pcpu_chunk {
+ struct list_head list; /* linked to pcpu_slot lists */
+ struct rb_node rb_node; /* key is chunk->vm->addr */
+ int free_size; /* free bytes in the chunk */
+ int contig_hint; /* max contiguous size hint */
+ struct vm_struct *vm; /* mapped vmalloc region */
+ int map_used; /* # of map entries used */
+ int map_alloc; /* # of map entries allocated */
+ int *map; /* allocation map */
+ bool immutable; /* no [de]population allowed */
+ struct page **page; /* points to page array */
+ struct page *page_ar[]; /* #cpus * UNIT_PAGES */
+};
+
+static int pcpu_unit_pages __read_mostly;
+static int pcpu_unit_size __read_mostly;
+static int pcpu_chunk_size __read_mostly;
+static int pcpu_nr_slots __read_mostly;
+static size_t pcpu_chunk_struct_size __read_mostly;
+
+/* the address of the first chunk which starts with the kernel static area */
+void *pcpu_base_addr __read_mostly;
+EXPORT_SYMBOL_GPL(pcpu_base_addr);
+
+/* optional reserved chunk, only accessible for reserved allocations */
+static struct pcpu_chunk *pcpu_reserved_chunk;
+/* offset limit of the reserved chunk */
+static int pcpu_reserved_chunk_limit;
+
+/*
+ * Synchronization rules.
+ *
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
+ *
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary. All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context. When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks. Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
+ */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
+
+static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
+
+/* reclaim work to release fully free chunks, scheduled from free path */
+static void pcpu_reclaim(struct work_struct *work);
+static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+
+static int __pcpu_size_to_slot(int size)
+{
+ int highbit = fls(size); /* size is in bytes */
+ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
+}
+
+static int pcpu_size_to_slot(int size)
+{
+ if (size == pcpu_unit_size)
+ return pcpu_nr_slots - 1;
+ return __pcpu_size_to_slot(size);
+}
+
+static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
+{
+ if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+ return 0;
+
+ return pcpu_size_to_slot(chunk->free_size);
+}
+
+static int pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+ return cpu * pcpu_unit_pages + page_idx;
+}
+
+static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
+{
+ return &chunk->page[pcpu_page_idx(cpu, page_idx)];
+}
+
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
+{
+ return (unsigned long)chunk->vm->addr +
+ (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
+}
+
+static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
+ int page_idx)
+{
+ return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+}
+
+/**
+ * pcpu_mem_alloc - allocate memory
+ * @size: bytes to allocate
+ *
+ * Allocate @size bytes. If @size is smaller than PAGE_SIZE,
+ * kzalloc() is used; otherwise, vmalloc() is used. The returned
+ * memory is always zeroed.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void *pcpu_mem_alloc(size_t size)
+{
+ if (size <= PAGE_SIZE)
+ return kzalloc(size, GFP_KERNEL);
+ else {
+ void *ptr = vmalloc(size);
+ if (ptr)
+ memset(ptr, 0, size);
+ return ptr;
+ }
+}
+
+/**
+ * pcpu_mem_free - free memory
+ * @ptr: memory to free
+ * @size: size of the area
+ *
+ * Free @ptr. @ptr should have been allocated using pcpu_mem_alloc().
+ */
+static void pcpu_mem_free(void *ptr, size_t size)
+{
+ if (size <= PAGE_SIZE)
+ kfree(ptr);
+ else
+ vfree(ptr);
+}
+
+/**
+ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
+ * @chunk: chunk of interest
+ * @oslot: the previous slot it was on
+ *
+ * This function is called after an allocation or free changed @chunk.
+ * New slot according to the changed state is determined and @chunk is
+ * moved to the slot. Note that the reserved chunk is never put on
+ * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
+{
+ int nslot = pcpu_chunk_slot(chunk);
+
+ if (chunk != pcpu_reserved_chunk && oslot != nslot) {
+ if (oslot < nslot)
+ list_move(&chunk->list, &pcpu_slot[nslot]);
+ else
+ list_move_tail(&chunk->list, &pcpu_slot[nslot]);
+ }
+}
+
+static struct rb_node **pcpu_chunk_rb_search(void *addr,
+ struct rb_node **parentp)
+{
+ struct rb_node **p = &pcpu_addr_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct pcpu_chunk *chunk;
+
+ while (*p) {
+ parent = *p;
+ chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
+
+ if (addr < chunk->vm->addr)
+ p = &(*p)->rb_left;
+ else if (addr > chunk->vm->addr)
+ p = &(*p)->rb_right;
+ else
+ break;
+ }
+
+ if (parentp)
+ *parentp = parent;
+ return p;
+}
+
+/**
+ * pcpu_chunk_addr_search - search for chunk containing specified address
+ * @addr: address to search for
+ *
+ * Look for chunk which might contain @addr. More specifically, it
+ * searchs for the chunk with the highest start address which isn't
+ * beyond @addr.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ *
+ * RETURNS:
+ * The address of the found chunk.
+ */
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
+{
+ struct rb_node *n, *parent;
+ struct pcpu_chunk *chunk;
+
+ /* is it in the reserved chunk? */
+ if (pcpu_reserved_chunk) {
+ void *start = pcpu_reserved_chunk->vm->addr;
+
+ if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+ return pcpu_reserved_chunk;
+ }
+
+ /* nah... search the regular ones */
+ n = *pcpu_chunk_rb_search(addr, &parent);
+ if (!n) {
+ /* no exactly matching chunk, the parent is the closest */
+ n = parent;
+ BUG_ON(!n);
+ }
+ chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+
+ if (addr < chunk->vm->addr) {
+ /* the parent was the next one, look for the previous one */
+ n = rb_prev(n);
+ BUG_ON(!n);
+ chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+ }
+
+ return chunk;
+}
+
+/**
+ * pcpu_chunk_addr_insert - insert chunk into address rb tree
+ * @new: chunk to insert
+ *
+ * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
+{
+ struct rb_node **p, *parent;
+
+ p = pcpu_chunk_rb_search(new->vm->addr, &parent);
+ BUG_ON(*p);
+ rb_link_node(&new->rb_node, parent, p);
+ rb_insert_color(&new->rb_node, &pcpu_addr_root);
+}
+
+/**
+ * pcpu_extend_area_map - extend area map for allocation
+ * @chunk: target chunk
+ *
+ * Extend area map of @chunk so that it can accomodate an allocation.
+ * A single allocation can split an area into three areas, so this
+ * function makes sure that @chunk->map has at least two extra slots.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
+ * RETURNS:
+ * 0 if noop, 1 if successfully extended, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
+{
+ int new_alloc;
+ int *new;
+ size_t size;
+
+ /* has enough? */
+ if (chunk->map_alloc >= chunk->map_used + 2)
+ return 0;
+
+ spin_unlock_irq(&pcpu_lock);
+
+ new_alloc = PCPU_DFL_MAP_ALLOC;
+ while (new_alloc < chunk->map_used + 2)
+ new_alloc *= 2;
+
+ new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
+ if (!new) {
+ spin_lock_irq(&pcpu_lock);
+ return -ENOMEM;
+ }
+
+ /*
+ * Acquire pcpu_lock and switch to new area map. Only free
+ * could have happened inbetween, so map_used couldn't have
+ * grown.
+ */
+ spin_lock_irq(&pcpu_lock);
+ BUG_ON(new_alloc < chunk->map_used + 2);
+
+ size = chunk->map_alloc * sizeof(chunk->map[0]);
+ memcpy(new, chunk->map, size);
+
+ /*
+ * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
+ * one of the first chunks and still using static map.
+ */
+ if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
+ pcpu_mem_free(chunk->map, size);
+
+ chunk->map_alloc = new_alloc;
+ chunk->map = new;
+ return 0;
+}
+
+/**
+ * pcpu_split_block - split a map block
+ * @chunk: chunk of interest
+ * @i: index of map block to split
+ * @head: head size in bytes (can be 0)
+ * @tail: tail size in bytes (can be 0)
+ *
+ * Split the @i'th map block into two or three blocks. If @head is
+ * non-zero, @head bytes block is inserted before block @i moving it
+ * to @i+1 and reducing its size by @head bytes.
+ *
+ * If @tail is non-zero, the target block, which can be @i or @i+1
+ * depending on @head, is reduced by @tail bytes and @tail byte block
+ * is inserted after the target block.
+ *
+ * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
+ int head, int tail)
+{
+ int nr_extra = !!head + !!tail;
+
+ BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
+
+ /* insert new subblocks */
+ memmove(&chunk->map[i + nr_extra], &chunk->map[i],
+ sizeof(chunk->map[0]) * (chunk->map_used - i));
+ chunk->map_used += nr_extra;
+
+ if (head) {
+ chunk->map[i + 1] = chunk->map[i] - head;
+ chunk->map[i++] = head;
+ }
+ if (tail) {
+ chunk->map[i++] -= tail;
+ chunk->map[i] = tail;
+ }
+}
+
+/**
+ * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @size: wanted size in bytes
+ * @align: wanted align
+ *
+ * Try to allocate @size bytes area aligned at @align from @chunk.
+ * Note that this function only allocates the offset. It doesn't
+ * populate or map the area.
+ *
+ * @chunk->map must have at least two free slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ *
+ * RETURNS:
+ * Allocated offset in @chunk on success, -1 if no matching area is
+ * found.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+{
+ int oslot = pcpu_chunk_slot(chunk);
+ int max_contig = 0;
+ int i, off;
+
+ for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
+ bool is_last = i + 1 == chunk->map_used;
+ int head, tail;
+
+ /* extra for alignment requirement */
+ head = ALIGN(off, align) - off;
+ BUG_ON(i == 0 && head != 0);
+
+ if (chunk->map[i] < 0)
+ continue;
+ if (chunk->map[i] < head + size) {
+ max_contig = max(chunk->map[i], max_contig);
+ continue;
+ }
+
+ /*
+ * If head is small or the previous block is free,
+ * merge'em. Note that 'small' is defined as smaller
+ * than sizeof(int), which is very small but isn't too
+ * uncommon for percpu allocations.
+ */
+ if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
+ if (chunk->map[i - 1] > 0)
+ chunk->map[i - 1] += head;
+ else {
+ chunk->map[i - 1] -= head;
+ chunk->free_size -= head;
+ }
+ chunk->map[i] -= head;
+ off += head;
+ head = 0;
+ }
+
+ /* if tail is small, just keep it around */
+ tail = chunk->map[i] - head - size;
+ if (tail < sizeof(int))
+ tail = 0;
+
+ /* split if warranted */
+ if (head || tail) {
+ pcpu_split_block(chunk, i, head, tail);
+ if (head) {
+ i++;
+ off += head;
+ max_contig = max(chunk->map[i - 1], max_contig);
+ }
+ if (tail)
+ max_contig = max(chunk->map[i + 1], max_contig);
+ }
+
+ /* update hint and mark allocated */
+ if (is_last)
+ chunk->contig_hint = max_contig; /* fully scanned */
+ else
+ chunk->contig_hint = max(chunk->contig_hint,
+ max_contig);
+
+ chunk->free_size -= chunk->map[i];
+ chunk->map[i] = -chunk->map[i];
+
+ pcpu_chunk_relocate(chunk, oslot);
+ return off;
+ }
+
+ chunk->contig_hint = max_contig; /* fully scanned */
+ pcpu_chunk_relocate(chunk, oslot);
+
+ /* tell the upper layer that this chunk has no matching area */
+ return -1;
+}
+
+/**
+ * pcpu_free_area - free area to a pcpu_chunk
+ * @chunk: chunk of interest
+ * @freeme: offset of area to free
+ *
+ * Free area starting from @freeme to @chunk. Note that this function
+ * only modifies the allocation map. It doesn't depopulate or unmap
+ * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+{
+ int oslot = pcpu_chunk_slot(chunk);
+ int i, off;
+
+ for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
+ if (off == freeme)
+ break;
+ BUG_ON(off != freeme);
+ BUG_ON(chunk->map[i] > 0);
+
+ chunk->map[i] = -chunk->map[i];
+ chunk->free_size += chunk->map[i];
+
+ /* merge with previous? */
+ if (i > 0 && chunk->map[i - 1] >= 0) {
+ chunk->map[i - 1] += chunk->map[i];
+ chunk->map_used--;
+ memmove(&chunk->map[i], &chunk->map[i + 1],
+ (chunk->map_used - i) * sizeof(chunk->map[0]));
+ i--;
+ }
+ /* merge with next? */
+ if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
+ chunk->map[i] += chunk->map[i + 1];
+ chunk->map_used--;
+ memmove(&chunk->map[i + 1], &chunk->map[i + 2],
+ (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
+ }
+
+ chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
+ pcpu_chunk_relocate(chunk, oslot);
+}
+
+/**
+ * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * If @flush is true, vcache is flushed before unmapping and tlb
+ * after.
+ */
+static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
+ bool flush)
+{
+ unsigned int last = num_possible_cpus() - 1;
+ unsigned int cpu;
+
+ /* unmap must not be done on immutable chunk */
+ WARN_ON(chunk->immutable);
+
+ /*
+ * Each flushing trial can be very expensive, issue flush on
+ * the whole region at once rather than doing it for each cpu.
+ * This could be an overkill but is more scalable.
+ */
+ if (flush)
+ flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
+ pcpu_chunk_addr(chunk, last, page_end));
+
+ for_each_possible_cpu(cpu)
+ unmap_kernel_range_noflush(
+ pcpu_chunk_addr(chunk, cpu, page_start),
+ (page_end - page_start) << PAGE_SHIFT);
+
+ /* ditto as flush_cache_vunmap() */
+ if (flush)
+ flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
+ pcpu_chunk_addr(chunk, last, page_end));
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk. If @flush is true, vcache is flushed before unmapping
+ * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
+ bool flush)
+{
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ int unmap_start = -1;
+ int uninitialized_var(unmap_end);
+ unsigned int cpu;
+ int i;
+
+ for (i = page_start; i < page_end; i++) {
+ for_each_possible_cpu(cpu) {
+ struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+
+ if (!*pagep)
+ continue;
+
+ __free_page(*pagep);
+
+ /*
+ * If it's partial depopulation, it might get
+ * populated or depopulated again. Mark the
+ * page gone.
+ */
+ *pagep = NULL;
+
+ unmap_start = unmap_start < 0 ? i : unmap_start;
+ unmap_end = i + 1;
+ }
+ }
+
+ if (unmap_start >= 0)
+ pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+}
+
+/**
+ * pcpu_map - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk.
+ * vcache is flushed afterwards.
+ */
+static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+{
+ unsigned int last = num_possible_cpus() - 1;
+ unsigned int cpu;
+ int err;
+
+ /* map must not be done on immutable chunk */
+ WARN_ON(chunk->immutable);
+
+ for_each_possible_cpu(cpu) {
+ err = map_kernel_range_noflush(
+ pcpu_chunk_addr(chunk, cpu, page_start),
+ (page_end - page_start) << PAGE_SHIFT,
+ PAGE_KERNEL,
+ pcpu_chunk_pagep(chunk, cpu, page_start));
+ if (err < 0)
+ return err;
+ }
+
+ /* flush at once, please read comments in pcpu_unmap() */
+ flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
+ pcpu_chunk_addr(chunk, last, page_end));
+ return 0;
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate in bytes
+ *
+ * For each cpu, populate and map pages [@page_start,@page_end) into
+ * @chunk. The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+ const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+ int page_start = PFN_DOWN(off);
+ int page_end = PFN_UP(off + size);
+ int map_start = -1;
+ int uninitialized_var(map_end);
+ unsigned int cpu;
+ int i;
+
+ for (i = page_start; i < page_end; i++) {
+ if (pcpu_chunk_page_occupied(chunk, i)) {
+ if (map_start >= 0) {
+ if (pcpu_map(chunk, map_start, map_end))
+ goto err;
+ map_start = -1;
+ }
+ continue;
+ }
+
+ map_start = map_start < 0 ? i : map_start;
+ map_end = i + 1;
+
+ for_each_possible_cpu(cpu) {
+ struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+
+ *pagep = alloc_pages_node(cpu_to_node(cpu),
+ alloc_mask, 0);
+ if (!*pagep)
+ goto err;
+ }
+ }
+
+ if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
+ goto err;
+
+ for_each_possible_cpu(cpu)
+ memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
+ size);
+
+ return 0;
+err:
+ /* likely under heavy memory pressure, give memory back */
+ pcpu_depopulate_chunk(chunk, off, size, true);
+ return -ENOMEM;
+}
+
+static void free_pcpu_chunk(struct pcpu_chunk *chunk)
+{
+ if (!chunk)
+ return;
+ if (chunk->vm)
+ free_vm_area(chunk->vm);
+ pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
+ kfree(chunk);
+}
+
+static struct pcpu_chunk *alloc_pcpu_chunk(void)
+{
+ struct pcpu_chunk *chunk;
+
+ chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
+ if (!chunk)
+ return NULL;
+
+ chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+ chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+ chunk->map[chunk->map_used++] = pcpu_unit_size;
+ chunk->page = chunk->page_ar;
+
+ chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+ if (!chunk->vm) {
+ free_pcpu_chunk(chunk);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&chunk->list);
+ chunk->free_size = pcpu_unit_size;
+ chunk->contig_hint = pcpu_unit_size;
+
+ return chunk;
+}
+
+/**
+ * pcpu_alloc - the percpu allocator
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ * @reserved: allocate from the reserved chunk if available
+ *
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+static void *pcpu_alloc(size_t size, size_t align, bool reserved)
+{
+ struct pcpu_chunk *chunk;
+ int slot, off;
+
+ if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
+ WARN(true, "illegal size (%zu) or align (%zu) for "
+ "percpu allocation\n", size, align);
+ return NULL;
+ }
+
+ mutex_lock(&pcpu_alloc_mutex);
+ spin_lock_irq(&pcpu_lock);
+
+ /* serve reserved allocations from the reserved chunk if available */
+ if (reserved && pcpu_reserved_chunk) {
+ chunk = pcpu_reserved_chunk;
+ if (size > chunk->contig_hint ||
+ pcpu_extend_area_map(chunk) < 0)
+ goto fail_unlock;
+ off = pcpu_alloc_area(chunk, size, align);
+ if (off >= 0)
+ goto area_found;
+ goto fail_unlock;
+ }
+
+restart:
+ /* search through normal chunks */
+ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+ list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+ if (size > chunk->contig_hint)
+ continue;
+
+ switch (pcpu_extend_area_map(chunk)) {
+ case 0:
+ break;
+ case 1:
+ goto restart; /* pcpu_lock dropped, restart */
+ default:
+ goto fail_unlock;
+ }
+
+ off = pcpu_alloc_area(chunk, size, align);
+ if (off >= 0)
+ goto area_found;
+ }