summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-09-06 21:33:12 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-09-06 21:33:12 -0700
commita7cbfd05f427f8f1164bc53866971e89a0cbe103 (patch)
tree05d68a80d4d6635800a53e60e2638ebebc7a5761 /mm
parentd34fc1adf01ff87026da85fb972dc259dc347540 (diff)
parent5e81ee3e6a79cc9fa85af5c3db0f1f269709bbf1 (diff)
Merge branch 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
Pull percpu updates from Tejun Heo: "A lot of changes for percpu this time around. percpu inherited the same area allocator from the original pre-virtual-address-mapped implementation. This was from the time when percpu allocator wasn't used all that much and the implementation was focused on simplicity, with the unfortunate computational complexity of O(number of areas allocated from the chunk) per alloc / free. With the increase in percpu usage, we're hitting cases where the lack of scalability is hurting. The most prominent one right now is bpf perpcu map creation / destruction which may allocate and free a lot of entries consecutively and it's likely that the problem will become more prominent in the future. To address the issue, Dennis replaced the area allocator with hinted bitmap allocator which is more consistent. While the new allocator does perform a bit worse in some cases, it outperforms the old allocator way more than an order of magnitude in other more common scenarios while staying mostly flat in CPU overhead and completely flat in memory consumption" * 'for-4.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (27 commits) percpu: update header to contain bitmap allocator explanation. percpu: update pcpu_find_block_fit to use an iterator percpu: use metadata blocks to update the chunk contig hint percpu: update free path to take advantage of contig hints percpu: update alloc path to only scan if contig hints are broken percpu: keep track of the best offset for contig hints percpu: skip chunks if the alloc does not fit in the contig hint percpu: add first_bit to keep track of the first free in the bitmap percpu: introduce bitmap metadata blocks percpu: replace area map allocator with bitmap percpu: generalize bitmap (un)populated iterators percpu: increase minimum percpu allocation size and align first regions percpu: introduce nr_empty_pop_pages to help empty page accounting percpu: change the number of pages marked in the first_chunk pop bitmap percpu: combine percpu address checks percpu: modify base_addr to be region specific percpu: setup_first_chunk rename schunk/dchunk to chunk percpu: end chunk area maps page aligned for the populated bitmap percpu: unify allocation of schunk and dchunk percpu: setup_first_chunk remove dyn_size and consolidate logic ...
Diffstat (limited to 'mm')
-rw-r--r--mm/percpu-internal.h82
-rw-r--r--mm/percpu-km.c2
-rw-r--r--mm/percpu-stats.c111
-rw-r--r--mm/percpu.c1522
4 files changed, 1093 insertions, 624 deletions
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index cd2442e13d8f..7065faf74b46 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -4,6 +4,22 @@
#include <linux/types.h>
#include <linux/percpu.h>
+/*
+ * pcpu_block_md is the metadata block struct.
+ * Each chunk's bitmap is split into a number of full blocks.
+ * All units are in terms of bits.
+ */
+struct pcpu_block_md {
+ int contig_hint; /* contig hint for block */
+ int contig_hint_start; /* block relative starting
+ position of the contig hint */
+ int left_free; /* size of free space along
+ the left side of the block */
+ int right_free; /* size of free space along
+ the right side of the block */
+ int first_free; /* block position of first free */
+};
+
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
int nr_alloc; /* # of allocations */
@@ -11,24 +27,29 @@ struct pcpu_chunk {
#endif
struct list_head list; /* linked to pcpu_slot lists */
- int free_size; /* free bytes in the chunk */
- int contig_hint; /* max contiguous size hint */
+ int free_bytes; /* free bytes in the chunk */
+ int contig_bits; /* max contiguous size hint */
+ int contig_bits_start; /* contig_bits starting
+ offset */
void *base_addr; /* base address of this chunk */
- int map_used; /* # of map entries used before the sentry */
- int map_alloc; /* # of map entries allocated */
- int *map; /* allocation map */
- struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
+ unsigned long *alloc_map; /* allocation map */
+ unsigned long *bound_map; /* boundary map */
+ struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
- int first_free; /* no free below this */
+ int first_bit; /* no free below this */
bool immutable; /* no [de]population allowed */
- bool has_reserved; /* Indicates if chunk has reserved space
- at the beginning. Reserved chunk will
- contain reservation for static chunk.
- Dynamic chunk will contain reservation
- for static and reserved chunks. */
+ int start_offset; /* the overlap with the previous
+ region to have a page aligned
+ base_addr */
+ int end_offset; /* additional area required to
+ have the region end page
+ aligned */
+
+ int nr_pages; /* # of pages served by this chunk */
int nr_populated; /* # of populated pages */
+ int nr_empty_pop_pages; /* # of empty populated pages */
unsigned long populated[]; /* populated bitmap */
};
@@ -36,10 +57,47 @@ extern spinlock_t pcpu_lock;
extern struct list_head *pcpu_slot;
extern int pcpu_nr_slots;
+extern int pcpu_nr_empty_pop_pages;
extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;
+/**
+ * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bitmap blocks used.
+ */
+static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
+{
+ return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
+}
+
+/**
+ * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
+ * @pages: number of physical pages
+ *
+ * This conversion is from physical pages to the number of bits
+ * required in the bitmap.
+ */
+static inline int pcpu_nr_pages_to_map_bits(int pages)
+{
+ return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+}
+
+/**
+ * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bits in the bitmap.
+ */
+static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
+{
+ return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
+}
+
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index eb58aa4c0997..d2a76642c4ae 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
spin_lock_irq(&pcpu_lock);
- pcpu_chunk_populated(chunk, 0, nr_pages);
+ pcpu_chunk_populated(chunk, 0, nr_pages, false);
spin_unlock_irq(&pcpu_lock);
pcpu_stats_chunk_alloc();
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 03524a56eeff..6142484e88f7 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -18,7 +18,7 @@
#include "percpu-internal.h"
#define P(X, Y) \
- seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y)
+ seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y)
struct percpu_stats pcpu_stats;
struct pcpu_alloc_info pcpu_stats_ai;
@@ -29,64 +29,85 @@ static int cmpint(const void *a, const void *b)
}
/*
- * Iterates over all chunks to find the max # of map entries used.
+ * Iterates over all chunks to find the max nr_alloc entries.
*/
-static int find_max_map_used(void)
+static int find_max_nr_alloc(void)
{
struct pcpu_chunk *chunk;
- int slot, max_map_used;
+ int slot, max_nr_alloc;
- max_map_used = 0;
+ max_nr_alloc = 0;
for (slot = 0; slot < pcpu_nr_slots; slot++)
list_for_each_entry(chunk, &pcpu_slot[slot], list)
- max_map_used = max(max_map_used, chunk->map_used);
+ max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
- return max_map_used;
+ return max_nr_alloc;
}
/*
* Prints out chunk state. Fragmentation is considered between
* the beginning of the chunk to the last allocation.
+ *
+ * All statistics are in bytes unless stated otherwise.
*/
static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
- void *buffer)
+ int *buffer)
{
- int i, s_index, last_alloc, alloc_sign, as_len;
+ int i, last_alloc, as_len, start, end;
int *alloc_sizes, *p;
/* statistics */
int sum_frag = 0, max_frag = 0;
int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
alloc_sizes = buffer;
- s_index = chunk->has_reserved ? 1 : 0;
-
- /* find last allocation */
- last_alloc = -1;
- for (i = chunk->map_used - 1; i >= s_index; i--) {
- if (chunk->map[i] & 1) {
- last_alloc = i;
- break;
- }
- }
- /* if the chunk is not empty - ignoring reserve */
- if (last_alloc >= s_index) {
- as_len = last_alloc + 1 - s_index;
-
- /*
- * Iterate through chunk map computing size info.
- * The first bit is overloaded to be a used flag.
- * negative = free space, positive = allocated
- */
- for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) {
- alloc_sign = (*p & 1) ? 1 : -1;
- alloc_sizes[i] = alloc_sign *
- ((p[1] & ~1) - (p[0] & ~1));
+ /*
+ * find_last_bit returns the start value if nothing found.
+ * Therefore, we must determine if it is a failure of find_last_bit
+ * and set the appropriate value.
+ */
+ last_alloc = find_last_bit(chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk) -
+ chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1);
+ last_alloc = test_bit(last_alloc, chunk->alloc_map) ?
+ last_alloc + 1 : 0;
+
+ as_len = 0;
+ start = chunk->start_offset;
+
+ /*
+ * If a bit is set in the allocation map, the bound_map identifies
+ * where the allocation ends. If the allocation is not set, the
+ * bound_map does not identify free areas as it is only kept accurate
+ * on allocation, not free.
+ *
+ * Positive values are allocations and negative values are free
+ * fragments.
+ */
+ while (start < last_alloc) {
+ if (test_bit(start, chunk->alloc_map)) {
+ end = find_next_bit(chunk->bound_map, last_alloc,
+ start + 1);
+ alloc_sizes[as_len] = 1;
+ } else {
+ end = find_next_bit(chunk->alloc_map, last_alloc,
+ start + 1);
+ alloc_sizes[as_len] = -1;
}
- sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL);
+ alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE;
+
+ start = end;
+ }
+
+ /*
+ * The negative values are free fragments and thus sorting gives the
+ * free fragments at the beginning in largest first order.
+ */
+ if (as_len > 0) {
+ sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL);
- /* Iterate through the unallocated fragements. */
+ /* iterate through the unallocated fragments */
for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
sum_frag -= *p;
max_frag = max(max_frag, -1 * (*p));
@@ -99,8 +120,10 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("nr_alloc", chunk->nr_alloc);
P("max_alloc_size", chunk->max_alloc_size);
- P("free_size", chunk->free_size);
- P("contig_hint", chunk->contig_hint);
+ P("empty_pop_pages", chunk->nr_empty_pop_pages);
+ P("first_bit", chunk->first_bit);
+ P("free_bytes", chunk->free_bytes);
+ P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE);
P("sum_frag", sum_frag);
P("max_frag", max_frag);
P("cur_min_alloc", cur_min_alloc);
@@ -112,29 +135,30 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
static int percpu_stats_show(struct seq_file *m, void *v)
{
struct pcpu_chunk *chunk;
- int slot, max_map_used;
- void *buffer;
+ int slot, max_nr_alloc;
+ int *buffer;
alloc_buffer:
spin_lock_irq(&pcpu_lock);
- max_map_used = find_max_map_used();
+ max_nr_alloc = find_max_nr_alloc();
spin_unlock_irq(&pcpu_lock);
- buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0]));
+ /* there can be at most this many free and allocated fragments */
+ buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int));
if (!buffer)
return -ENOMEM;
spin_lock_irq(&pcpu_lock);
/* if the buffer allocated earlier is too small */
- if (max_map_used < find_max_map_used()) {
+ if (max_nr_alloc < find_max_nr_alloc()) {
spin_unlock_irq(&pcpu_lock);
vfree(buffer);
goto alloc_buffer;
}
#define PL(X) \
- seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X)
+ seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
seq_printf(m,
"Percpu Memory Statistics\n"
@@ -151,7 +175,7 @@ alloc_buffer:
#undef PL
#define PU(X) \
- seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X)
+ seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X)
seq_printf(m,
"Global Stats:\n"
@@ -164,6 +188,7 @@ alloc_buffer:
PU(nr_max_chunks);
PU(min_alloc_size);
PU(max_alloc_size);
+ P("empty_pop_pages", pcpu_nr_empty_pop_pages);
seq_putc(m, '\n');
#undef PU
diff --git a/mm/percpu.c b/mm/percpu.c
index bd4130a69bbc..59d44d61f5f1 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -4,44 +4,53 @@
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
*
- * This file is released under the GPLv2.
+ * Copyright (C) 2017 Facebook Inc.
+ * Copyright (C) 2017 Dennis Zhou <dennisszhou@gmail.com>
*
- * This is percpu allocator which can handle both static and dynamic
- * areas. Percpu areas are allocated in chunks. Each chunk is
- * consisted of boot-time determined number of units and the first
- * chunk is used for static percpu variables in the kernel image
- * (special boot time alloc/init handling necessary as these areas
- * need to be brought up before allocation services are running).
- * Unit grows as necessary and all units grow or shrink in unison.
- * When a chunk is filled up, another chunk is allocated.
+ * This file is released under the GPLv2 license.
+ *
+ * The percpu allocator handles both static and dynamic areas. Percpu
+ * areas are allocated in chunks which are divided into units. There is
+ * a 1-to-1 mapping for units to possible cpus. These units are grouped
+ * based on NUMA properties of the machine.
*
* c0 c1 c2
* ------------------- ------------------- ------------
* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
* ------------------- ...... ------------------- .... ------------
*
- * Allocation is done in offset-size areas of single unit space. Ie,
- * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
- * cpus. On NUMA, the mapping can be non-linear and even sparse.
- * Percpu access can be done by configuring percpu base registers
- * according to cpu to unit mapping and pcpu_unit_size.
- *
- * There are usually many small percpu allocations many of them being
- * as small as 4 bytes. The allocator organizes chunks into lists
- * according to free size and tries to allocate from the fullest one.
- * Each chunk keeps the maximum contiguous area size hint which is
- * guaranteed to be equal to or larger than the maximum contiguous
- * area in the chunk. This helps the allocator not to iterate the
- * chunk maps unnecessarily.
- *
- * Allocation state in each chunk is kept using an array of integers
- * on chunk->map. A positive value in the map represents a free
- * region and negative allocated. Allocation inside a chunk is done
- * by scanning this map sequentially and serving the first matching
- * entry. This is mostly copied from the percpu_modalloc() allocator.
- * Chunks can be determined from the address using the index field
- * in the page struct. The index field contains a pointer to the chunk.
+ * Allocation is done by offsets into a unit's address space. Ie., an
+ * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
+ * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear
+ * and even sparse. Access is handled by configuring percpu base
+ * registers according to the cpu to unit mappings and offsetting the
+ * base address using pcpu_unit_size.
+ *
+ * There is special consideration for the first chunk which must handle
+ * the static percpu variables in the kernel image as allocation services
+ * are not online yet. In short, the first chunk is structured like so:
+ *
+ * <Static | [Reserved] | Dynamic>
+ *
+ * The static data is copied from the original section managed by the
+ * linker. The reserved section, if non-zero, primarily manages static
+ * percpu variables from kernel modules. Finally, the dynamic section
+ * takes care of normal allocations.
+ *
+ * The allocator organizes chunks into lists according to free size and
+ * tries to allocate from the fullest chunk first. Each chunk is managed
+ * by a bitmap with metadata blocks. The allocation map is updated on
+ * every allocation and free to reflect the current state while the boundary
+ * map is only updated on allocation. Each metadata block contains
+ * information to help mitigate the need to iterate over large portions
+ * of the bitmap. The reverse mapping from page to chunk is stored in
+ * the page's index. Lastly, units are lazily backed and grow in unison.
+ *
+ * There is a unique conversion that goes on here between bytes and bits.
+ * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk
+ * tracks the number of pages it is responsible for in nr_pages. Helper
+ * functions are used to convert from between the bytes, bits, and blocks.
+ * All hints are managed in bits unless explicitly stated.
*
* To use this allocator, arch code should do the following:
*
@@ -58,6 +67,7 @@
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
+#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
@@ -81,10 +91,9 @@
#include "percpu-internal.h"
-#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
-#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
-#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
-#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
+/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
+#define PCPU_SLOT_BASE_SHIFT 5
+
#define PCPU_EMPTY_POP_PAGES_LOW 2
#define PCPU_EMPTY_POP_PAGES_HIGH 4
@@ -140,13 +149,10 @@ struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
* Optional reserved chunk. This chunk reserves part of the first
- * chunk and serves it for reserved allocations. The amount of
- * reserved offset is in pcpu_reserved_chunk_limit. When reserved
- * area doesn't exist, the following variables contain NULL and 0
- * respectively.
+ * chunk and serves it for reserved allocations. When the reserved
+ * region doesn't exist, the following variable is NULL.
*/
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
-static int pcpu_reserved_chunk_limit __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
@@ -160,7 +166,7 @@ static LIST_HEAD(pcpu_map_extend_chunks);
* The number of empty populated pages, protected by pcpu_lock. The
* reserved chunk doesn't contribute to the count.
*/
-static int pcpu_nr_empty_pop_pages;
+int pcpu_nr_empty_pop_pages;
/*
* Balance work is used to populate or destroy chunks asynchronously. We
@@ -179,19 +185,26 @@ static void pcpu_schedule_balance_work(void)
schedule_work(&pcpu_balance_work);
}
-static bool pcpu_addr_in_first_chunk(void *addr)
+/**
+ * pcpu_addr_in_chunk - check if the address is served from this chunk
+ * @chunk: chunk of interest
+ * @addr: percpu address
+ *
+ * RETURNS:
+ * True if the address is served from this chunk.
+ */
+static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
- void *first_start = pcpu_first_chunk->base_addr;
+ void *start_addr, *end_addr;
- return addr >= first_start && addr < first_start + pcpu_unit_size;
-}
+ if (!chunk)
+ return false;
-static bool pcpu_addr_in_reserved_chunk(void *addr)
-{
- void *first_start = pcpu_first_chunk->base_addr;
+ start_addr = chunk->base_addr + chunk->start_offset;
+ end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
+ chunk->end_offset;
- return addr >= first_start &&
- addr < first_start + pcpu_reserved_chunk_limit;
+ return addr >= start_addr && addr < end_addr;
}
static int __pcpu_size_to_slot(int size)
@@ -209,10 +222,10 @@ static int pcpu_size_to_slot(int size)
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
- if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+ if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
return 0;
- return pcpu_size_to_slot(chunk->free_size);
+ return pcpu_size_to_slot(chunk->free_bytes);
}
/* set the pointer to a chunk in a page struct */
@@ -232,42 +245,200 @@ static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
+static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
+{
+ return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
+}
+
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
- return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
- (page_idx << PAGE_SHIFT);
+ return (unsigned long)chunk->base_addr +
+ pcpu_unit_page_offset(cpu, page_idx);
}
-static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
- int *rs, int *re, int end)
+static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
{
- *rs = find_next_zero_bit(chunk->populated, end, *rs);
- *re = find_next_bit(chunk->populated, end, *rs + 1);
+ *rs = find_next_zero_bit(bitmap, end, *rs);
+ *re = find_next_bit(bitmap, end, *rs + 1);
}
-static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
- int *rs, int *re, int end)
+static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
{
- *rs = find_next_bit(chunk->populated, end, *rs);
- *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+ *rs = find_next_bit(bitmap, end, *rs);
+ *re = find_next_zero_bit(bitmap, end, *rs + 1);
}
/*
- * (Un)populated page region iterators. Iterate over (un)populated
- * page regions between @start and @end in @chunk. @rs and @re should
- * be integer variables and will be set to start and end page index of
- * the current region.
+ * Bitmap region iterators. Iterates over the bitmap between
+ * [@start, @end) in @chunk. @rs and @re should be integer variables
+ * and will be set to start and end index of the current free region.
+ */
+#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
+
+/*
+ * The following are helper functions to help access bitmaps and convert
+ * between bitmap offsets to address offsets.
+ */
+static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
+{
+ return chunk->alloc_map +
+ (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
+}
+
+static unsigned long pcpu_off_to_block_index(int off)
+{
+ return off / PCPU_BITMAP_BLOCK_BITS;
+}
+
+static unsigned long pcpu_off_to_block_off(int off)
+{
+ return off & (PCPU_BITMAP_BLOCK_BITS - 1);
+}
+
+static unsigned long pcpu_block_off_to_off(int index, int off)
+{
+ return index * PCPU_BITMAP_BLOCK_BITS + off;
+}
+
+/**
+ * pcpu_next_md_free_region - finds the next hint free area
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Helper function for pcpu_for_each_md_free_region. It checks
+ * block->contig_hint and performs aggregation across blocks to find the
+ * next hint. It modifies bit_off and bits in-place to be consumed in the
+ * loop.
+ */
+static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
+ int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ return;
+ }
+
+ /*
+ * This checks three things. First is there a contig_hint to
+ * check. Second, have we checked this hint before by
+ * comparing the block_off. Third, is this the same as the
+ * right contig hint. In the last case, it spills over into
+ * the next block and should be handled by the contig area
+ * across blocks code.
+ */
+ *bits = block->contig_hint;
+ if (*bits && block->contig_hint_start >= block_off &&
+ *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
+ *bit_off = pcpu_block_off_to_off(i,
+ block->contig_hint_start);
+ return;
+ }
+
+ *bits = block->right_free;
+ *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
+ }
+}
+
+/**
+ * pcpu_next_fit_region - finds fit areas for a given allocation request
+ * @chunk: chunk of interest
+ * @alloc_bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finds the next free region that is viable for use with a given size and
+ * alignment. This only returns if there is a valid area to be used for this
+ * allocation. block->first_free is returned if the allocation request fits
+ * within the block to see if the request can be fulfilled prior to the contig
+ * hint.
*/
-#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
+ int align, int *bit_off, int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (*bits >= alloc_bits)
+ return;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ }
+
+ /* check block->contig_hint */
+ *bits = ALIGN(block->contig_hint_start, align) -
+ block->contig_hint_start;
+ /*
+ * This uses the block offset to determine if this has been
+ * checked in the prior iteration.
+ */
+ if (block->contig_hint &&
+ block->contig_hint_start >= block_off &&
+ block->contig_hint >= *bits + alloc_bits) {
+ *bits += alloc_bits + block->contig_hint_start -
+ block->first_free;
+ *bit_off = pcpu_block_off_to_off(i, block->first_free);
+ return;
+ }
+
+ *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
+ align);
+ *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
+ *bit_off = pcpu_block_off_to_off(i, *bit_off);
+ if (*bits >= alloc_bits)
+ return;
+ }
-#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+ /* no valid offsets were found - fail condition */
+ *bit_off = pcpu_chunk_map_bits(chunk);
+}
+
+/*
+ * Metadata free area iterators. These perform aggregation of free areas
+ * based on the metadata blocks and return the offset @bit_off and size in
+ * bits of the free area @bits. pcpu_for_each_fit_region only returns when
+ * a fit is found for the allocation request.
+ */
+#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
+ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits) + 1, \
+ pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
+
+#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
+ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits), \
+ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)))
/**
* pcpu_mem_zalloc - allocate memory
@@ -306,38 +477,6 @@ static void pcpu_mem_free(void *ptr)
}
/**
- * pcpu_count_occupied_pages - count the number of pages an area occupies
- * @chunk: chunk of interest
- * @i: index of the area in question
- *
- * Count the number of pages chunk's @i'th area occupies. When the area's
- * start and/or end address isn't aligned to page boundary, the straddled
- * page is included in the count iff the rest of the page is free.
- */
-static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
-{
- int off = chunk->map[i] & ~1;
- int end = chunk->map[i + 1] & ~1;
-
- if (!PAGE_ALIGNED(off) && i > 0) {
- int prev = chunk->map[i - 1];
-
- if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
- off = round_down(off, PAGE_SIZE);
- }
-
- if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
- int next = chunk->map[i + 1];
- int nend = chunk->map[i + 2] & ~1;
-
- if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
- end = round_up(end, PAGE_SIZE);
- }
-
- return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
-}
-
-/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
* @oslot: the previous slot it was on
@@ -363,383 +502,706 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
}
/**
- * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * pcpu_cnt_pop_pages- counts populated backing pages in range
* @chunk: chunk of interest
- * @is_atomic: the allocation context
+ * @bit_off: start offset
+ * @bits: size of area to check
*
- * Determine whether area map of @chunk needs to be extended. If
- * @is_atomic, only the amount necessary for a new allocation is
- * considered; however, async extension is scheduled if the left amount is
- * low. If !@is_atomic, it aims for more empty space. Combined, this
- * ensures that the map is likely to have enough available space to
- * accomodate atomic allocations which can't extend maps directly.
- *
- * CONTEXT:
- * pcpu_lock.
+ * Calculates the number of populated pages in the region
+ * [page_start, page_end). This keeps track of how many empty populated
+ * pages are available and decide if async work should be scheduled.
*
* RETURNS:
- * New target map allocation length if extension is necessary, 0
- * otherwise.
+ * The nr of populated pages.
*/
-static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
+static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
{
- int margin, new_alloc;
-
- lockdep_assert_held(&pcpu_lock);
-
- if (is_atomic) {
- margin = 3;
+ int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
+ int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
- if (chunk->map_alloc <
- chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
- if (list_empty(&chunk->map_extend_list)) {
- list_add_tail(&chunk->map_extend_list,
- &pcpu_map_extend_chunks);
- pcpu_schedule_balance_work();
- }
- }
- } else {
- margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
- }
-
- if (chunk->map_alloc >= chunk->map_used + margin)
+ if (page_start >= page_end)
return 0;
- new_alloc = PCPU_DFL_MAP_ALLOC;
- while (new_alloc < chunk->map_used + margin)
- new_alloc *= 2;
-
- return new_alloc;
+ /*
+ * bitmap_weight counts the number of bits set in a bitmap up to
+ * the specified number of bits. This is counting the populated
+ * pages up to page_end and then subtracting the populated pages
+ * up to page_start to count the populated pages in
+ * [page_start, page_end).
+ */
+ return bitmap_weight(chunk->populated, page_end) -
+ bitmap_weight(chunk->populated, page_start);
}
/**
- * pcpu_extend_area_map - extend area map of a chunk
+ * pcpu_chunk_update - updates the chunk metadata given a free area
* @chunk: chunk of interest
- * @new_alloc: new target allocation length of the area map
+ * @bit_off: chunk offset
+ * @bits: size of free area
*
- * Extend area map of @chunk to have @new_alloc entries.
+ * This updates the chunk's contig hint and starting offset given a free area.
+ * Choose the best starting offset if the contig hint is equal.
+ */
+static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
+{
+ if (bits > chunk->contig_bits) {
+ chunk->contig_bits_start = bit_off;
+ chunk->contig_bits = bits;
+ } else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
+ (!bit_off ||
+ __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
+ /* use the start with the best alignment */
+ chunk->contig_bits_start = bit_off;
+ }
+}
+
+/**
+ * pcpu_chunk_refresh_hint - updates metadata about a chunk
+ * @chunk: chunk of interest
*
- * CONTEXT:
- * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
+ * Iterates over the metadata blocks to find the largest contig area.
+ * It also counts the populated pages and uses the delta to update the
+ * global count.
*
- * RETURNS:
- * 0 on success, -errno on failure.
+ * Updates:
+ * chunk->contig_bits
+ * chunk->contig_bits_start
+ * nr_empty_pop_pages (chunk and global)
*/
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
{
- int *old = NULL, *new = NULL;
- size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
- unsigned long flags;
+ int bit_off, bits, nr_empty_pop_pages;
- lockdep_assert_held(&pcpu_alloc_mutex);
+ /* clear metadata */
+ chunk->contig_bits = 0;
- new = pcpu_mem_zalloc(new_size);
- if (!new)
- return -ENOMEM;
+ bit_off = chunk->first_bit;
+ bits = nr_empty_pop_pages = 0;
+ pcpu_for_each_md_free_region(chunk, bit_off, bits) {
+ pcpu_chunk_update(chunk, bit_off, bits);
- /* acquire pcpu_lock and switch to new area map */
- spin_lock_irqsave(&pcpu_lock, flags);
+ nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
+ }
- if (new_alloc <= chunk->map_alloc)
- goto out_unlock;
+ /*
+ * Keep track of nr_empty_pop_pages.
+ *
+ * The chunk maintains the previous number of free pages it held,
+ * so the delta is used to update the global counter. The reserved
+ * chunk is not part of the free page count as they are populated
+ * at init and are special to serving reserved allocations.
+ */
+ if (chunk != pcpu_reserved_chunk)
+ pcpu_nr_empty_pop_pages +=
+ (nr_empty_pop_pa