summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-12-08 21:10:03 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2014-12-08 21:10:03 -0800
commit140dfc9299c33bbfc9350fa061f5ab65cb83df13 (patch)
tree09508691964e277f4835d30f7b9c3962e8cac596 /drivers/md
parentf94784bdb114439eb3a5e62343826887bbf3f37c (diff)
parent1a71d6ffe18c0d0f03fc8531949cc8ed41d702ee (diff)
Merge tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - Significant DM thin-provisioning performance improvements to meet performance requirements that were requested by the Gluster distributed filesystem. Specifically, dm-thinp now takes care to aggregate IO that will be issued to the same thinp block before issuing IO to the underlying devices. This really helps improve performance on HW RAID6 devices that have a writeback cache because it avoids RMW in the HW RAID controller. - Some stable fixes: fix leak in DM bufio if integrity profiles were enabled, use memzero_explicit in DM crypt to avoid any potential for information leak, and a DM cache fix to properly mark a cache block dirty if it was promoted to the cache via the overwrite optimization. - A few simple DM persistent data library fixes - DM cache multiqueue policy block promotion improvements. - DM cache discard improvements that take advantage of range (multiblock) discard support in the DM bio-prison. This allows for much more efficient bulk discard processing (e.g. when mkfs.xfs discards the entire device). - Some small optimizations in DM core and RCU deference cleanups - DM core changes to suspend/resume code to introduce the new internal suspend/resume interface that the DM thin-pool target now uses to suspend/resume active thin devices when the thin-pool must suspend/resume. This avoids forcing userspace to track all active thin volumes in a thin-pool when the thin-pool is suspended for the purposes of metadata or data space resize. * tag 'dm-3.19-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (49 commits) dm crypt: use memzero_explicit for on-stack buffer dm space map metadata: fix sm_bootstrap_get_count() dm space map metadata: fix sm_bootstrap_get_nr_blocks() dm bufio: fix memleak when using a dm_buffer's inline bio dm cache: fix spurious cell_defer when dealing with partial block at end of device dm cache: dirty flag was mistakenly being cleared when promoting via overwrite dm cache: only use overwrite optimisation for promotion when in writeback mode dm cache: discard block size must be a multiple of cache block size dm cache: fix a harmless race when working out if a block is discarded dm cache: when reloading a discard bitset allow for a different discard block size dm cache: fix some issues with the new discard range support dm array: if resizing the array is a noop set the new root to the old one dm: use rcu_dereference_protected instead of rcu_dereference dm thin: fix pool_io_hints to avoid looking at max_hw_sectors dm thin: suspend/resume active thin devices when reloading thin-pool dm: enhance internal suspend and resume interface dm thin: do not allow thin device activation while pool is suspended dm: add presuspend_undo hook to target_type dm: return earlier from dm_blk_ioctl if target doesn't implement .ioctl dm thin: remove stale 'trim' message in block comment above pool_message ...
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-bio-prison.c186
-rw-r--r--drivers/md/dm-bio-prison.h28
-rw-r--r--drivers/md/dm-bufio.c226
-rw-r--r--drivers/md/dm-cache-block-types.h11
-rw-r--r--drivers/md/dm-cache-metadata.c34
-rw-r--r--drivers/md/dm-cache-metadata.h6
-rw-r--r--drivers/md/dm-cache-policy-mq.c82
-rw-r--r--drivers/md/dm-cache-target.c378
-rw-r--r--drivers/md/dm-crypt.c2
-rw-r--r--drivers/md/dm-ioctl.c5
-rw-r--r--drivers/md/dm-stats.c2
-rw-r--r--drivers/md/dm-table.c36
-rw-r--r--drivers/md/dm-thin-metadata.c35
-rw-r--r--drivers/md/dm-thin-metadata.h9
-rw-r--r--drivers/md/dm-thin.c760
-rw-r--r--drivers/md/dm.c273
-rw-r--r--drivers/md/dm.h10
-rw-r--r--drivers/md/persistent-data/dm-array.c4
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c8
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c77
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h7
21 files changed, 1610 insertions, 569 deletions
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index f752d12081ff..be065300e93c 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,68 +14,38 @@
/*----------------------------------------------------------------*/
-struct bucket {
- spinlock_t lock;
- struct hlist_head cells;
-};
+#define MIN_CELLS 1024
struct dm_bio_prison {
+ spinlock_t lock;
mempool_t *cell_pool;
-
- unsigned nr_buckets;
- unsigned hash_mask;
- struct bucket *buckets;
+ struct rb_root cells;
};
-/*----------------------------------------------------------------*/
-
-static uint32_t calc_nr_buckets(unsigned nr_cells)
-{
- uint32_t n = 128;
-
- nr_cells /= 4;
- nr_cells = min(nr_cells, 8192u);
-
- while (n < nr_cells)
- n <<= 1;
-
- return n;
-}
-
static struct kmem_cache *_cell_cache;
-static void init_bucket(struct bucket *b)
-{
- spin_lock_init(&b->lock);
- INIT_HLIST_HEAD(&b->cells);
-}
+/*----------------------------------------------------------------*/
/*
* @nr_cells should be the number of cells you want in use _concurrently_.
* Don't confuse it with the number of distinct keys.
*/
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
+struct dm_bio_prison *dm_bio_prison_create(void)
{
- unsigned i;
- uint32_t nr_buckets = calc_nr_buckets(nr_cells);
- size_t len = sizeof(struct dm_bio_prison) +
- (sizeof(struct bucket) * nr_buckets);
- struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
+ struct dm_bio_prison *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
if (!prison)
return NULL;
- prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
+ spin_lock_init(&prison->lock);
+
+ prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
if (!prison->cell_pool) {
kfree(prison);
return NULL;
}
- prison->nr_buckets = nr_buckets;
- prison->hash_mask = nr_buckets - 1;
- prison->buckets = (struct bucket *) (prison + 1);
- for (i = 0; i < nr_buckets; i++)
- init_bucket(prison->buckets + i);
+ prison->cells = RB_ROOT;
return prison;
}
@@ -101,68 +71,73 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell);
-static uint32_t hash_key(struct dm_bio_prison *prison, struct dm_cell_key *key)
+static void __setup_new_cell(struct dm_cell_key *key,
+ struct bio *holder,
+ struct dm_bio_prison_cell *cell)
{
- const unsigned long BIG_PRIME = 4294967291UL;
- uint64_t hash = key->block * BIG_PRIME;
-
- return (uint32_t) (hash & prison->hash_mask);
+ memcpy(&cell->key, key, sizeof(cell->key));
+ cell->holder = holder;
+ bio_list_init(&cell->bios);
}
-static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
+static int cmp_keys(struct dm_cell_key *lhs,
+ struct dm_cell_key *rhs)
{
- return (lhs->virtual == rhs->virtual) &&
- (lhs->dev == rhs->dev) &&
- (lhs->block == rhs->block);
-}
+ if (lhs->virtual < rhs->virtual)
+ return -1;
-static struct bucket *get_bucket(struct dm_bio_prison *prison,
- struct dm_cell_key *key)
-{
- return prison->buckets + hash_key(prison, key);
-}
+ if (lhs->virtual > rhs->virtual)
+ return 1;
-static struct dm_bio_prison_cell *__search_bucket(struct bucket *b,
- struct dm_cell_key *key)
-{
- struct dm_bio_prison_cell *cell;
+ if (lhs->dev < rhs->dev)
+ return -1;
- hlist_for_each_entry(cell, &b->cells, list)
- if (keys_equal(&cell->key, key))
- return cell;
+ if (lhs->dev > rhs->dev)
+ return 1;
- return NULL;
-}
+ if (lhs->block_end <= rhs->block_begin)
+ return -1;
-static void __setup_new_cell(struct bucket *b,
- struct dm_cell_key *key,
- struct bio *holder,
- struct dm_bio_prison_cell *cell)
-{
- memcpy(&cell->key, key, sizeof(cell->key));
- cell->holder = holder;
- bio_list_init(&cell->bios);
- hlist_add_head(&cell->list, &b->cells);
+ if (lhs->block_begin >= rhs->block_end)
+ return 1;
+
+ return 0;
}
-static int __bio_detain(struct bucket *b,
+static int __bio_detain(struct dm_bio_prison *prison,
struct dm_cell_key *key,
struct bio *inmate,
struct dm_bio_prison_cell *cell_prealloc,
struct dm_bio_prison_cell **cell_result)
{
- struct dm_bio_prison_cell *cell;
-
- cell = __search_bucket(b, key);
- if (cell) {
- if (inmate)
- bio_list_add(&cell->bios, inmate);
- *cell_result = cell;
- return 1;
+ int r;
+ struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+
+ while (*new) {
+ struct dm_bio_prison_cell *cell =
+ container_of(*new, struct dm_bio_prison_cell, node);
+
+ r = cmp_keys(key, &cell->key);
+
+ parent = *new;
+ if (r < 0)
+ new = &((*new)->rb_left);
+ else if (r > 0)
+ new = &((*new)->rb_right);
+ else {
+ if (inmate)
+ bio_list_add(&cell->bios, inmate);
+ *cell_result = cell;
+ return 1;
+ }
}
- __setup_new_cell(b, key, inmate, cell_prealloc);
+ __setup_new_cell(key, inmate, cell_prealloc);
*cell_result = cell_prealloc;
+
+ rb_link_node(&cell_prealloc->node, parent, new);
+ rb_insert_color(&cell_prealloc->node, &prison->cells);
+
return 0;
}
@@ -174,11 +149,10 @@ static int bio_detain(struct dm_bio_prison *prison,
{
int r;
unsigned long flags;
- struct bucket *b = get_bucket(prison, key);
- spin_lock_irqsave(&b->lock, flags);
- r = __bio_detain(b, key, inmate, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
+ spin_unlock_irqrestore(&prison->lock, flags);
return r;
}
@@ -205,10 +179,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
/*
* @inmates must have been initialised prior to this call
*/
-static void __cell_release(struct dm_bio_prison_cell *cell,
+static void __cell_release(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
struct bio_list *inmates)
{
- hlist_del(&cell->list);
+ rb_erase(&cell->node, &prison->cells);
if (inmates) {
if (cell->holder)
@@ -222,21 +197,21 @@ void dm_cell_release(struct dm_bio_prison *prison,
struct bio_list *bios)
{
unsigned long flags;
- struct bucket *b = get_bucket(prison, &cell->key);
- spin_lock_irqsave(&b->lock, flags);
- __cell_release(cell, bios);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release(prison, cell, bios);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
EXPORT_SYMBOL_GPL(dm_cell_release);
/*
* Sometimes we don't want the holder, just the additional bios.
*/
-static void __cell_release_no_holder(struct dm_bio_prison_cell *cell,
+static void __cell_release_no_holder(struct dm_bio_prison *prison,
+ struct dm_bio_prison_cell *cell,
struct bio_list *inmates)
{
- hlist_del(&cell->list);
+ rb_erase(&cell->node, &prison->cells);
bio_list_merge(inmates, &cell->bios);
}
@@ -245,11 +220,10 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
struct bio_list *inmates)
{
unsigned long flags;
- struct bucket *b = get_bucket(prison, &cell->key);
- spin_lock_irqsave(&b->lock, flags);
- __cell_release_no_holder(cell, inmates);
- spin_unlock_irqrestore(&b->lock, flags);
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release_no_holder(prison, cell, inmates);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
@@ -267,6 +241,20 @@ void dm_cell_error(struct dm_bio_prison *prison,
}
EXPORT_SYMBOL_GPL(dm_cell_error);
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+ void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+ void *context,
+ struct dm_bio_prison_cell *cell)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ visit_fn(context, cell);
+ rb_erase(&cell->node, &prison->cells);
+ spin_unlock_irqrestore(&prison->lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_cell_visit_release);
+
/*----------------------------------------------------------------*/
#define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 6805a142b750..74cf01144b1f 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -10,8 +10,8 @@
#include "persistent-data/dm-block-manager.h" /* FIXME: for dm_block_t */
#include "dm-thin-metadata.h" /* FIXME: for dm_thin_id */
-#include <linux/list.h>
#include <linux/bio.h>
+#include <linux/rbtree.h>
/*----------------------------------------------------------------*/
@@ -23,11 +23,14 @@
*/
struct dm_bio_prison;
-/* FIXME: this needs to be more abstract */
+/*
+ * Keys define a range of blocks within either a virtual or physical
+ * device.
+ */
struct dm_cell_key {
int virtual;
dm_thin_id dev;
- dm_block_t block;
+ dm_block_t block_begin, block_end;
};
/*
@@ -35,13 +38,15 @@ struct dm_cell_key {
* themselves.
*/
struct dm_bio_prison_cell {
- struct hlist_node list;
+ struct list_head user_list; /* for client use */
+ struct rb_node node;
+
struct dm_cell_key key;
struct bio *holder;
struct bio_list bios;
};
-struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells);
+struct dm_bio_prison *dm_bio_prison_create(void);
void dm_bio_prison_destroy(struct dm_bio_prison *prison);
/*
@@ -57,7 +62,7 @@ void dm_bio_prison_free_cell(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell);
/*
- * Creates, or retrieves a cell for the given key.
+ * Creates, or retrieves a cell that overlaps the given key.
*
* Returns 1 if pre-existing cell returned, zero if new cell created using
* @cell_prealloc.
@@ -68,7 +73,8 @@ int dm_get_cell(struct dm_bio_prison *prison,
struct dm_bio_prison_cell **cell_result);
/*
- * An atomic op that combines retrieving a cell, and adding a bio to it.
+ * An atomic op that combines retrieving or creating a cell, and adding a
+ * bio to it.
*
* Returns 1 if the cell was already held, 0 if @inmate is the new holder.
*/
@@ -87,6 +93,14 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
void dm_cell_error(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell, int error);
+/*
+ * Visits the cell and then releases. Guarantees no new inmates are
+ * inserted between the visit and release.
+ */
+void dm_cell_visit_release(struct dm_bio_prison *prison,
+ void (*visit_fn)(void *, struct dm_bio_prison_cell *),
+ void *context, struct dm_bio_prison_cell *cell);
+
/*----------------------------------------------------------------*/
/*
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index afe79719ea32..c33b49792b87 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -14,6 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
+#include <linux/rbtree.h>
#define DM_MSG_PREFIX "bufio"
@@ -34,26 +35,23 @@
/*
* Check buffer ages in this interval (seconds)
*/
-#define DM_BUFIO_WORK_TIMER_SECS 10
+#define DM_BUFIO_WORK_TIMER_SECS 30
/*
* Free buffers when they are older than this (seconds)
*/
-#define DM_BUFIO_DEFAULT_AGE_SECS 60
+#define DM_BUFIO_DEFAULT_AGE_SECS 300
/*
- * The number of bvec entries that are embedded directly in the buffer.
- * If the chunk size is larger, dm-io is used to do the io.
+ * The nr of bytes of cached data to keep around.
*/
-#define DM_BUFIO_INLINE_VECS 16
+#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
/*
- * Buffer hash
+ * The number of bvec entries that are embedded directly in the buffer.
+ * If the chunk size is larger, dm-io is used to do the io.
*/
-#define DM_BUFIO_HASH_BITS 20
-#define DM_BUFIO_HASH(block) \
- ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
- ((1 << DM_BUFIO_HASH_BITS) - 1))
+#define DM_BUFIO_INLINE_VECS 16
/*
* Don't try to use kmem_cache_alloc for blocks larger than this.
@@ -106,7 +104,7 @@ struct dm_bufio_client {
unsigned minimum_buffers;
- struct hlist_head *cache_hash;
+ struct rb_root buffer_tree;
wait_queue_head_t free_buffer_wait;
int async_write_error;
@@ -135,7 +133,7 @@ enum data_mode {
};
struct dm_buffer {
- struct hlist_node hash_list;
+ struct rb_node node;
struct list_head lru_list;
sector_t block;
void *data;
@@ -223,6 +221,7 @@ static DEFINE_SPINLOCK(param_spinlock);
* Buffers are freed after this timeout
*/
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
@@ -253,6 +252,53 @@ static LIST_HEAD(dm_bufio_all_clients);
*/
static DEFINE_MUTEX(dm_bufio_clients_lock);
+/*----------------------------------------------------------------
+ * A red/black tree acts as an index for all the buffers.
+ *--------------------------------------------------------------*/
+static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
+{
+ struct rb_node *n = c->buffer_tree.rb_node;
+ struct dm_buffer *b;
+
+ while (n) {
+ b = container_of(n, struct dm_buffer, node);
+
+ if (b->block == block)
+ return b;
+
+ n = (b->block < block) ? n->rb_left : n->rb_right;
+ }
+
+ return NULL;
+}
+
+static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+ struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
+ struct dm_buffer *found;
+
+ while (*new) {
+ found = container_of(*new, struct dm_buffer, node);
+
+ if (found->block == b->block) {
+ BUG_ON(found != b);
+ return;
+ }
+
+ parent = *new;
+ new = (found->block < b->block) ?
+ &((*new)->rb_left) : &((*new)->rb_right);
+ }
+
+ rb_link_node(&b->node, parent, new);
+ rb_insert_color(&b->node, &c->buffer_tree);
+}
+
+static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+ rb_erase(&b->node, &c->buffer_tree);
+}
+
/*----------------------------------------------------------------*/
static void adjust_total_allocated(enum data_mode data_mode, long diff)
@@ -434,7 +480,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
b->block = block;
b->list_mode = dirty;
list_add(&b->lru_list, &c->lru[dirty]);
- hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+ __insert(b->c, b);
b->last_accessed = jiffies;
}
@@ -448,7 +494,7 @@ static void __unlink_buffer(struct dm_buffer *b)
BUG_ON(!c->n_buffers[b->list_mode]);
c->n_buffers[b->list_mode]--;
- hlist_del(&b->hash_list);
+ __remove(b->c, b);
list_del(&b->lru_list);
}
@@ -532,6 +578,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
end_io(&b->bio, r);
}
+static void inline_endio(struct bio *bio, int error)
+{
+ bio_end_io_t *end_fn = bio->bi_private;
+
+ /*
+ * Reset the bio to free any attached resources
+ * (e.g. bio integrity profiles).
+ */
+ bio_reset(bio);
+
+ end_fn(bio, error);
+}
+
static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
bio_end_io_t *end_io)
{
@@ -543,7 +602,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
b->bio.bi_bdev = b->c->bdev;
- b->bio.bi_end_io = end_io;
+ b->bio.bi_end_io = inline_endio;
+ /*
+ * Use of .bi_private isn't a problem here because
+ * the dm_buffer's inline bio is local to bufio.
+ */
+ b->bio.bi_private = end_io;
/*
* We assume that if len >= PAGE_SIZE ptr is page-aligned.
@@ -887,23 +951,6 @@ static void __check_watermark(struct dm_bufio_client *c,
__write_dirty_buffers_async(c, 1, write_list);
}
-/*
- * Find a buffer in the hash.
- */
-static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
-{
- struct dm_buffer *b;
-
- hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
- hash_list) {
- dm_bufio_cond_resched();
- if (b->block == block)
- return b;
- }
-
- return NULL;
-}
-
/*----------------------------------------------------------------
* Getting a buffer
*--------------------------------------------------------------*/
@@ -1433,45 +1480,52 @@ static void drop_buffers(struct dm_bufio_client *c)
}
/*
- * Test if the buffer is unused and too old, and commit it.
+ * We may not be able to evict this buffer if IO pending or the client
+ * is still using it. Caller is expected to know buffer is too old.
+ *
* And if GFP_NOFS is used, we must not do any I/O because we hold
* dm_bufio_clients_lock and we would risk deadlock if the I/O gets
* rerouted to different bufio client.
*/
-static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
- unsigned long max_jiffies)
+static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
{
- if (jiffies - b->last_accessed < max_jiffies)
- return 0;
-
if (!(gfp & __GFP_FS)) {
if (test_bit(B_READING, &b->state) ||
test_bit(B_WRITING, &b->state) ||
test_bit(B_DIRTY, &b->state))
- return 0;
+ return false;
}
if (b->hold_count)
- return 0;
+ return false;
__make_buffer_clean(b);
__unlink_buffer(b);
__free_buffer_wake(b);
- return 1;
+ return true;
}
-static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
- gfp_t gfp_mask)
+static unsigned get_retain_buffers(struct dm_bufio_client *c)
+{
+ unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+ return retain_bytes / c->block_size;
+}
+
+static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+ gfp_t gfp_mask)
{
int l;
struct dm_buffer *b, *tmp;
- long freed = 0;
+ unsigned long freed = 0;
+ unsigned long count = nr_to_scan;
+ unsigned retain_target = get_retain_buffers(c);
for (l = 0; l < LIST_SIZE; l++) {
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
- freed += __cleanup_old_buffer(b, gfp_mask, 0);
- if (!--nr_to_scan)
+ if (__try_evict_buffer(b, gfp_mask))
+ freed++;
+ if (!--nr_to_scan || ((count - freed) <= retain_target))
return freed;
dm_bufio_cond_resched();
}
@@ -1533,11 +1587,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
r = -ENOMEM;
goto bad_client;
}
- c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
- if (!c->cache_hash) {
- r = -ENOMEM;
- goto bad_hash;
- }
+ c->buffer_tree = RB_ROOT;
c->bdev = bdev;
c->block_size = block_size;
@@ -1556,9 +1606,6 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
c->n_buffers[i] = 0;
}
- for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
- INIT_HLIST_HEAD(&c->cache_hash[i]);
-
mutex_init(&c->lock);
INIT_LIST_HEAD(&c->reserved_buffers);
c->need_reserved_buffers = reserved_buffers;
@@ -1632,8 +1679,6 @@ bad_cache:
}
dm_io_client_destroy(c->dm_io);
bad_dm_io:
- vfree(c->cache_hash);
-bad_hash:
kfree(c);
bad_client:
return ERR_PTR(r);
@@ -1660,9 +1705,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
mutex_unlock(&dm_bufio_clients_lock);
- for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
- BUG_ON(!hlist_empty(&c->cache_hash[i]));
-
+ BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
BUG_ON(c->need_reserved_buffers);
while (!list_empty(&c->reserved_buffers)) {
@@ -1680,36 +1723,60 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c)
BUG_ON(c->n_buffers[i]);
dm_io_client_destroy(c->dm_io);
- vfree(c->cache_hash);
kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
-static void cleanup_old_buffers(void)
+static unsigned get_max_age_hz(void)
{
- unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
- struct dm_bufio_client *c;
+ unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
- if (max_age > ULONG_MAX / HZ)
- max_age = ULONG_MAX / HZ;
+ if (max_age > UINT_MAX / HZ)
+ max_age = UINT_MAX / HZ;
- mutex_lock(&dm_bufio_clients_lock);
- list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
- if (!dm_bufio_trylock(c))
- continue;
+ return max_age * HZ;
+}
- while (!list_empty(&c->lru[LIST_CLEAN])) {
- struct dm_buffer *b;
- b = list_entry(c->lru[LIST_CLEAN].prev,
- struct dm_buffer, lru_list);
- if (!__cleanup_old_buffer(b, 0, max_age * HZ))
- break;
- dm_bufio_cond_resched();
- }
+static bool older_than(struct dm_buffer *b, unsigned long age_hz)
+{
+ return (jiffies - b->last_accessed) >= age_hz;
+}
+
+static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
+{
+ struct dm_buffer *b, *tmp;
+ unsigned retain_target = get_retain_buffers(c);
+ unsigned count;
+
+ dm_bufio_lock(c);
+
+ count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+ list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
+ if (count <= retain_target)
+ break;
+
+ if (!older_than(b, age_hz))
+ break;
+
+ if (__try_evict_buffer(b, 0))
+ count--;
- dm_bufio_unlock(c);
dm_bufio_cond_resched();
}
+
+ dm_bufio_unlock(c);
+}
+
+static void cleanup_old_buffers(void)
+{
+ unsigned long max_age_hz = get_max_age_hz();
+ struct dm_bufio_client *c;
+
+ mutex_lock(&dm_bufio_clients_lock);
+
+ list_for_each_entry(c, &dm_bufio_all_clients, client_list)
+ __evict_old_buffers(c, max_age_hz);
+
mutex_unlock(&dm_bufio_clients_lock);
}
@@ -1834,6 +1901,9 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
+module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
+
module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
diff --git a/drivers/md/dm-cache-block-types.h b/drivers/md/dm-cache-block-types.h
index aac0e2df06be..bed4ad4e1b7c 100644
--- a/drivers/md/dm-cache-block-types.h
+++ b/drivers/md/dm-cache-block-types.h
@@ -19,6 +19,7 @@
typedef dm_block_t __bitwise__ dm_oblock_t;
typedef uint32_t __bitwise__ dm_cblock_t;
+typedef dm_block_t __bitwise__ dm_dblock_t;
static inline dm_oblock_t to_oblock(dm_block_t b)
{
@@ -40,4 +41,14 @@ static inline uint32_t from_cblock(dm_cblock_t b)
return (__force uint32_t) b;
}
+static inline dm_dblock_t to_dblock(dm_block_t b)
+{
+ return (__force dm_dblock_t) b;
+}
+
+static inline dm_block_t from_dblock(dm_dblock_t b)
+{
+ return (__force dm_block_t) b;
+}
+
#endif /* DM_CACHE_BLOCK_TYPES_H */
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 06709257adde..9fc616c2755e 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -109,7 +109,7 @@ struct dm_cache_metadata {
dm_block_t discard_root;
sector_t discard_block_size;
- dm_oblock_t discard_nr_blocks;
+ dm_dblock_t discard_nr_blocks;
sector_t data_block_size;
dm_cblock_t cache_blocks;
@@ -329,7 +329,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
- disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
+ disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
disk_super->metadata_block_size = cpu_to_le32(DM_CACHE_METADATA_BLOCK_SIZE);
disk_super->data_block_size = cpu_to_le32(cmd->data_block_size);
disk_super->cache_blocks = cpu_to_le32(0);
@@ -528,7 +528,7 @@ static void read_superblock_fields(struct dm_cache_metadata *cmd,
cmd->hint_root = le64_to_cpu(disk_super->hint_root);
cmd->discard_root = le64_to_cpu(disk_super->discard_root);
cmd->discard_block_size = le64_to_cpu(disk_super->discard_block_size);
- cmd->discard_nr_blocks = to_oblock(le64_to_cpu(disk_super->discard_nr_blocks));
+ cmd->discard_nr_blocks = to_dblock(le64_to_cpu(disk_super->discard_nr_blocks));
cmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
cmd->cache_blocks = to_cblock(le32_to_cpu(disk_super->cache_blocks));
strncpy(cmd->policy_name, disk_super->policy_name, sizeof(cmd->policy_name));
@@ -626,7 +626,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
disk_super->discard_root = cpu_to_le64(cmd->discard_root);
disk_super->discard_block_size = cpu_to_le64(cmd->discard_block_size);
- disk_super->discard_nr_blocks = cpu_to_le64(from_oblock(cmd->discard_nr_blocks));
+ disk_super->discard_nr_blocks = cpu_to_le64(from_dblock(cmd->discard_nr_blocks));
disk_super->cache_blocks = cpu_to_le32(from_cblock(cmd->cache_blocks));
strncpy(disk_super->policy_name, cmd->policy_name, sizeof(disk_super->policy_name));
disk_super->policy_version[0] = cpu_to_le32(cmd->policy_version[0]);
@@ -797,15 +797,15 @@ out:
int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
sector_t discard_block_size,
- dm_oblock_t new_nr_entries)
+ dm_dblock_t new_nr_entries)
{
int r;
down_write(&cmd->root_lock);
r = dm_bitset_resize(&cmd->discard_info,
cmd->discard_root,
- from_oblock(cmd->discard_nr_blocks),
- from_oblock(new_nr_entries),
+ from_dblock(cmd->discard_nr_blocks),
+ from_dblock(new_nr_entries),
false, &cmd->discard_root);
if (!r) {
cmd->discard_block_size = discard_block_size;
@@ -818,28 +818,28 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
return r;
}
-static int __set_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
+static int __set_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
{
return dm_bitset_set_bit(&cmd->discard_info, cmd->discard_root,
- from_oblock(b), &cmd->discard_root);
+ from_dblock(b), &cmd->discard_root);
}
-static int __clear_discard(struct dm_cache_metadata *cmd, dm_oblock_t b)
+static int __clear_discard(struct dm_cache_metadata *cmd, dm_dblock_t b)
{
return dm_bitset_clear_bit(&cmd-