diff options
Diffstat (limited to 'drivers/md/bcache/btree.c')
-rw-r--r-- | drivers/md/bcache/btree.c | 2503 |
1 files changed, 2503 insertions, 0 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c new file mode 100644 index 000000000000..7a5658f04e62 --- /dev/null +++ b/drivers/md/bcache/btree.c @@ -0,0 +1,2503 @@ +/* + * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> + * + * Uses a block device as cache for other block devices; optimized for SSDs. + * All allocation is done in buckets, which should match the erase block size + * of the device. + * + * Buckets containing cached data are kept on a heap sorted by priority; + * bucket priority is increased on cache hit, and periodically all the buckets + * on the heap have their priority scaled down. This currently is just used as + * an LRU but in the future should allow for more intelligent heuristics. + * + * Buckets have an 8 bit counter; freeing is accomplished by incrementing the + * counter. Garbage collection is used to remove stale pointers. + * + * Indexing is done via a btree; nodes are not necessarily fully sorted, rather + * as keys are inserted we only sort the pages that have not yet been written. + * When garbage collection is run, we resort the entire node. + * + * All configuration is done via sysfs; see Documentation/bcache.txt. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/hash.h> +#include <linux/prefetch.h> +#include <linux/random.h> +#include <linux/rcupdate.h> +#include <trace/events/bcache.h> + +/* + * Todo: + * register_bcache: Return errors out to userspace correctly + * + * Writeback: don't undirty key until after a cache flush + * + * Create an iterator for key pointers + * + * On btree write error, mark bucket such that it won't be freed from the cache + * + * Journalling: + * Check for bad keys in replay + * Propagate barriers + * Refcount journal entries in journal_replay + * + * Garbage collection: + * Finish incremental gc + * Gc should free old UUIDs, data for invalid UUIDs + * + * Provide a way to list backing device UUIDs we have data cached for, and + * probably how long it's been since we've seen them, and a way to invalidate + * dirty data for devices that will never be attached again + * + * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so + * that based on that and how much dirty data we have we can keep writeback + * from being starved + * + * Add a tracepoint or somesuch to watch for writeback starvation + * + * When btree depth > 1 and splitting an interior node, we have to make sure + * alloc_bucket() cannot fail. This should be true but is not completely + * obvious. + * + * Make sure all allocations get charged to the root cgroup + * + * Plugging? + * + * If data write is less than hard sector size of ssd, round up offset in open + * bucket to the next whole sector + * + * Also lookup by cgroup in get_open_bucket() + * + * Superblock needs to be fleshed out for multiple cache devices + * + * Add a sysfs tunable for the number of writeback IOs in flight + * + * Add a sysfs tunable for the number of open data buckets + * + * IO tracking: Can we track when one process is doing io on behalf of another? + * IO tracking: Don't use just an average, weigh more recent stuff higher + * + * Test module load/unload + */ + +static const char * const op_types[] = { + "insert", "replace" +}; + +static const char *op_type(struct btree_op *op) +{ + return op_types[op->type]; +} + +#define MAX_NEED_GC 64 +#define MAX_SAVE_PRIO 72 + +#define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) + +#define PTR_HASH(c, k) \ + (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) + +struct workqueue_struct *bch_gc_wq; +static struct workqueue_struct *btree_io_wq; + +void bch_btree_op_init_stack(struct btree_op *op) +{ + memset(op, 0, sizeof(struct btree_op)); + closure_init_stack(&op->cl); + op->lock = -1; + bch_keylist_init(&op->keys); +} + +/* Btree key manipulation */ + +static void bkey_put(struct cache_set *c, struct bkey *k, int level) +{ + if ((level && KEY_OFFSET(k)) || !level) + __bkey_put(c, k); +} + +/* Btree IO */ + +static uint64_t btree_csum_set(struct btree *b, struct bset *i) +{ + uint64_t crc = b->key.ptr[0]; + void *data = (void *) i + 8, *end = end(i); + + crc = bch_crc64_update(crc, data, end - data); + return crc ^ 0xffffffffffffffffULL; +} + +static void btree_bio_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io.cl); + + if (error) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) + ? "writing btree" : "reading btree"); + closure_put(cl); +} + +static void btree_bio_init(struct btree *b) +{ + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_bio_endio; + b->bio->bi_private = &b->io.cl; +} + +void bch_btree_read_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io.cl); + struct bset *i = b->sets[0].data; + struct btree_iter *iter = b->c->fill_iter; + const char *err = "bad btree header"; + BUG_ON(b->nsets || b->written); + + bch_bbio_free(b->bio, b->c); + b->bio = NULL; + + mutex_lock(&b->c->fill_lock); + iter->used = 0; + + if (btree_node_io_error(b) || + !i->seq) + goto err; + + for (; + b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; + i = write_block(b)) { + err = "unsupported bset version"; + if (i->version > BCACHE_BSET_VERSION) + goto err; + + err = "bad btree header"; + if (b->written + set_blocks(i, b->c) > btree_blocks(b)) + goto err; + + err = "bad magic"; + if (i->magic != bset_magic(b->c)) + goto err; + + err = "bad checksum"; + switch (i->version) { + case 0: + if (i->csum != csum_set(i)) + goto err; + break; + case BCACHE_BSET_VERSION: + if (i->csum != btree_csum_set(b, i)) + goto err; + break; + } + + err = "empty set"; + if (i != b->sets[0].data && !i->keys) + goto err; + + bch_btree_iter_push(iter, i->start, end(i)); + + b->written += set_blocks(i, b->c); + } + + err = "corrupted btree"; + for (i = write_block(b); + index(i, b) < btree_blocks(b); + i = ((void *) i) + block_bytes(b->c)) + if (i->seq == b->sets[0].data->seq) + goto err; + + bch_btree_sort_and_fix_extents(b, iter); + + i = b->sets[0].data; + err = "short btree key"; + if (b->sets[0].size && + bkey_cmp(&b->key, &b->sets[0].end) < 0) + goto err; + + if (b->written < btree_blocks(b)) + bch_bset_init_next(b); +out: + + mutex_unlock(&b->c->fill_lock); + + spin_lock(&b->c->btree_read_time_lock); + bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); + spin_unlock(&b->c->btree_read_time_lock); + + smp_wmb(); /* read_done is our write lock */ + set_btree_node_read_done(b); + + closure_return(cl); +err: + set_btree_node_io_error(b); + bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", + err, PTR_BUCKET_NR(b->c, &b->key, 0), + index(i, b), i->keys); + goto out; +} + +void bch_btree_read(struct btree *b) +{ + BUG_ON(b->nsets || b->written); + + if (!closure_trylock(&b->io.cl, &b->c->cl)) + BUG(); + + b->io_start_time = local_clock(); + + btree_bio_init(b); + b->bio->bi_rw = REQ_META|READ_SYNC; + b->bio->bi_size = KEY_SIZE(&b->key) << 9; + + bch_bio_map(b->bio, b->sets[0].data); + + pr_debug("%s", pbtree(b)); + trace_bcache_btree_read(b->bio); + bch_submit_bbio(b->bio, b->c, &b->key, 0); + + continue_at(&b->io.cl, bch_btree_read_done, system_wq); +} + +static void btree_complete_write(struct btree *b, struct btree_write *w) +{ + if (w->prio_blocked && + !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) + wake_up(&b->c->alloc_wait); + + if (w->journal) { + atomic_dec_bug(w->journal); + __closure_wake_up(&b->c->journal.wait); + } + + if (w->owner) + closure_put(w->owner); + + w->prio_blocked = 0; + w->journal = NULL; + w->owner = NULL; +} + +static void __btree_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io.cl); + struct btree_write *w = btree_prev_write(b); + + bch_bbio_free(b->bio, b->c); + b->bio = NULL; + btree_complete_write(b, w); + + if (btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, + msecs_to_jiffies(30000)); + + closure_return(cl); +} + +static void btree_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io.cl); + struct bio_vec *bv; + int n; + + __bio_for_each_segment(bv, b->bio, n, 0) + __free_page(bv->bv_page); + + __btree_write_done(cl); +} + +static void do_btree_write(struct btree *b) +{ + struct closure *cl = &b->io.cl; + struct bset *i = b->sets[b->nsets].data; + BKEY_PADDED(key) k; + + i->version = BCACHE_BSET_VERSION; + i->csum = btree_csum_set(b, i); + + btree_bio_init(b); + b->bio->bi_rw = REQ_META|WRITE_SYNC; + b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); + bch_bio_map(b->bio, i); + + bkey_copy(&k.key, &b->key); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); + + if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { + int j; + struct bio_vec *bv; + void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + + bio_for_each_segment(bv, b->bio, j) + memcpy(page_address(bv->bv_page), + base + j * PAGE_SIZE, PAGE_SIZE); + + trace_bcache_btree_write(b->bio); + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + continue_at(cl, btree_write_done, NULL); + } else { + b->bio->bi_vcnt = 0; + bch_bio_map(b->bio, i); + + trace_bcache_btree_write(b->bio); + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + closure_sync(cl); + __btree_write_done(cl); + } +} + +static void __btree_write(struct btree *b) +{ + struct bset *i = b->sets[b->nsets].data; + + BUG_ON(current->bio_list); + + closure_lock(&b->io, &b->c->cl); + cancel_delayed_work(&b->work); + + clear_bit(BTREE_NODE_dirty, &b->flags); + change_bit(BTREE_NODE_write_idx, &b->flags); + + bch_check_key_order(b, i); + BUG_ON(b->written && !i->keys); + + do_btree_write(b); + + pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); + + b->written += set_blocks(i, b->c); + atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, + &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); + + bch_btree_sort_lazy(b); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(b); +} + +static void btree_write_work(struct work_struct *w) +{ + struct btree *b = container_of(to_delayed_work(w), struct btree, work); + + down_write(&b->lock); + + if (btree_node_dirty(b)) + __btree_write(b); + up_write(&b->lock); +} + +void bch_btree_write(struct btree *b, bool now, struct btree_op *op) +{ + struct bset *i = b->sets[b->nsets].data; + struct btree_write *w = btree_current_write(b); + + BUG_ON(b->written && + (b->written >= btree_blocks(b) || + i->seq != b->sets[0].data->seq || + !i->keys)); + + if (!btree_node_dirty(b)) { + set_btree_node_dirty(b); + queue_delayed_work(btree_io_wq, &b->work, + msecs_to_jiffies(30000)); + } + + w->prio_blocked += b->prio_blocked; + b->prio_blocked = 0; + + if (op && op->journal && !b->level) { + if (w->journal && + journal_pin_cmp(b->c, w, op)) { + atomic_dec_bug(w->journal); + w->journal = NULL; + } + + if (!w->journal) { + w->journal = op->journal; + atomic_inc(w->journal); + } + } + + if (current->bio_list) + return; + + /* Force write if set is too big */ + if (now || + b->level || + set_bytes(i) > PAGE_SIZE - 48) { + if (op && now) { + /* Must wait on multiple writes */ + BUG_ON(w->owner); + w->owner = &op->cl; + closure_get(&op->cl); + } + + __btree_write(b); + } + BUG_ON(!b->written); +} + +/* + * Btree in memory cache - allocation/freeing + * mca -> memory cache + */ + +static void mca_reinit(struct btree *b) +{ + unsigned i; + + b->flags = 0; + b->written = 0; + b->nsets = 0; + + for (i = 0; i < MAX_BSETS; i++) + b->sets[i].size = 0; + /* + * Second loop starts at 1 because b->sets[0]->data is the memory we + * allocated + */ + for (i = 1; i < MAX_BSETS; i++) + b->sets[i].data = NULL; +} + +#define mca_reserve(c) (((c->root && c->root->level) \ + ? c->root->level : 1) * 8 + 16) +#define mca_can_free(c) \ + max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) + +static void mca_data_free(struct btree *b) +{ + struct bset_tree *t = b->sets; + BUG_ON(!closure_is_unlocked(&b->io.cl)); + + if (bset_prev_bytes(b) < PAGE_SIZE) + kfree(t->prev); + else + free_pages((unsigned long) t->prev, + get_order(bset_prev_bytes(b))); + + if (bset_tree_bytes(b) < PAGE_SIZE) + kfree(t->tree); + else + free_pages((unsigned long) t->tree, + get_order(bset_tree_bytes(b))); + + free_pages((unsigned long) t->data, b->page_order); + + t->prev = NULL; + t->tree = NULL; + t->data = NULL; + list_move(&b->list, &b->c->btree_cache_freed); + b->c->bucket_cache_used--; +} + +static void mca_bucket_free(struct btree *b) +{ + BUG_ON(btree_node_dirty(b)); + + b->key.ptr[0] = 0; + hlist_del_init_rcu(&b->hash); + list_move(&b->list, &b->c->btree_cache_freeable); +} + +static unsigned btree_order(struct bkey *k) +{ + return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); +} + +static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) +{ + struct bset_tree *t = b->sets; + BUG_ON(t->data); + + b->page_order = max_t(unsigned, + ilog2(b->c->btree_pages), + btree_order(k)); + + t->data = (void *) __get_free_pages(gfp, b->page_order); + if (!t->data) + goto err; + + t->tree = bset_tree_bytes(b) < PAGE_SIZE + ? kmalloc(bset_tree_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); + if (!t->tree) + goto err; + + t->prev = bset_prev_bytes(b) < PAGE_SIZE + ? kmalloc(bset_prev_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); + if (!t->prev) + goto err; + + list_move(&b->list, &b->c->btree_cache); + b->c->bucket_cache_used++; + return; +err: + mca_data_free(b); +} + +static struct btree *mca_bucket_alloc(struct cache_set *c, + struct bkey *k, gfp_t gfp) +{ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + + init_rwsem(&b->lock); + lockdep_set_novalidate_class(&b->lock); + INIT_LIST_HEAD(&b->list); + INIT_DELAYED_WORK(&b->work, btree_write_work); + b->c = c; + closure_init_unlocked(&b->io); + + mca_data_alloc(b, k, gfp); + return b; +} + +static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +{ + lockdep_assert_held(&b->c->bucket_lock); + + if (!down_write_trylock(&b->lock)) + return -ENOMEM; + + if (b->page_order < min_order) { + rw_unlock(true, b); + return -ENOMEM; + } + + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + + if (cl && btree_node_dirty(b)) + bch_btree_write(b, true, NULL); + + if (cl) + closure_wait_event_async(&b->io.wait, cl, + atomic_read(&b->io.cl.remaining) == -1); + + if (btree_node_dirty(b) || + !closure_is_unlocked(&b->io.cl) || + work_pending(&b->work.work)) { + rw_unlock(true, b); + return -EAGAIN; + } + + return 0; +} + +static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct btree *b, *t; + unsigned long i, nr = sc->nr_to_scan; + + if (c->shrinker_disabled) + return 0; + + if (c->try_harder) + return 0; + + /* + * If nr == 0, we're supposed to return the number of items we have + * cached. Not allowed to return -1. + */ + if (!nr) + return mca_can_free(c) * c->btree_pages; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_WAIT) + mutex_lock(&c->bucket_lock); + else if (!mutex_trylock(&c->bucket_lock)) + return -1; + + nr /= c->btree_pages; + nr = min_t(unsigned long, nr, mca_can_free(c)); + + i = 0; + list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + if (!nr) + break; + + if (++i > 3 && + !mca_reap(b, NULL, 0)) { + mca_data_free(b); + rw_unlock(true, b); + --nr; + } + } + + /* + * Can happen right when we first start up, before we've read in any + * btree nodes + */ + if (list_empty(&c->btree_cache)) + goto out; + + for (i = 0; nr && i < c->bucket_cache_used; i++) { + b = list_first_entry(&c->btree_cache, struct btree, list); + list_rotate_left(&c->btree_cache); + + if (!b->accessed && + !mca_reap(b, NULL, 0)) { + mca_bucket_free(b); + mca_data_free(b); + rw_unlock(true, b); + --nr; + } else + b->accessed = 0; + } +out: + nr = mca_can_free(c) * c->btree_pages; + mutex_unlock(&c->bucket_lock); + return nr; +} + +void bch_btree_cache_free(struct cache_set *c) +{ + struct btree *b; + struct closure cl; + closure_init_stack(&cl); + + if (c->shrink.list.next) + unregister_shrinker(&c->shrink); + + mutex_lock(&c->bucket_lock); + +#ifdef CONFIG_BCACHE_DEBUG + if (c->verify_data) + list_move(&c->verify_data->list, &c->btree_cache); +#endif + + list_splice(&c->btree_cache_freeable, + &c->btree_cache); + + while (!list_empty(&c->btree_cache)) { + b = list_first_entry(&c->btree_cache, struct btree, list); + + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + + mca_data_free(b); + } + + while (!list_empty(&c->btree_cache_freed)) { + b = list_first_entry(&c->btree_cache_freed, + struct btree, list); + list_del(&b->list); + cancel_delayed_work_sync(&b->work); + kfree(b); + } + + mutex_unlock(&c->bucket_lock); +} + +int bch_btree_cache_alloc(struct cache_set *c) +{ + unsigned i; + + /* XXX: doesn't check for errors */ + + closure_init_unlocked(&c->gc); + + for (i = 0; i < mca_reserve(c); i++) + mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + + list_splice_init(&c->btree_cache, + &c->btree_cache_freeable); + +#ifdef CONFIG_BCACHE_DEBUG + mutex_init(&c->verify_lock); + + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + + if (c->verify_data && + c->verify_data->sets[0].data) + list_del_init(&c->verify_data->list); + else + c->verify_data = NULL; +#endif + + c->shrink.shrink = bch_mca_shrink; + c->shrink.seeks = 4; + c->shrink.batch = c->btree_pages * 2; + register_shrinker(&c->shrink); + + return 0; +} + +/* Btree in memory cache - hash table */ + +static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) +{ + return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; +} + +static struct btree *mca_find(struct cache_set *c, struct bkey *k) +{ + struct btree *b; + + rcu_read_lock(); + hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) + if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) + goto out; + b = NULL; +out: + rcu_read_unlock(); + return b; +} + +static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, + int level, struct closure *cl) +{ + int ret = -ENOMEM; + struct btree *i; + + if (!cl) + return ERR_PTR(-ENOMEM); + + /* + * Trying to free up some memory - i.e. reuse some btree nodes - may + * require initiating IO to flush the dirty part of the node. If we're + * running under generic_make_request(), that IO will never finish and + * we would deadlock. Returning -EAGAIN causes the cache lookup code to + * punt to workqueue and retry. + */ + if (current->bio_list) + return ERR_PTR(-EAGAIN); + + if (c->try_harder && c->try_harder != cl) { + closure_wait_event_async(&c->try_wait, cl, !c->try_harder); + return ERR_PTR(-EAGAIN); + } + + /* XXX: tracepoint */ + c->try_harder = cl; + c->try_harder_start = local_clock(); +retry: + list_for_each_entry_reverse(i, &c->btree_cache, list) { + int r = mca_reap(i, cl, btree_order(k)); + if (!r) + return i; + if (r != -ENOMEM) + ret = r; + } + + if (ret == -EAGAIN && + closure_blocking(cl)) { + mutex_unlock(&c->bucket_lock); + closure_sync(cl); + mutex_lock(&c->bucket_lock); + goto retry; + } + + return ERR_PTR(ret); +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) +{ + if (c->try_harder == cl) { + bch_time_stats_update(&c->try_harder_time, c->try_harder_start); + c->try_harder = NULL; + __closure_wake_up(&c->try_wait); + } +} + +static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, + int level, struct closure *cl) +{ + struct btree *b; + + lockdep_assert_held(&c->bucket_lock); + + if (mca_find(c, k)) + return NULL; + + /* btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b, &c->btree_cache_freeable, list) + if (!mca_reap(b, NULL, btree_order(k))) + goto out; + + /* We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &c->btree_cache_freed, list) + if (!mca_reap(b, NULL, 0)) { + mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); + if (!b->sets[0].data) + goto err; + else + goto out; + } + + b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); + if (!b) + goto err; + + BUG_ON(!down_write_trylock(&b->lock)); + if (!b->sets->data) + goto err; +out: + BUG_ON(!closure_is_unlocked(&b->io.cl)); + + bkey_copy(&b->key, k); + list_move(&b->list, &c->btree_cache); + hlist_del_init_rcu(&b->hash); + hlist_add_head_rcu(&b->hash, mca_hash(c, k)); + + lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); + b->level = level; + + mca_reinit(b); + + return b; +err: + if (b) + rw_unlock(true, b); + + b = mca_cannibalize(c, k, level, cl); + if (!IS_ERR(b)) + goto out; + + return b; +} + +/** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * If IO is necessary, it uses the closure embedded in struct btree_op to wait; + * if that closure is in non blocking mode, will return -EAGAIN. + * + * The btree node will have either a read or a write lock held, depending on + * level and op->lock. + */ +struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, + int level, struct btree_op *op) +{ + int i = 0; + bool write = level <= op->lock; + struct btree *b; + + BUG_ON(level < 0); +retry: + b = mca_find(c, k); + + if (!b) { + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, k, level, &op->cl); + mutex_unlock(&c->bucket_lock); + + if (!b) + goto retry; + if (IS_ERR(b)) + return b; + + bch_btree_read(b); + + if (!write) + downgrade_write(&b->lock); + } else { + rw_lock(write, b, level); + if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { + rw_unlock(write, b); + goto retry; + } + BUG_ON(b->level != level); + } + + b->accessed = 1; + + for (; i <= b->nsets && b->sets[i].size; i++) { + prefetch(b->sets[i].tree); + prefetch(b->sets[i].data); + } + + for (; i <= b->nsets; i++) + prefetch(b->sets[i].data); + + if (!closure_wait_event(&b->io.wait, &op->cl, + btree_node_read_done(b))) { + rw_unlock(write, b); + b = ERR_PTR(-EAGAIN); + } else if (btree_node_io_error(b)) { + rw_unlock(write, b); + b = ERR_PTR(-EIO); + } else + BUG_ON(!b->written); + + return b; +} + +static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) +{ + struct btree *b; + + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, k, level, NULL); + mutex_unlock(&c->bucket_lock); + + if (!IS_ERR_OR_NULL(b)) { + bch_btree_read(b); + rw_unlock(true, b); + } +} + +/* Btree alloc */ + +static void btree_node_free(struct btree *b, struct btree_op *op) +{ + unsigned i; + + /* + * The BUG_ON() in btree_node_get() implies that we must have a write + * lock on parent to free or even invalidate a node + */ + BUG_ON(op->lock <= b->level); + BUG_ON(b == b->c->root); + pr_debug("bucket %s", pbtree(b)); + + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + + if (b->prio_blocked && + !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) + wake_up(&b->c->alloc_wait); + + b->prio_blocked = 0; + + cancel_delayed_work(&b->work); + + mutex_lock(&b->c->bucket_lock); + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); + + bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + PTR_BUCKET(b->c, &b->key, i)); + } + + bch_bucket_free(b->c, &b->key); + mca_bucket_free(b); + mutex_unlock(&b->c->bucket_lock); +} + +struct btree *bch_btree_node_alloc(struct cache_set *c, int level, + struct closure *cl) +{ + BKEY_PADDED(key) k; + struct btree *b = ERR_PTR(-EAGAIN); + + mutex_lock(&c->bucket_lock); +retry: + if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) + goto err; + + SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); + + b = mca_alloc(c, &k.key, level, cl); + if (IS_ERR(b)) + goto err_free; + + if (!b) { + cache_bug(c, + "Tried to allocate bucket that was in btree cache"); + __bkey_put(c, &k.key); + goto retry; + } + + set_btree_node_read_done(b); + b->accessed = 1; + bch_bset_init_next(b); + + mutex_unlock(&c->bucket_lock); + return b; +err_free: + bch_bucket_free(c, &k.key); + __bkey_put(c, &k.key); +err: + mutex_unlock(&c->bucket_lock); + return b; +} + +static struct btree *btree_node_alloc_replacement(struct btree *b, + struct closure *cl) +{ + struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); + if (!IS_ERR_OR_NULL(n)) + bch_btree_sort_into(b, n); + + return n; +} + +/* Garbage collection */ + +uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) +{ + uint8_t stale = 0; + unsigned i; + struct bucket *g; + + /* + * ptr_invalid() can't return true for the keys that mark btree nodes as + * freed, but since ptr_bad() returns true we'll never actually use them + * for anything and thus we don't want mark their pointers here + */ + if (!bkey_cmp(k, &ZERO_KEY)) + return stale; + + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(c, k, i)) + continue; + + g = PTR_BUCKET(c, k, i); + + if (gen_after(g->gc_gen, PTR_GEN(k, i))) + g->gc_gen = PTR_GEN(k, i); + + if (ptr_stale(c, k, i)) { + stale = max(stale, ptr_stale(c, k, i)); + continue; + } + + cache_bug_on(GC_MARK(g) && + (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), + c, "inconsistent ptrs: mark = %llu, level = %i", + GC_MARK(g), level); + + if (level) + SET_GC_MARK(g, GC_MARK_METADATA); + else if (KEY_DIRTY(k)) + SET_GC_MARK(g, GC_MARK_DIRTY); + + /* guard against overflow */ + SET_GC_SECTORS_USED(g, min_t(unsigned, + GC_SECTORS_USED(g) + KEY_SIZE(k), + (1 << 14) - 1)); + + BUG_ON(!GC_SECTORS_USED(g)); + } + + return stale; +} + +#define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) + +static int btree_gc_mark_node(struct btree *b, unsigned *keys, + struct gc_stat *gc) +{ + uint8_t stale = 0; + unsigned last_dev = -1; + struct bcache_device *d = NULL; + struct bkey *k; + struct btree_iter iter; + struct bset_tree *t; + + gc->nodes++; + + for_each_key_filter(b, k, &iter, bch_ptr_invalid) { + if (last_dev != KEY_INODE(k)) { + last_dev = KEY_INODE(k); + + d = KEY_INODE(k) < b->c->nr_uuids + ? b->c->devices[last_dev] + : NULL; + } + + stale = max(stale, btree_mark_key(b, k)); + + if (bch_ptr_bad(b, k)) + continue; + + *keys += bkey_u64s(k); + + gc->key_bytes += bkey_u64s(k); + gc->nkeys++; + + gc->data += KEY_SIZE(k); + if (KEY_DIRTY(k)) { + gc->dirty += KEY_SIZE(k); + if (d) + d->sectors_dirty_gc += KEY_SIZE(k); + } + } + + for (t = b->sets; t <= &b->sets[b->nsets]; t++) + btree_bug_on(t->size && + bset_written(b, t) && + bkey_cmp(&b->key, &t->end) < 0, + b, "found short btree key in gc"); + + return stale; +} + +static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, + struct btree_op *op) +{ + /* + * We block priorities from being written for the duration of garbage + * collection, so we can't sleep in btree_alloc() -> + * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it + * our closure. + */ + struct btree *n = btree_node_alloc_replacement(b, NULL); + + if (!IS_ERR_OR_NULL(n)) { + swap(b, n); + + memcpy(k->ptr, b->key.ptr, + sizeof(uint64_t) * KEY_PTRS(&b->key)); + + __bkey_put(b->c, &b->key); + atomic_inc(&b->c->prio_blocked); + b->prio_blocked++; + + btree_node_free(n, op); + up_write(&n->lock); + } + + return b; +} + +/* + * Leaving this at 2 until we've got incremental garbage collection done; it + * could be higher (and has been tested with 4) except that garbage collection + * could take much longer, adversely affecting latency. + */ +#define GC_MERGE_NODES 2U + +struct gc_merge_info { + struct btree *b; + struct bkey *k; + unsigned keys; +}; + +static void btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct gc_stat *gc, struct gc_merge_info *r) +{ + unsigned nodes = 0, keys = 0, blocks; + int i; + + while (nodes < GC_MERGE_NODES && r[nodes].b) + keys += r[nodes++].keys; + + blocks = btree_default_blocks(b->c) * 2 / 3; + + if (nodes < 2 || + __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) + return; + + for (i = nodes - 1; i >= 0; --i) { + if (r[i].b->written) + r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); + + if (r[i].b->written) + return; + } + + for (i = nodes - 1; i > 0; --i) { + struct bset *n1 = r[i].b->sets->data; + struct bset *n2 = r[i - 1].b->sets->data; + struct bkey *k, *last = NULL; + + keys = 0; + + if (i == 1) { + /* + * Last node we're not getting rid of - we're getting + * rid of the node at r[0]. Have to try and fit all of + * the remaining keys into this node; we can't ensure + * they will always fit due to rounding and variable + * length keys (shouldn't be possible in practice, + * though) + */ + if (__set_blocks(n1, n1->keys + r->keys, + b->c) > btree_blocks(r[i].b)) + return; + + keys = n2->keys; + last = &r->b->key; + } else + for (k = n2->start; + k < end(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), b->c) > blocks) + break; + |