author    Linus Torvalds <torvalds@linux-foundation.org>  2014-01-30 11:40:10 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-01-30 11:40:10 -0800
commit    53d8ab29f8f6d67e37857b68189b38fa3d87dd8e (patch)
tree      3c770b58f0404c67b1b084f626dcafa8464c7512 /drivers/md
parent    f568849edac8611d603e00bd6cbbcfea09395ae6 (diff)
parent    14424be4dbfa127001ad623869f7ee4c7635e991 (diff)
Merge branch 'for-3.14/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver changes from Jens Axboe:

 - bcache update from Kent Overstreet.

 - two bcache fixes from Nicholas Swenson.

 - cciss pci init error fix from Andrew.

 - underflow fix in the parallel IDE pg_write code from Dan Carpenter.
   I'm sure the 1 (or 0) users of that are now happy.

 - two PCI related fixes for sx8 from Jingoo Han.

 - floppy init fix for first block read from Jiri Kosina.

 - pktcdvd error return miss fix from Julia Lawall.

 - removal of IRQF_SHARED from the SEGA Dreamcast CD-ROM code from
   Michael Opdenacker.

 - comment typo fix for the loop driver from Olaf Hering.

 - potential oops fix for null_blk from Raghavendra K T.

 - two fixes from Sam Bradshaw (Micron) for the mtip32xx driver, fixing
   an OOM problem and a problem with handling security locked conditions

* 'for-3.14/drivers' of git://git.kernel.dk/linux-block: (47 commits)
  mg_disk: Spelling s/finised/finished/
  null_blk: Null pointer deference problem in alloc_page_buffers
  mtip32xx: Correctly handle security locked condition
  mtip32xx: Make SGL container per-command to eliminate high order dma allocation
  drivers/block/loop.c: fix comment typo in loop_config_discard
  drivers/block/cciss.c:cciss_init_one(): use proper errnos
  drivers/block/paride/pg.c: underflow bug in pg_write()
  drivers/block/sx8.c: remove unnecessary pci_set_drvdata()
  drivers/block/sx8.c: use module_pci_driver()
  floppy: bail out in open() if drive is not responding to block0 read
  bcache: Fix auxiliary search trees for key size > cacheline size
  bcache: Don't return -EINTR when insert finished
  bcache: Improve bucket_prio() calculation
  bcache: Add bch_bkey_equal_header()
  bcache: update bch_bkey_try_merge
  bcache: Move insert_fixup() to btree_keys_ops
  bcache: Convert sorting to btree_keys
  bcache: Convert debug code to btree_keys
  bcache: Convert btree_iter to struct btree_keys
  bcache: Refactor bset_tree sysfs stats
  ...
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/Makefile      5
-rw-r--r--  drivers/md/bcache/alloc.c      89
-rw-r--r--  drivers/md/bcache/bcache.h     82
-rw-r--r--  drivers/md/bcache/bset.c      904
-rw-r--r--  drivers/md/bcache/bset.h      440
-rw-r--r--  drivers/md/bcache/btree.c     676
-rw-r--r--  drivers/md/bcache/btree.h      62
-rw-r--r--  drivers/md/bcache/closure.c    90
-rw-r--r--  drivers/md/bcache/closure.h   355
-rw-r--r--  drivers/md/bcache/debug.c     247
-rw-r--r--  drivers/md/bcache/debug.h      27
-rw-r--r--  drivers/md/bcache/extents.c   616
-rw-r--r--  drivers/md/bcache/extents.h    13
-rw-r--r--  drivers/md/bcache/journal.c    75
-rw-r--r--  drivers/md/bcache/journal.h     1
-rw-r--r--  drivers/md/bcache/movinggc.c    2
-rw-r--r--  drivers/md/bcache/request.c    72
-rw-r--r--  drivers/md/bcache/request.h    21
-rw-r--r--  drivers/md/bcache/super.c     103
-rw-r--r--  drivers/md/bcache/sysfs.c      79
-rw-r--r--  drivers/md/bcache/util.h        8
-rw-r--r--  drivers/md/raid5.c              1
22 files changed, 2213 insertions, 1755 deletions
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index 0e9c82523be6..c488b846f831 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -1,7 +1,8 @@
obj-$(CONFIG_BCACHE) += bcache.o
-bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\
- movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o
+bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
+ io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
+ util.o writeback.o
CFLAGS_request.o += -Iblock
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4c9852d92b0a..c0d37d082443 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -132,10 +132,16 @@ bool bch_bucket_add_unused(struct cache *ca, struct bucket *b)
{
BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b));
- if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] &&
- CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO)
- return false;
+ if (CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) {
+ unsigned i;
+
+ for (i = 0; i < RESERVE_NONE; i++)
+ if (!fifo_full(&ca->free[i]))
+ goto add;
+ return false;
+ }
+add:
b->prio = 0;
if (can_inc_bucket_gen(b) &&
@@ -162,8 +168,21 @@ static void invalidate_one_bucket(struct cache *ca, struct bucket *b)
fifo_push(&ca->free_inc, b - ca->buckets);
}
-#define bucket_prio(b) \
- (((unsigned) (b->prio - ca->set->min_prio)) * GC_SECTORS_USED(b))
+/*
+ * Determines what order we're going to reuse buckets, smallest bucket_prio()
+ * first: we also take into account the number of sectors of live data in that
+ * bucket, and in order for that multiply to make sense we have to scale bucket prios.
+ *
+ * Thus, we scale the bucket priorities so that the bucket with the smallest
+ * prio is worth 1/8th of what INITIAL_PRIO is worth.
+ */
+
+#define bucket_prio(b) \
+({ \
+ unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \
+ \
+ (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \
+})
#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r))
#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r))
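For reference, a minimal standalone sketch of the new scaling (not part of the patch): INITIAL_PRIO comes from bcache.h, while the sample min_prio, prio and sector counts below are made up for illustration.

#include <stdio.h>

#define INITIAL_PRIO 32768U

/* Same arithmetic as the new bucket_prio(), with the cache-set and
 * per-bucket fields passed in explicitly. */
static unsigned bucket_prio(unsigned set_min_prio, unsigned prio,
                            unsigned sectors_used)
{
        /* Floor added to every bucket: 1/8th of the current prio range */
        unsigned min_prio = (INITIAL_PRIO - set_min_prio) / 8;

        return (prio - set_min_prio + min_prio) * sectors_used;
}

int main(void)
{
        /* A bucket already at the set-wide minimum prio still gets a nonzero
         * weight, so multiplying by its live sectors cannot zero it out. */
        printf("%u\n", bucket_prio(20000, 20000, 4)); /* (0 + 1596) * 4     */
        printf("%u\n", bucket_prio(20000, 30000, 4)); /* (10000 + 1596) * 4 */
        return 0;
}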
@@ -304,6 +323,21 @@ do { \
__set_current_state(TASK_RUNNING); \
} while (0)
+static int bch_allocator_push(struct cache *ca, long bucket)
+{
+ unsigned i;
+
+ /* Prios/gens are actually the most important reserve */
+ if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
+ return true;
+
+ for (i = 0; i < RESERVE_NR; i++)
+ if (fifo_push(&ca->free[i], bucket))
+ return true;
+
+ return false;
+}
+
static int bch_allocator_thread(void *arg)
{
struct cache *ca = arg;
@@ -336,9 +370,7 @@ static int bch_allocator_thread(void *arg)
mutex_lock(&ca->set->bucket_lock);
}
- allocator_wait(ca, !fifo_full(&ca->free));
-
- fifo_push(&ca->free, bucket);
+ allocator_wait(ca, bch_allocator_push(ca, bucket));
wake_up(&ca->set->bucket_wait);
}
@@ -365,34 +397,29 @@ static int bch_allocator_thread(void *arg)
}
}
-long bch_bucket_alloc(struct cache *ca, unsigned watermark, bool wait)
+long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait)
{
DEFINE_WAIT(w);
struct bucket *b;
long r;
/* fastpath */
- if (fifo_used(&ca->free) > ca->watermark[watermark]) {
- fifo_pop(&ca->free, r);
+ if (fifo_pop(&ca->free[RESERVE_NONE], r) ||
+ fifo_pop(&ca->free[reserve], r))
goto out;
- }
if (!wait)
return -1;
- while (1) {
- if (fifo_used(&ca->free) > ca->watermark[watermark]) {
- fifo_pop(&ca->free, r);
- break;
- }
-
+ do {
prepare_to_wait(&ca->set->bucket_wait, &w,
TASK_UNINTERRUPTIBLE);
mutex_unlock(&ca->set->bucket_lock);
schedule();
mutex_lock(&ca->set->bucket_lock);
- }
+ } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
+ !fifo_pop(&ca->free[reserve], r));
finish_wait(&ca->set->bucket_wait, &w);
out:
@@ -401,12 +428,14 @@ out:
if (expensive_debug_checks(ca->set)) {
size_t iter;
long i;
+ unsigned j;
for (iter = 0; iter < prio_buckets(ca) * 2; iter++)
BUG_ON(ca->prio_buckets[iter] == (uint64_t) r);
- fifo_for_each(i, &ca->free, iter)
- BUG_ON(i == r);
+ for (j = 0; j < RESERVE_NR; j++)
+ fifo_for_each(i, &ca->free[j], iter)
+ BUG_ON(i == r);
fifo_for_each(i, &ca->free_inc, iter)
BUG_ON(i == r);
fifo_for_each(i, &ca->unused, iter)
@@ -419,7 +448,7 @@ out:
SET_GC_SECTORS_USED(b, ca->sb.bucket_size);
- if (watermark <= WATERMARK_METADATA) {
+ if (reserve <= RESERVE_PRIO) {
SET_GC_MARK(b, GC_MARK_METADATA);
SET_GC_MOVE(b, 0);
b->prio = BTREE_PRIO;
@@ -445,7 +474,7 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
}
}
-int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
+int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
struct bkey *k, int n, bool wait)
{
int i;
@@ -459,7 +488,7 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
for (i = 0; i < n; i++) {
struct cache *ca = c->cache_by_alloc[i];
- long b = bch_bucket_alloc(ca, watermark, wait);
+ long b = bch_bucket_alloc(ca, reserve, wait);
if (b == -1)
goto err;
@@ -478,12 +507,12 @@ err:
return -1;
}
-int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
+int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve,
struct bkey *k, int n, bool wait)
{
int ret;
mutex_lock(&c->bucket_lock);
- ret = __bch_bucket_alloc_set(c, watermark, k, n, wait);
+ ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
mutex_unlock(&c->bucket_lock);
return ret;
}
@@ -573,8 +602,8 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors,
while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) {
unsigned watermark = write_prio
- ? WATERMARK_MOVINGGC
- : WATERMARK_NONE;
+ ? RESERVE_MOVINGGC
+ : RESERVE_NONE;
spin_unlock(&c->data_bucket_lock);
@@ -689,7 +718,7 @@ int bch_cache_allocator_init(struct cache *ca)
* Then 8 for btree allocations
* Then half for the moving garbage collector
*/
-
+#if 0
ca->watermark[WATERMARK_PRIO] = 0;
ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
@@ -699,6 +728,6 @@ int bch_cache_allocator_init(struct cache *ca)
ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
ca->watermark[WATERMARK_MOVINGGC];
-
+#endif
return 0;
}
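To summarize the alloc.c changes above: the single free FIFO with watermarks is replaced by one FIFO per reserve, the allocator thread tops up the prio/gen reserve first and then any reserve with room, and allocation drains the general-purpose reserve before touching the caller's dedicated one. Below is a rough userspace sketch of that policy, assuming a toy fixed-size ring; struct fifo, FIFO_SIZE, allocator_push() and bucket_alloc() are stand-ins invented for the sketch, and only enum alloc_reserve matches the patch.

#include <stdbool.h>
#include <stdio.h>

enum alloc_reserve {
        RESERVE_BTREE,
        RESERVE_PRIO,
        RESERVE_MOVINGGC,
        RESERVE_NONE,
        RESERVE_NR,
};

#define FIFO_SIZE 8U

struct fifo {
        long            data[FIFO_SIZE];
        unsigned        head, tail;     /* head: next pop, tail: next push */
};

static bool fifo_push(struct fifo *f, long v)
{
        if (f->tail - f->head == FIFO_SIZE)
                return false;
        f->data[f->tail++ % FIFO_SIZE] = v;
        return true;
}

static bool fifo_pop(struct fifo *f, long *v)
{
        if (f->tail == f->head)
                return false;
        *v = f->data[f->head++ % FIFO_SIZE];
        return true;
}

struct cache {
        struct fifo     free[RESERVE_NR];
};

/* Mirrors bch_allocator_push(): prios/gens are the most important reserve,
 * so it is topped up first; after that any reserve with room gets the bucket. */
static bool allocator_push(struct cache *ca, long bucket)
{
        unsigned i;

        if (fifo_push(&ca->free[RESERVE_PRIO], bucket))
                return true;

        for (i = 0; i < RESERVE_NR; i++)
                if (fifo_push(&ca->free[i], bucket))
                        return true;

        return false;
}

/* Mirrors the new bch_bucket_alloc() fastpath: drain the general-purpose
 * reserve first, and only then dip into the caller's dedicated reserve. */
static bool bucket_alloc(struct cache *ca, enum alloc_reserve reserve, long *r)
{
        return fifo_pop(&ca->free[RESERVE_NONE], r) ||
               fifo_pop(&ca->free[reserve], r);
}

int main(void)
{
        struct cache ca = { 0 };
        long b, r;

        for (b = 0; allocator_push(&ca, b); b++)
                ;               /* fill every reserve */

        /* Moving GC drains RESERVE_NONE, then its own reserve; the btree and
         * prio reserves stay untouched, which is the point of the split. */
        while (bucket_alloc(&ca, RESERVE_MOVINGGC, &r))
                printf("allocated bucket %ld\n", r);
        return 0;
}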
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dbdbca5a9591..0c707e4f4eaf 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -187,6 +187,7 @@
#include <linux/types.h>
#include <linux/workqueue.h>
+#include "bset.h"
#include "util.h"
#include "closure.h"
@@ -309,7 +310,8 @@ struct cached_dev {
struct cache_sb sb;
struct bio sb_bio;
struct bio_vec sb_bv[1];
- struct closure_with_waitlist sb_write;
+ struct closure sb_write;
+ struct semaphore sb_write_mutex;
/* Refcount on the cache set. Always nonzero when we're caching. */
atomic_t count;
@@ -382,12 +384,12 @@ struct cached_dev {
unsigned writeback_rate_p_term_inverse;
};
-enum alloc_watermarks {
- WATERMARK_PRIO,
- WATERMARK_METADATA,
- WATERMARK_MOVINGGC,
- WATERMARK_NONE,
- WATERMARK_MAX
+enum alloc_reserve {
+ RESERVE_BTREE,
+ RESERVE_PRIO,
+ RESERVE_MOVINGGC,
+ RESERVE_NONE,
+ RESERVE_NR,
};
struct cache {
@@ -399,8 +401,6 @@ struct cache {
struct kobject kobj;
struct block_device *bdev;
- unsigned watermark[WATERMARK_MAX];
-
struct task_struct *alloc_thread;
struct closure prio;
@@ -429,7 +429,7 @@ struct cache {
* because all the data they contained was overwritten), so we only
* need to discard them before they can be moved to the free list.
*/
- DECLARE_FIFO(long, free);
+ DECLARE_FIFO(long, free)[RESERVE_NR];
DECLARE_FIFO(long, free_inc);
DECLARE_FIFO(long, unused);
@@ -514,7 +514,8 @@ struct cache_set {
uint64_t cached_dev_sectors;
struct closure caching;
- struct closure_with_waitlist sb_write;
+ struct closure sb_write;
+ struct semaphore sb_write_mutex;
mempool_t *search;
mempool_t *bio_meta;
@@ -629,13 +630,15 @@ struct cache_set {
#ifdef CONFIG_BCACHE_DEBUG
struct btree *verify_data;
+ struct bset *verify_ondisk;
struct mutex verify_lock;
#endif
unsigned nr_uuids;
struct uuid_entry *uuids;
BKEY_PADDED(uuid_bucket);
- struct closure_with_waitlist uuid_write;
+ struct closure uuid_write;
+ struct semaphore uuid_write_mutex;
/*
* A btree node on disk could have too many bsets for an iterator to fit
@@ -643,13 +646,7 @@ struct cache_set {
*/
mempool_t *fill_iter;
- /*
- * btree_sort() is a merge sort and requires temporary space - single
- * element mempool
- */
- struct mutex sort_lock;
- struct bset *sort;
- unsigned sort_crit_factor;
+ struct bset_sort_state sort;
/* List of buckets we're currently writing data to */
struct list_head data_buckets;
@@ -665,7 +662,6 @@ struct cache_set {
unsigned congested_read_threshold_us;
unsigned congested_write_threshold_us;
- struct time_stats sort_time;
struct time_stats btree_gc_time;
struct time_stats btree_split_time;
struct time_stats btree_read_time;
@@ -683,9 +679,9 @@ struct cache_set {
unsigned error_decay;
unsigned short journal_delay_ms;
+ bool expensive_debug_checks;
unsigned verify:1;
unsigned key_merging_disabled:1;
- unsigned expensive_debug_checks:1;
unsigned gc_always_rewrite:1;
unsigned shrinker_disabled:1;
unsigned copy_gc_enabled:1;
@@ -707,13 +703,8 @@ struct bbio {
struct bio bio;
};
-static inline unsigned local_clock_us(void)
-{
- return local_clock() >> 10;
-}
-
#define BTREE_PRIO USHRT_MAX
-#define INITIAL_PRIO 32768
+#define INITIAL_PRIO 32768U
#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE)
#define btree_blocks(b) \
@@ -726,21 +717,6 @@ static inline unsigned local_clock_us(void)
#define bucket_bytes(c) ((c)->sb.bucket_size << 9)
#define block_bytes(c) ((c)->sb.block_size << 9)
-#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t))
-#define set_bytes(i) __set_bytes(i, i->keys)
-
-#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c))
-#define set_blocks(i, c) __set_blocks(i, (i)->keys, c)
-
-#define node(i, j) ((struct bkey *) ((i)->d + (j)))
-#define end(i) node(i, (i)->keys)
-
-#define index(i, b) \
- ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \
- block_bytes(b->c)))
-
-#define btree_data_space(b) (PAGE_SIZE << (b)->page_order)
-
#define prios_per_bucket(c) \
((bucket_bytes(c) - sizeof(struct prio_set)) / \
sizeof(struct bucket_disk))
@@ -783,20 +759,34 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c,
return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr);
}
-/* Btree key macros */
+static inline uint8_t gen_after(uint8_t a, uint8_t b)
+{
+ uint8_t r = a - b;
+ return r > 128U ? 0 : r;
+}
-static inline void bkey_init(struct bkey *k)
+static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
+ unsigned i)
{
- *k = ZERO_KEY;
+ return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
}
+static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
+ unsigned i)
+{
+ return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+}
+
+/* Btree key macros */
+
/*
* This is used for various on disk data structures - cache_sb, prio_set, bset,
* jset: The checksum is _always_ the first 8 bytes of these structs
*/
#define csum_set(i) \
bch_crc64(((void *) (i)) + sizeof(uint64_t), \
- ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t)))
+ ((void *) bset_bkey_last(i)) - \
+ (((void *) (i)) + sizeof(uint64_t)))
/* Error handling macros */
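A self-contained sketch of the gen_after()/ptr_stale() arithmetic moved into bcache.h above (the generation numbers in main() are arbitrary): bucket generations are 8-bit counters that wrap, so the distance is computed modulo 256 and clamped to zero when the difference indicates the other operand is actually ahead.

#include <stdint.h>
#include <assert.h>

static uint8_t gen_after(uint8_t a, uint8_t b)
{
        uint8_t r = a - b;      /* wraps modulo 256 */

        return r > 128U ? 0 : r;        /* b is ahead of a: not after at all */
}

int main(void)
{
        assert(gen_after(10, 7) == 3);  /* plain case                        */
        assert(gen_after(3, 250) == 9); /* bucket gen wrapped past 255       */
        assert(gen_after(7, 10) == 0);  /* key's ptr gen is newer: not stale */
        return 0;
}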
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 7d388b8bb50e..4f6b5940e609 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -5,30 +5,134 @@
* Copyright 2012 Google, Inc.
*/
-#include "bcache.h"
-#include "btree.h"
-#include "debug.h"
+#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
+#include "util.h"
+#include "bset.h"
+
+#include <linux/console.h>
#include <linux/random.h>
#include <linux/prefetch.h>
+#ifdef CONFIG_BCACHE_DEBUG
+
+void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set)
+{
+ struct bkey *k, *next;
+
+ for (k = i->start; k < bset_bkey_last(i); k = next) {
+ next = bkey_next(k);
+
+ printk(KERN_ERR "block %u key %zi/%u: ", set,
+ (uint64_t *) k - i->d, i->keys);
+
+ if (b->ops->key_dump)
+ b->ops->key_dump(b, k);
+ else
+ printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k));
+
+ if (next < bset_bkey_last(i) &&
+ bkey_cmp(k, b->ops->is_extents ?
+ &START_KEY(next) : next) > 0)
+ printk(KERN_ERR "Key skipped backwards\n");
+ }
+}
+
+void bch_dump_bucket(struct btree_keys *b)
+{
+ unsigned i;
+
+ console_lock();
+ for (i = 0; i <= b->nsets; i++)
+ bch_dump_bset(b, b->set[i].data,
+ bset_sector_offset(b, b->set[i].data));
+ console_unlock();
+}
+
+int __bch_count_data(struct btree_keys *b)
+{
+ unsigned ret = 0;
+ struct btree_iter iter;
+ struct bkey *k;
+
+ if (b->ops->is_extents)
+ for_each_key(b, k, &iter)
+ ret += KEY_SIZE(k);
+ return ret;
+}
+
+void __bch_check_keys(struct btree_keys *b, const char *fmt, ...)
+{
+ va_list args;
+ struct bkey *k, *p = NULL;
+ struct btree_iter iter;
+ const char *err;
+
+ for_each_key(b, k, &iter) {
+ if (b->ops->is_extents) {
+ err = "Keys out of order";
+ if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0)
+ goto bug;
+
+ if (bch_ptr_invalid(b, k))
+ continue;
+
+ err = "Overlapping keys";
+ if (p && bkey_cmp(p, &START_KEY(k)) > 0)
+ goto bug;
+ } else {
+ if (bch_ptr_bad(b, k))
+ continue;
+
+ err = "Duplicate keys";
+ if (p && !bkey_cmp(p, k))
+ goto bug;
+ }
+ p = k;
+ }
+#if 0
+ err = "Key larger than btree node key";
+ if (p && bkey_cmp(p, &b->key) > 0)
+ goto bug;
+#endif
+ return;
+bug:
+ bch_dump_bucket(b);
+
+ va_start(args, fmt);
+ vprintk(fmt, args);
+ va_end(args);
+
+ panic("bch_check_keys error: %s:\n", err);
+}
+
+static void bch_btree_iter_next_check(struct btree_iter *iter)
+{
+ struct bkey *k = iter->data->k, *next = bkey_next(k);
+
+ if (next < iter->data->end &&
+ bkey_cmp(k, iter->b->ops->is_extents ?
+ &START_KEY(next) : next) > 0) {
+ bch_dump_bucket(iter->b);
+ panic("Key skipped backwards\n");
+ }
+}
+
+#else
+
+static inline void bch_btree_iter_next_check(struct btree_iter *iter) {}
+
+#endif
+
/* Keylists */
-int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
+int __bch_keylist_realloc(struct keylist *l, unsigned u64s)
{
size_t oldsize = bch_keylist_nkeys(l);
- size_t newsize = oldsize + 2 + nptrs;
+ size_t newsize = oldsize + u64s;
uint64_t *old_keys = l->keys_p == l->inline_keys ? NULL : l->keys_p;
uint64_t *new_keys;
- /* The journalling code doesn't handle the case where the keys to insert
- * is bigger than an empty write: If we just return -ENOMEM here,
- * bio_insert() and bio_invalidate() will insert the keys created so far
- * and finish the rest when the keylist is empty.
- */
- if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
- return -ENOMEM;
-
newsize = roundup_pow_of_two(newsize);
if (newsize <= KEYLIST_INLINE ||
@@ -71,136 +175,6 @@ void bch_keylist_pop_front(struct keylist *l)
bch_keylist_bytes(l));
}
-/* Pointer validation */
-
-static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
-{
- unsigned i;
-
- for (i = 0; i < KEY_PTRS(k); i++)
- if (ptr_available(c, k, i)) {
- struct cache *ca = PTR_CACHE(c, k, i);
- size_t bucket = PTR_BUCKET_NR(c, k, i);
- size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
-
- if (KEY_SIZE(k) + r > c->sb.bucket_size ||
- bucket < ca->sb.first_bucket ||
- bucket >= ca->sb.nbuckets)
- return true;
- }
-
- return false;
-}
-
-bool bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
-{
- char buf[80];
-
- if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
- goto bad;
-
- if (__ptr_invalid(c, k))
- goto bad;
-
- return false;
-bad:
- bch_bkey_to_text(buf, sizeof(buf), k);
- cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
- return true;
-}
-
-bool bch_extent_ptr_invalid(struct cache_set *c, const struct bkey *k)
-{
- char buf[80];
-
- if (!KEY_SIZE(k))
- return true;
-
- if (KEY_SIZE(k) > KEY_OFFSET(k))
- goto bad;
-
- if (__ptr_invalid(c, k))
- goto bad;
-
- return false;
-bad:
- bch_bkey_to_text(buf, sizeof(buf), k);
- cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
- return true;
-}
-
-static bool ptr_bad_expensive_checks(struct btree *b, const struct bkey *k,
- unsigned ptr)
-{
- struct bucket *g = PTR_BUCKET(b->c, k, ptr);
- char buf[80];
-
- if (mutex_trylock(&b->c->bucket_lock)) {
- if (b->level) {
- if (KEY_DIRTY(k) ||
- g->prio != BTREE_PRIO ||
- (b->c->gc_mark_valid &&
- GC_MARK(g) != GC_MARK_METADATA))
- goto err;
-
- } else {
- if (g->prio == BTREE_PRIO)
- goto err;
-
- if (KEY_DIRTY(k) &&
- b->c->gc_mark_valid &&
- GC_MARK(g) != GC_MARK_DIRTY)
- goto err;
- }
- mutex_unlock(&b->c->bucket_lock);
- }
-
- return false;
-err:
- mutex_unlock(&b->c->bucket_lock);
- bch_bkey_to_text(buf, sizeof(buf), k);
- btree_bug(b,
-"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
- buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
- g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
- return true;
-}
-
-bool bch_ptr_bad(struct btree *b, const struct bkey *k)
-{
- struct bucket *g;
- unsigned i, stale;
-
- if (!bkey_cmp(k, &ZERO_KEY) ||
- !KEY_PTRS(k) ||
- bch_ptr_invalid(b, k))
- return true;
-
- for (i = 0; i < KEY_PTRS(k); i++) {
- if (!ptr_available(b->c, k, i))
- return true;
-
- g = PTR_BUCKET(b->c, k, i);
- stale = ptr_stale(b->c, k, i);
-
- btree_bug_on(stale > 96, b,
- "key too stale: %i, need_gc %u",
- stale, b->c->need_gc);
-
- btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
- b, "stale dirty pointer");
-
- if (stale)
- return true;
-
- if (expensive_debug_checks(b->c) &&
- ptr_bad_expensive_checks(b, k, i))
- return true;
- }
-
- return false;
-}
-
/* Key/pointer manipulation */
void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
@@ -255,56 +229,138 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k)
return true;
}
-static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
+/* Auxiliary search trees */
+
+/* 32 bits total: */
+#define BKEY_MID_BITS 3
+#define BKEY_EXPONENT_BITS 7
+#define BKEY_MANTISSA_BITS (32 - BKEY_MID_BITS - BKEY_EXPONENT_BITS)
+#define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1)
+
+struct bkey_float {
+ unsigned exponent:BKEY_EXPONENT_BITS;
+ unsigned m:BKEY_MID_BITS;
+ unsigned mantissa:BKEY_MANTISSA_BITS;
+} __packed;
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE 128
+
+/* Space required for the btree node keys */
+static inline size_t btree_keys_bytes(struct btree_keys *b)
{
- return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
- ~((uint64_t)1 << 63);
+ return PAGE_SIZE << b->page_order;
}
-/* Tries to merge l and r: l should be lower than r
- * Returns true if we were able to merge. If we did merge, l will be the merged
- * key, r will be untouched.
- */
-bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r)
+static inline size_t btree_keys_cachelines(struct btree_keys *b)
{
- unsigned i;
+ return btree_keys_bytes(b) / BSET_CACHELINE;
+}
- if (key_merging_disabled(b->c))
- return false;
+/* Space required for the auxiliary search trees */
+static inline size_t bset_tree_bytes(struct btree_keys *b)
+{
+ return btree_keys_cachelines(b) * sizeof(struct bkey_float);
+}
- if (KEY_PTRS(l) != KEY_PTRS(r) ||
- KEY_DIRTY(l) != KEY_DIRTY(r) ||
- bkey_cmp(l, &START_KEY(r)))
- return false;
+/* Space required for the prev pointers */
+static inline size_t bset_prev_bytes(struct btree_keys *b)
+{
+ return btree_keys_cachelines(b) * sizeof(uint8_t);
+}
- for (i = 0; i < KEY_PTRS(l); i++)
- if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
- PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
- return false;
+/* Memory allocation */
- /* Keys with no pointers aren't restricted to one bucket and could
- * overflow KEY_SIZE
- */
- if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
- SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
- SET_KEY_SIZE(l, USHRT_MAX);
+void bch_btree_keys_free(struct btree_keys *b)
+{
+ struct bset_tree *t = b->set;
- bch_cut_front(l, r);
- return false;
- }
+ if (bset_prev_bytes(b) < PAGE_SIZE)
+ kfree(t->prev);
+ else
+ free_pages((unsigned long) t->prev,
+ get_order(bset_prev_bytes(b)));
- if (KEY_CSUM(l)) {
- if (KEY_CSUM(r))
- l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
- else
- SET_KEY_CSUM(l, 0);
- }
+ if (bset_tree_bytes(b) < PAGE_SIZE)
+ kfree(t->tree);
+ else
+ free_pages((unsigned long) t->tree,
+ get_order(bset_tree_bytes(b)));
- SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
- SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));
+ free_pages((unsigned long) t->data, b->page_order);
- return true;
+ t->prev = NULL;
+ t->tree = NULL;
+ t->data = NULL;
+}
+EXPORT_SYMBOL(bch_btree_keys_free);
+
+int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp)
+{
+ struct bset_tree *t = b->set;
+
+ BUG_ON(t->data);
+
+ b->page_order = page_order;
+
+ t->data = (void *) __get_free_pages(gfp, b->page_order);
+ if (!t->data)
+ goto err;
+
+ t->tree = bset_tree_bytes(b) < PAGE_SIZE
+ ? kmalloc(bset_tree_bytes(b), gfp)
+ : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
+ if (!t->tree)
+ goto err;
+
+ t->prev = bset_prev_bytes(b) < PAGE_SIZE
+ ? kmalloc(bset_prev_bytes(b), gfp)
+ : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
+ if (!t->prev)
+ goto err;
+
+ return 0;
+err:
+ bch_btree_keys_free(b);
+ return -ENOMEM;
}
+EXPORT_SYMBOL(bch_btree_keys_alloc);
+
+void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
+ bool *expensive_debug_checks)
+{
+ unsigned i;
+
+ b->ops = ops;
+ b->expensive_debug_checks = expensive_debug_checks;
+ b->nsets = 0;
+ b->last_set_unwritten = 0;
+
+ /* XXX: shouldn't be needed */
+ for (i = 0; i < MAX_BSETS; i++)
+ b->set[i].size = 0;
+ /*
+ * Second loop starts at 1 because b->keys[0]->data is the memory we
+ * allocated
+ */
+ for (i = 1; i < MAX_BSETS; i++)
+ b->set[i].data = NULL;
+}
+EXPORT_SYMBOL(bch_btree_keys_init);
/* Binary tree stuff for auxiliary search trees */
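A rough standalone sketch of the sizing math introduced in this hunk, assuming PAGE_SIZE is 4096 and picking an arbitrary page_order; the kernel derives the same quantities via btree_keys_bytes(), btree_keys_cachelines(), bset_tree_bytes() and bset_prev_bytes(), allocating one 32-bit bkey_float plus one prev byte per BSET_CACHELINE bytes of keys.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE       4096UL
#define BSET_CACHELINE  128UL

/* 32 bits total, matching struct bkey_float in bset.c
 * (BKEY_EXPONENT_BITS = 7, BKEY_MID_BITS = 3, BKEY_MANTISSA_BITS = 22) */
struct bkey_float {
        unsigned exponent:7;
        unsigned m:3;
        unsigned mantissa:22;
} __attribute__((packed));

int main(void)
{
        unsigned page_order = 2;                        /* hypothetical 16K node */
        size_t keys_bytes   = PAGE_SIZE << page_order;  /* btree_keys_bytes()    */
        size_t cachelines   = keys_bytes / BSET_CACHELINE;
        size_t tree_bytes   = cachelines * sizeof(struct bkey_float);
        size_t prev_bytes   = cachelines * sizeof(uint8_t);

        printf("node %zu bytes -> %zu cachelines, %zu tree bytes, %zu prev bytes\n",
               keys_bytes, cachelines, tree_bytes, prev_bytes);
        return 0;
}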
@@ -455,9 +511,11 @@ static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
return ((void *) k - (void *) t->data) / BSET_CACHELINE;
}
-static unsigned bkey_to_cacheline_offset(struct bkey *k)
+static unsigned bkey_to_cacheline_offset(struct bset_tree *t,
+ unsigned cacheline,
+ struct bkey *k)
{
- return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
+ return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0);
}
static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
@@ -504,7 +562,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
: tree_to_prev_bkey(t, j >> ffs(j));
struct bkey *r = is_power_of_2(j + 1)
- ? node(t->data, t->data->keys - bkey_u64s(&t->end))
+ ? bset_bkey_idx(t->data, t->data->keys - bkey_u64s(&t->end))
: tree_to_bkey(t, j >> (ffz(j) + 1));
BUG_ON(m < l || m > r);
@@ -528,9 +586,9 @@ static void make_bfloat(struct bset_tree *t, unsigned j)
f->exponent = 127;
}
-static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
+static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t)
{
- if (t != b->sets) {
+ if (t != b->set) {
unsigned j = roundup(t[-1].size,
64 / sizeof(struct bkey_float));
@@ -538,33 +596,54 @@ static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
t->prev = t[-1].prev + j;
}
- while (t < b->sets + MAX_BSETS)
+ while (t < b->set + MAX_BSETS)
t++->size = 0;
}
-static void bset_build_unwritten_tree(struct btree *b)
+static void bch_bset_build_unwritten_tree(struct btree_keys *b)
{
- struct bset_tree *t = b->sets + b->nsets;
+ struct bset_tree *t = bset_tree_last(b);
+
+ BUG_ON(b->last_set_unwritten);
+ b->last_set_unwritten = 1;
bset_alloc_tree(b, t);
- if (t->tree != b->sets->tree + bset_tree_space(b)) {
- t->prev[0] = bkey_to_cacheline_offset(t->data->start);
+ if (t->tree != b->set->tree + btree_keys_cachelines(b)) {
+ t->prev[0] = bkey_to_cacheline_offset(t, 0, t->data->start);
t->size = 1;
}
}
-static void bset_build_written_tree(struct btree *b)
+void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic)
+{