summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-01-11 22:25:00 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-01-11 22:25:00 -0800
commit03891f9c853d5c4473224478a1e03ea00d70ff8d (patch)
tree63e3d6849f65ecf0f230a748c09ed09970ec981d /drivers/md
parent47c62e4be78303ef52ffa8134026919d0890c5a9 (diff)
parent385277bfb57faac44e92497104ba542cdd82d5fe (diff)
Merge tag 'dm-4.5-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - The most significant set of changes this cycle is the Forward Error Correction (FEC) support that has been added to the DM verity target. Google uses DM verity on all Android devices and it is believed that this FEC support will enable DM verity to recover from storage failures seen since DM verity was first deployed as part of Android. - A stable fix for a race in the destruction of DM thin pool's workqueue - A stable fix for hung IO if a DM snapshot copy hit an error - A few small cleanups in DM core and DM persistent data. - A couple DM thinp range discard improvements (address atomicity of finding a range and the efficiency of discarding a partially mapped thin device) - Add ability to debug DM bufio leaks by recording stack trace when a buffer is allocated. Upon detected leak the recorded stack is dumped. * tag 'dm-4.5-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: dm snapshot: fix hung bios when copy error occurs dm thin: bump thin and thin-pool target versions dm thin: fix race condition when destroying thin pool workqueue dm space map metadata: remove unused variable in brb_pop() dm verity: add ignore_zero_blocks feature dm verity: add support for forward error correction dm verity: factor out verity_for_bv_block() dm verity: factor out structures and functions useful to separate object dm verity: move dm-verity.c to dm-verity-target.c dm verity: separate function for parsing opt args dm verity: clean up duplicate hashing code dm btree: factor out need_insert() helper dm bufio: use BUG_ON instead of conditional call to BUG dm bufio: store stacktrace in buffers to help find buffer leaks dm bufio: return NULL to improve code clarity dm block manager: cleanup code that prints stacktrace dm: don't save and restore bi_private dm thin metadata: make dm_thin_find_mapped_range() atomic dm thin metadata: speed up discard of partially mapped volumes
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig21
-rw-r--r--drivers/md/Makefile5
-rw-r--r--drivers/md/dm-bufio.c44
-rw-r--r--drivers/md/dm-cache-target.c3
-rw-r--r--drivers/md/dm-exception-store.h2
-rw-r--r--drivers/md/dm-snap-persistent.c5
-rw-r--r--drivers/md/dm-snap-transient.c4
-rw-r--r--drivers/md/dm-snap.c26
-rw-r--r--drivers/md/dm-thin-metadata.c114
-rw-r--r--drivers/md/dm-thin.c8
-rw-r--r--drivers/md/dm-verity-fec.c818
-rw-r--r--drivers/md/dm-verity-fec.h152
-rw-r--r--drivers/md/dm-verity-target.c (renamed from drivers/md/dm-verity.c)602
-rw-r--r--drivers/md/dm-verity.h129
-rw-r--r--drivers/md/persistent-data/Kconfig9
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c11
-rw-r--r--drivers/md/persistent-data/dm-btree.c17
-rw-r--r--drivers/md/persistent-data/dm-space-map-metadata.c3
18 files changed, 1619 insertions, 354 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 7913fdcfc849..0a2e7273db9e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,15 @@ config DM_BUFIO
as a cache, holding recently-read blocks in memory and performing
delayed writes.
+config DM_DEBUG_BLOCK_STACK_TRACING
+ bool "Keep stack trace of persistent data block lock holders"
+ depends on STACKTRACE_SUPPORT && DM_BUFIO
+ select STACKTRACE
+ ---help---
+ Enable this for messages that may help debug problems with the
+ block manager locking used by thin provisioning and caching.
+
+ If unsure, say N.
config DM_BIO_PRISON
tristate
depends on BLK_DEV_DM
@@ -458,6 +467,18 @@ config DM_VERITY
If unsure, say N.
+config DM_VERITY_FEC
+ bool "Verity forward error correction support"
+ depends on DM_VERITY
+ select REED_SOLOMON
+ select REED_SOLOMON_DEC8
+ ---help---
+ Add forward error correction support to dm-verity. This option
+ makes it possible to use pre-generated error correction data to
+ recover from corrupted blocks.
+
+ If unsure, say N.
+
config DM_SWITCH
tristate "Switch target support (EXPERIMENTAL)"
depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index f34979cd141a..62a65764e8e0 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -16,6 +16,7 @@ dm-cache-mq-y += dm-cache-policy-mq.o
dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
+dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o raid5-cache.o
@@ -63,3 +64,7 @@ obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
endif
+
+ifeq ($(CONFIG_DM_VERITY_FEC),y)
+dm-verity-objs += dm-verity-fec.o
+endif
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 2dd33085b331..6b832e06580d 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -16,6 +16,7 @@
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
+#include <linux/stacktrace.h>
#define DM_MSG_PREFIX "bufio"
@@ -149,6 +150,11 @@ struct dm_buffer {
struct list_head write_list;
struct bio bio;
struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+#define MAX_STACK 10
+ struct stack_trace stack_trace;
+ unsigned long stack_entries[MAX_STACK];
+#endif
};
/*----------------------------------------------------------------*/
@@ -253,6 +259,17 @@ static LIST_HEAD(dm_bufio_all_clients);
*/
static DEFINE_MUTEX(dm_bufio_clients_lock);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+static void buffer_record_stack(struct dm_buffer *b)
+{
+ b->stack_trace.nr_entries = 0;
+ b->stack_trace.max_entries = MAX_STACK;
+ b->stack_trace.entries = b->stack_entries;
+ b->stack_trace.skip = 2;
+ save_stack_trace(&b->stack_trace);
+}
+#endif
+
/*----------------------------------------------------------------
* A red/black tree acts as an index for all the buffers.
*--------------------------------------------------------------*/
@@ -454,6 +471,9 @@ static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
adjust_total_allocated(b->data_mode, (long)c->block_size);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ memset(&b->stack_trace, 0, sizeof(b->stack_trace));
+#endif
return b;
}
@@ -1063,12 +1083,16 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
dm_bufio_lock(c);
b = __bufio_new(c, block, nf, &need_submit, &write_list);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ if (b && b->hold_count == 1)
+ buffer_record_stack(b);
+#endif
dm_bufio_unlock(c);
__flush_write_list(&write_list);
if (!b)
- return b;
+ return NULL;
if (need_submit)
submit_io(b, READ, b->block, read_endio);
@@ -1462,6 +1486,7 @@ static void drop_buffers(struct dm_bufio_client *c)
{
struct dm_buffer *b;
int i;
+ bool warned = false;
BUG_ON(dm_bufio_in_request());
@@ -1476,9 +1501,21 @@ static void drop_buffers(struct dm_bufio_client *c)
__free_buffer_wake(b);
for (i = 0; i < LIST_SIZE; i++)
- list_for_each_entry(b, &c->lru[i], lru_list)
+ list_for_each_entry(b, &c->lru[i], lru_list) {
+ WARN_ON(!warned);
+ warned = true;
DMERR("leaked buffer %llx, hold count %u, list %d",
(unsigned long long)b->block, b->hold_count, i);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ print_stack_trace(&b->stack_trace, 1);
+ b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */
+#endif
+ }
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+ while ((b = __get_unclaimed_buffer(c)))
+ __free_buffer_wake(b);
+#endif
for (i = 0; i < LIST_SIZE; i++)
BUG_ON(!list_empty(&c->lru[i]));
@@ -1891,8 +1928,7 @@ static void __exit dm_bufio_exit(void)
bug = 1;
}
- if (bug)
- BUG();
+ BUG_ON(bug);
}
module_init(dm_bufio_init)
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 2fd4c8296144..5780accffa30 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -118,14 +118,12 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
*/
struct dm_hook_info {
bio_end_io_t *bi_end_io;
- void *bi_private;
};
static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
bio_end_io_t *bi_end_io, void *bi_private)
{
h->bi_end_io = bio->bi_end_io;
- h->bi_private = bio->bi_private;
bio->bi_end_io = bi_end_io;
bio->bi_private = bi_private;
@@ -134,7 +132,6 @@ static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
bio->bi_end_io = h->bi_end_io;
- bio->bi_private = h->bi_private;
}
/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index fae34e7a0b1e..12b5216c2cfe 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -69,7 +69,7 @@ struct dm_exception_store_type {
* Update the metadata with this exception.
*/
void (*commit_exception) (struct dm_exception_store *store,
- struct dm_exception *e,
+ struct dm_exception *e, int valid,
void (*callback) (void *, int success),
void *callback_context);
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 3164b8bce294..4d3909393f2c 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -695,7 +695,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
}
static void persistent_commit_exception(struct dm_exception_store *store,
- struct dm_exception *e,
+ struct dm_exception *e, int valid,
void (*callback) (void *, int success),
void *callback_context)
{
@@ -704,6 +704,9 @@ static void persistent_commit_exception(struct dm_exception_store *store,
struct core_exception ce;
struct commit_callback *cb;
+ if (!valid)
+ ps->valid = 0;
+
ce.old_chunk = e->old_chunk;
ce.new_chunk = e->new_chunk;
write_exception(ps, ps->current_committed++, &ce);
diff --git a/drivers/md/dm-snap-transient.c b/drivers/md/dm-snap-transient.c
index 9b7c8c8049d6..4d50a12cf00c 100644
--- a/drivers/md/dm-snap-transient.c
+++ b/drivers/md/dm-snap-transient.c
@@ -52,12 +52,12 @@ static int transient_prepare_exception(struct dm_exception_store *store,
}
static void transient_commit_exception(struct dm_exception_store *store,
- struct dm_exception *e,
+ struct dm_exception *e, int valid,
void (*callback) (void *, int success),
void *callback_context)
{
/* Just succeed */
- callback(callback_context, 1);
+ callback(callback_context, valid);
}
static void transient_usage(struct dm_exception_store *store,
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index c06b74e91cd6..3766386080a4 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -207,7 +207,6 @@ struct dm_snap_pending_exception {
*/
struct bio *full_bio;
bio_end_io_t *full_bio_end_io;
- void *full_bio_private;
};
/*
@@ -1438,8 +1437,9 @@ static void __invalidate_snapshot(struct dm_snapshot *s, int err)
dm_table_event(s->ti->table);
}
-static void pending_complete(struct dm_snap_pending_exception *pe, int success)
+static void pending_complete(void *context, int success)
{
+ struct dm_snap_pending_exception *pe = context;
struct dm_exception *e;
struct dm_snapshot *s = pe->snap;
struct bio *origin_bios = NULL;
@@ -1485,10 +1485,8 @@ out:
snapshot_bios = bio_list_get(&pe->snapshot_bios);
origin_bios = bio_list_get(&pe->origin_bios);
full_bio = pe->full_bio;
- if (full_bio) {
+ if (full_bio)
full_bio->bi_end_io = pe->full_bio_end_io;
- full_bio->bi_private = pe->full_bio_private;
- }
increment_pending_exceptions_done_count();
up_write(&s->lock);
@@ -1509,24 +1507,13 @@ out:
free_pending_exception(pe);
}
-static void commit_callback(void *context, int success)
-{
- struct dm_snap_pending_exception *pe = context;
-
- pending_complete(pe, success);
-}
-
static void complete_exception(struct dm_snap_pending_exception *pe)
{
struct dm_snapshot *s = pe->snap;
- if (unlikely(pe->copy_error))
- pending_complete(pe, 0);
-
- else
- /* Update the metadata if we are persistent */
- s->store->type->commit_exception(s->store, &pe->e,
- commit_callback, pe);
+ /* Update the metadata if we are persistent */
+ s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
+ pending_complete, pe);
}
/*
@@ -1605,7 +1592,6 @@ static void start_full_bio(struct dm_snap_pending_exception *pe,
pe->full_bio = bio;
pe->full_bio_end_io = bio->bi_end_io;
- pe->full_bio_private = bio->bi_private;
callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
copy_callback, pe);
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index c219a053c7f6..f962d6453afd 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1395,8 +1395,21 @@ static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
return td->snapshotted_time > time;
}
-int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
- int can_issue_io, struct dm_thin_lookup_result *result)
+static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
+ struct dm_thin_lookup_result *result)
+{
+ uint64_t block_time = 0;
+ dm_block_t exception_block;
+ uint32_t exception_time;
+
+ block_time = le64_to_cpu(value);
+ unpack_block_time(block_time, &exception_block, &exception_time);
+ result->block = exception_block;
+ result->shared = __snapshotted_since(td, exception_time);
+}
+
+static int __find_block(struct dm_thin_device *td, dm_block_t block,
+ int can_issue_io, struct dm_thin_lookup_result *result)
{
int r;
__le64 value;
@@ -1404,39 +1417,56 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
dm_block_t keys[2] = { td->id, block };
struct dm_btree_info *info;
- down_read(&pmd->root_lock);
- if (pmd->fail_io) {
- up_read(&pmd->root_lock);
- return -EINVAL;
- }
-
if (can_issue_io) {
info = &pmd->info;
} else
info = &pmd->nb_info;
r = dm_btree_lookup(info, pmd->root, keys, &value);
- if (!r) {
- uint64_t block_time = 0;
- dm_block_t exception_block;
- uint32_t exception_time;
-
- block_time = le64_to_cpu(value);
- unpack_block_time(block_time, &exception_block,
- &exception_time);
- result->block = exception_block;
- result->shared = __snapshotted_since(td, exception_time);
+ if (!r)
+ unpack_lookup_result(td, value, result);
+
+ return r;
+}
+
+int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
+ int can_issue_io, struct dm_thin_lookup_result *result)
+{
+ int r;
+ struct dm_pool_metadata *pmd = td->pmd;
+
+ down_read(&pmd->root_lock);
+ if (pmd->fail_io) {
+ up_read(&pmd->root_lock);
+ return -EINVAL;
}
+ r = __find_block(td, block, can_issue_io, result);
+
up_read(&pmd->root_lock);
return r;
}
-/* FIXME: write a more efficient one in btree */
-int dm_thin_find_mapped_range(struct dm_thin_device *td,
- dm_block_t begin, dm_block_t end,
- dm_block_t *thin_begin, dm_block_t *thin_end,
- dm_block_t *pool_begin, bool *maybe_shared)
+static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
+ dm_block_t *vblock,
+ struct dm_thin_lookup_result *result)
+{
+ int r;
+ __le64 value;
+ struct dm_pool_metadata *pmd = td->pmd;
+ dm_block_t keys[2] = { td->id, block };
+
+ r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
+ if (!r)
+ unpack_lookup_result(td, value, result);
+
+ return r;
+}
+
+static int __find_mapped_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end,
+ dm_block_t *thin_begin, dm_block_t *thin_end,
+ dm_block_t *pool_begin, bool *maybe_shared)
{
int r;
dm_block_t pool_end;
@@ -1445,21 +1475,11 @@ int dm_thin_find_mapped_range(struct dm_thin_device *td,
if (end < begin)
return -ENODATA;
- /*
- * Find first mapped block.
- */
- while (begin < end) {
- r = dm_thin_find_block(td, begin, true, &lookup);
- if (r) {
- if (r != -ENODATA)
- return r;
- } else
- break;
-
- begin++;
- }
+ r = __find_next_mapped_block(td, begin, &begin, &lookup);
+ if (r)
+ return r;
- if (begin == end)
+ if (begin >= end)
return -ENODATA;
*thin_begin = begin;
@@ -1469,7 +1489,7 @@ int dm_thin_find_mapped_range(struct dm_thin_device *td,
begin++;
pool_end = *pool_begin + 1;
while (begin != end) {
- r = dm_thin_find_block(td, begin, true, &lookup);
+ r = __find_block(td, begin, true, &lookup);
if (r) {
if (r == -ENODATA)
break;
@@ -1489,6 +1509,24 @@ int dm_thin_find_mapped_range(struct dm_thin_device *td,
return 0;
}
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+ dm_block_t begin, dm_block_t end,
+ dm_block_t *thin_begin, dm_block_t *thin_end,
+ dm_block_t *pool_begin, bool *maybe_shared)
+{
+ int r = -EINVAL;
+ struct dm_pool_metadata *pmd = td->pmd;
+
+ down_read(&pmd->root_lock);
+ if (!pmd->fail_io) {
+ r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
+ pool_begin, maybe_shared);
+ }
+ up_read(&pmd->root_lock);
+
+ return r;
+}
+
static int __insert(struct dm_thin_device *td, dm_block_t block,
dm_block_t data_block)
{
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 63903a5a5d9e..72d91f477683 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3453,8 +3453,8 @@ static void pool_postsuspend(struct dm_target *ti)
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- cancel_delayed_work(&pool->waker);
- cancel_delayed_work(&pool->no_space_timeout);
+ cancel_delayed_work_sync(&pool->waker);
+ cancel_delayed_work_sync(&pool->no_space_timeout);
flush_workqueue(pool->wq);
(void) commit(pool);
}
@@ -3886,7 +3886,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 16, 0},
+ .version = {1, 17, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -4260,7 +4260,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 16, 0},
+ .version = {1, 17, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
new file mode 100644
index 000000000000..1cc10c4de701
--- /dev/null
+++ b/drivers/md/dm-verity-fec.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * Author: Sami Tolvanen <samitolvanen@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include "dm-verity-fec.h"
+#include <linux/math64.h>
+
+#define DM_MSG_PREFIX "verity-fec"
+
+/*
+ * If error correction has been configured, returns true.
+ */
+bool verity_fec_is_enabled(struct dm_verity *v)
+{
+ return v->fec && v->fec->dev;
+}
+
+/*
+ * Return a pointer to dm_verity_fec_io after dm_verity_io and its variable
+ * length fields.
+ */
+static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io)
+{
+ return (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io);
+}
+
+/*
+ * Return an interleaved offset for a byte in RS block.
+ */
+static inline u64 fec_interleave(struct dm_verity *v, u64 offset)
+{
+ u32 mod;
+
+ mod = do_div(offset, v->fec->rsn);
+ return offset + mod * (v->fec->rounds << v->data_dev_block_bits);
+}
+
+/*
+ * Decode an RS block using Reed-Solomon.
+ */
+static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio,
+ u8 *data, u8 *fec, int neras)
+{
+ int i;
+ uint16_t par[DM_VERITY_FEC_RSM - DM_VERITY_FEC_MIN_RSN];
+
+ for (i = 0; i < v->fec->roots; i++)
+ par[i] = fec[i];
+
+ return decode_rs8(fio->rs, data, par, v->fec->rsn, NULL, neras,
+ fio->erasures, 0, NULL);
+}
+
+/*
+ * Read error-correcting codes for the requested RS block. Returns a pointer
+ * to the data block. Caller is responsible for releasing buf.
+ */
+static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index,
+ unsigned *offset, struct dm_buffer **buf)
+{
+ u64 position, block;
+ u8 *res;
+
+ position = (index + rsb) * v->fec->roots;
+ block = position >> v->data_dev_block_bits;
+ *offset = (unsigned)(position - (block << v->data_dev_block_bits));
+
+ res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf);
+ if (unlikely(IS_ERR(res))) {
+ DMERR("%s: FEC %llu: parity read failed (block %llu): %ld",
+ v->data_dev->name, (unsigned long long)rsb,
+ (unsigned long long)(v->fec->start + block),
+ PTR_ERR(res));
+ *buf = NULL;
+ }
+
+ return res;
+}
+
+/* Loop over each preallocated buffer slot. */
+#define fec_for_each_prealloc_buffer(__i) \
+ for (__i = 0; __i < DM_VERITY_FEC_BUF_PREALLOC; __i++)
+
+/* Loop over each extra buffer slot. */
+#define fec_for_each_extra_buffer(io, __i) \
+ for (__i = DM_VERITY_FEC_BUF_PREALLOC; __i < DM_VERITY_FEC_BUF_MAX; __i++)
+
+/* Loop over each allocated buffer. */
+#define fec_for_each_buffer(io, __i) \
+ for (__i = 0; __i < (io)->nbufs; __i++)
+
+/* Loop over each RS block in each allocated buffer. */
+#define fec_for_each_buffer_rs_block(io, __i, __j) \
+ fec_for_each_buffer(io, __i) \
+ for (__j = 0; __j < 1 << DM_VERITY_FEC_BUF_RS_BITS; __j++)
+
+/*
+ * Return a pointer to the current RS block when called inside
+ * fec_for_each_buffer_rs_block.
+ */
+static inline u8 *fec_buffer_rs_block(struct dm_verity *v,
+ struct dm_verity_fec_io *fio,
+ unsigned i, unsigned j)
+{
+ return &fio->bufs[i][j * v->fec->rsn];
+}
+
+/*
+ * Return an index to the current RS block when called inside
+ * fec_for_each_buffer_rs_block.
+ */
+static inline unsigned fec_buffer_rs_index(unsigned i, unsigned j)
+{
+ return (i << DM_VERITY_FEC_BUF_RS_BITS) + j;
+}
+
+/*
+ * Decode all RS blocks from buffers and copy corrected bytes into fio->output
+ * starting from block_offset.
+ */
+static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio,
+ u64 rsb, int byte_index, unsigned block_offset,
+ int neras)
+{
+ int r, corrected = 0, res;
+ struct dm_buffer *buf;
+ unsigned n, i, offset;
+ u8 *par, *block;
+
+ par = fec_read_parity(v, rsb, block_offset, &offset, &buf);
+ if (IS_ERR(par))
+ return PTR_ERR(par);
+
+ /*
+ * Decode the RS blocks we have in bufs. Each RS block results in
+ * one corrected target byte and consumes fec->roots parity bytes.
+ */
+ fec_for_each_buffer_rs_block(fio, n, i) {
+ block = fec_buffer_rs_block(v, fio, n, i);
+ res = fec_decode_rs8(v, fio, block, &par[offset], neras);
+ if (res < 0) {
+ dm_bufio_release(buf);
+
+ r = res;
+ goto error;
+ }
+
+ corrected += res;
+ fio->output[block_offset] = block[byte_index];
+
+ block_offset++;
+ if (block_offset >= 1 << v->data_dev_block_bits)
+ goto done;
+
+ /* read the next block when we run out of parity bytes */
+ offset += v->fec->roots;
+ if (offset >= 1 << v->data_dev_block_bits) {
+ dm_bufio_release(buf);
+
+ par = fec_read_parity(v, rsb, block_offset, &offset, &buf);
+ if (unlikely(IS_ERR(par)))
+ return PTR_ERR(par);
+ }
+ }
+done:
+ r = corrected;
+error:
+ if (r < 0 && neras)
+ DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
+ v->data_dev->name, (unsigned long long)rsb, r);
+ else if (r > 0)
+ DMWARN_LIMIT("%s: FEC %llu: corrected %d errors",
+ v->data_dev->name, (unsigned long long)rsb, r);
+
+ return r;
+}
+
+/*
+ * Locate data block erasures using verity hashes.
+ */
+static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io,
+ u8 *want_digest, u8 *data)
+{
+ if (unlikely(verity_hash(v, verity_io_hash_desc(v, io),
+ data, 1 << v->data_dev_block_bits,
+ verity_io_real_digest(v, io))))
+ return 0;
+
+ return memcmp(verity_io_real_digest(v, io), want_digest,
+ v->digest_size) != 0;
+}
+
+/*
+ * Read data blocks that are part of the RS block and deinterleave as much as
+ * fits into buffers. Check for erasure locations if @neras is non-NULL.
+ */
+static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io,
+ u64 rsb, u64 target, unsigned block_offset,
+ int *neras)
+{
+ bool is_zero;
+ int i, j, target_index = -1;
+ struct dm_buffer *buf;
+ struct dm_bufio_client *bufio;
+ struct dm_verity_fec_io *fio = fec_io(io);
+ u64 block, ileaved;
+ u8 *bbuf, *rs_block;
+ u8 want_digest[v->digest_size];
+ unsigned n, k;
+
+ if (neras)
+ *neras = 0;
+
+ /*
+ * read each of the rsn data blocks that are part of the RS block, and
+ * interleave contents to available bufs
+ */
+ for (i = 0; i < v->fec->rsn; i++) {
+ ileaved = fec_interleave(v, rsb * v->fec->rsn + i);
+
+ /*
+ * target is the data block we want to correct, target_index is
+ * the index of this block within the rsn RS blocks
+ */
+ if (ileaved == target)
+ target_index = i;
+
+ block = ileaved >> v->data_dev_block_bits;
+ bufio = v->fec->data_bufio;
+
+ if (block >= v->data_blocks) {
+ block -= v->data_blocks;
+
+ /*
+ * blocks outside the area were assumed to contain
+ * zeros when encoding data was generated
+ */
+ if (unlikely(block >= v->fec->hash_blocks))
+ continue;
+
+ block += v->hash_start;
+ bufio = v->bufio;
+ }
+
+ bbuf = dm_bufio_read(bufio, block, &buf);
+ if (unlikely(IS_ERR(bbuf))) {
+ DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld",
+ v->data_dev->name,
+ (unsigned long long)rsb,
+ (unsigned long long)block, PTR_ERR(bbuf));
+
+ /* assume the block is corrupted */
+ if (neras && *neras <= v->fec->roots)
+ fio->erasures[(*neras)++] = i;
+
+ continue;
+ }
+
+ /* locate erasures if the block is on the data device */
+ if (bufio == v->fec->data_bufio &&
+ verity_hash_for_block(v, io, block, want_digest,
+ &is_zero) == 0) {
+ /* skip known zero blocks entirely */
+ if (is_zero)
+ continue;
+
+ /*
+ * skip if we have already found the theoretical
+ * maximum number (i.e. fec->roots) of erasures
+ */
+ if (neras && *neras <= v->fec->roots &&
+ fec_is_erasure(v, io, want_digest, bbuf))
+ fio->erasures[(*neras)++] = i;
+ }
+
+ /*
+ * deinterleave and copy the bytes that fit into bufs,
+ * starting from block_offset
+ */
+ fec_for_each_buffer_rs_block(fio, n, j) {
+ k = fec_buffer_rs_index(n, j) + block_offset;
+
+ if (k >= 1 << v->data_dev_block_bits)
+ goto done;
+
+ rs_block = fec_buffer_rs_block(v, fio, n, j);
+ rs_block[i] = bbuf[k];
+ }
+done:
+ dm_bufio_release(buf);
+ }
+
+ return target_index;
+}
+
+/*
+ * Allocate RS control structure and FEC buffers from preallocated mempools,
+ * and attempt to allocate as many extra buffers as available.
+ */
+static int fec_alloc_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
+{
+ unsigned n;
+
+ if (!fio->rs) {
+ fio->rs = mempool_alloc(v->fec->rs_pool, 0);
+ if (unlikely(!fio->rs)) {
+ DMERR("failed to allocate RS");
+ return -ENOMEM;
+ }
+ }
+
+ fec_for_each_prealloc_buffer(n) {
+ if (fio->bufs[n])
+ continue;
+
+ fio->bufs[n] = mempool_alloc(v->fec->prealloc_pool, GFP_NOIO);
+ if (unlikely(!fio->bufs[n])) {
+ DMERR("failed to allocate FEC buffer");
+ return -ENOMEM;
+ }
+ }
+
+ /* try to allocate the maximum number of buffers */
+ fec_for_each_extra_buffer(fio, n) {
+ if (fio->bufs[n])
+ continue;
+
+ fio->bufs[n] = mempool_alloc(v->fec->extra_pool, GFP_NOIO);
+ /* we can manage with even one buffer if necessary */
+ if (unlikely(!fio->bufs[n]))
+ break;
+ }
+ fio->nbufs = n;
+
+ if (!fio->output) {
+ fio->output = mempool_alloc(v->fec->output_pool, GFP_NOIO);
+
+ if (!fio->output) {
+ DMERR("failed to allocate FEC page");
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Initialize buffers and clear erasures. fec_read_bufs() assumes buffers are
+ * zeroed before deinterleaving.
+ */
+static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio)
+{
+ unsigned n;
+
+ fec_for_each_buffer(fio, n)
+ memset(fio->bufs[n], 0, v->fec->rsn << DM_VERITY_FEC_BUF_RS_BITS);
+
+ memset(fio->erasures, 0, sizeof(fio->erasures));
+}
+
+/*
+ * Decode all RS blocks in a single data block and return the target block
+ * (indicated by @offset) in fio->output. If @use_erasures is non-zero, uses
+ * hashes to locate erasures.
+ */
+static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io,
+ struct dm_verity_fec_io *fio, u64 rsb, u64 offset,
+ bool use_erasures)
+{
+ int r, neras = 0;
+ unsigned pos;
+
+ r = fec_alloc_bufs(v, fio);
+ if (unlikely(r < 0))
+ return r;
+
+ for (pos = 0; pos < 1 << v->data_dev_block_bits; ) {
+ fec_init_bufs(v, fio);
+
+ r = fec_read_bufs(v, io, rsb, offset, pos,
+ use_erasures ? &neras : NULL);
+ if (unlikely(r < 0))
+ return r;
+
+ r = fec_decode_bufs(v, fio, rsb, r, pos, neras);
+ if (r < 0)
+ return r;
+
+ pos += fio->nbufs << DM_VERITY_FEC_BUF_RS_BITS;
+ }
+
+ /* Always re-validate the corrected block against the expected hash */
+ r = verity_hash(v, verity_io_hash_desc(v, io), fio->output,
+ 1 << v->data_dev_block_bits,
+ verity_io_real_digest(v, io));
+ if (unlikely(r < 0))
+ return r;
+
+ if (memcmp(verity_io_real_digest(v