1 files changed, 3238 insertions, 0 deletions
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
new file mode 100644
index 000000000000..c7f7c8d76576
--- /dev/null
+++ b/drivers/md/dm-integrity.c
@@ -0,0 +1,3238 @@
+/*
+ * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2016-2017 Milan Broz
+ * Copyright (C) 2016-2017 Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/vmalloc.h>
+#include <linux/sort.h>
+#include <linux/rbtree.h>
+#include <linux/delay.h>
+#include <linux/random.h>
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
+#include <linux/async_tx.h>
+#include "dm-bufio.h"
+
+#define DM_MSG_PREFIX "integrity"
+
+/*
+ * Compile-time defaults and limits of the target.
+ * NOTE(review): the code consuming these values is outside this part of the
+ * file; the units (sectors, milliseconds, log2) are inferred from the names.
+ */
+#define DEFAULT_INTERLEAVE_SECTORS	32768
+#define DEFAULT_JOURNAL_SIZE_FACTOR	7
+#define DEFAULT_BUFFER_SECTORS		128
+#define DEFAULT_JOURNAL_WATERMARK	50
+#define DEFAULT_SYNC_MSEC		10000
+#define DEFAULT_MAX_JOURNAL_SECTORS	131072
+#define MIN_LOG2_INTERLEAVE_SECTORS	3
+#define MAX_LOG2_INTERLEAVE_SECTORS	31
+#define METADATA_WORKQUEUE_MAX_ACTIVE	16
+
+/*
+ * Warning - DEBUG_PRINT prints security-sensitive data to the log,
+ * so it should not be enabled in the official kernel
+ */
+//#define DEBUG_PRINT
+//#define INTERNAL_VERIFY
+
+/*
+ * On disk structures
+ */
+
+/*
+ * SB_MAGIC is seven characters plus the terminating NUL, i.e. exactly the
+ * eight bytes of superblock.magic.  SB_SECTORS is the number of sectors
+ * transferred by sync_rw_sb(); the journal starts right after them
+ * (see rw_journal()).  MAX_SECTORS_PER_BLOCK bounds the number of
+ * last_bytes[] slots used when computing MAX_TAG_SIZE.
+ */
+#define SB_MAGIC			"integrt"
+#define SB_VERSION			1
+#define SB_SECTORS			8
+#define MAX_SECTORS_PER_BLOCK		8
+
+/*
+ * On-disk superblock.
+ * NOTE(review): the multi-byte fields are declared with plain __u16/__u32/
+ * __u64; any byte-order conversion is done by code outside this part of the
+ * file - confirm before touching these fields.
+ */
+struct superblock {
+	__u8 magic[8];
+	__u8 version;
+	__u8 log2_interleave_sectors;	/* log2 of the data area size, see get_area_and_offset() */
+	__u16 integrity_tag_size;
+	__u32 journal_sections;
+	__u64 provided_data_sectors;	/* userspace uses this value */
+	__u32 flags;			/* SB_FLAG_* bits */
+	__u8 log2_sectors_per_block;	/* shift used by sector_to_block() */
+};
+
+/* Bit for superblock.flags (presumably set when a journal MAC is in use - confirm). */
+#define SB_FLAG_HAVE_JOURNAL_MAC	0x1
+
+/* NOTE(review): presumably the alignment of journal_entry_size; the user is not in this part. */
+#define	JOURNAL_ENTRY_ROUNDUP		8
+
+/* Type of the commit id stored at the end of every journal sector. */
+typedef __u64 commit_id_t;
+/* Bytes of the section MAC stored in each journal sector (journal_sector.mac). */
+#define JOURNAL_MAC_PER_SECTOR		8
+
+/*
+ * Header of one journal entry.
+ *
+ * u.sector / u.s hold the target sector; u.s.sector_hi doubles as a state
+ * marker (see journal_entry_is_unused() and journal_entry_is_inprogress()).
+ *
+ * The header is followed by a variable-length tail: sectors_per_block
+ * commit_id_t slots (last_bytes[]) and then the tag bytes, which are located
+ * with journal_entry_tag().  The tail is declared as a C99 flexible array
+ * member instead of the GNU zero-length array extension; the structure
+ * layout and every offsetof() on last_bytes[] are unchanged.
+ */
+struct journal_entry {
+	union {
+		struct {
+			__u32 sector_lo;
+			__u32 sector_hi;
+		} s;
+		__u64 sector;
+	} u;
+	commit_id_t last_bytes[];
+	/* __u8 tag[]; - follows last_bytes[sectors_per_block] */
+};
+
+/* Address of the tag bytes, which start after sectors_per_block last_bytes[] slots. */
+#define journal_entry_tag(ic, je)		((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block])
+
+/*
+ * Sector accessors for a journal entry.
+ *
+ * The setters always issue smp_wmb() before the store that covers
+ * u.s.sector_hi, so that word - which also carries the unused/in-progress
+ * markers below - is written last:
+ *  - 64-bit:            barrier, then one 64-bit store of the whole sector;
+ *  - 32-bit with LBDAF: sector_lo, barrier, sector_hi;
+ *  - 32-bit otherwise:  sector_lo, barrier, sector_hi = 0 (only sector_lo is
+ *                       read back).
+ */
+#if BITS_PER_LONG == 64
+#define journal_entry_set_sector(je, x)		do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0)
+#define journal_entry_get_sector(je)		le64_to_cpu((je)->u.sector)
+#elif defined(CONFIG_LBDAF)
+#define journal_entry_set_sector(je, x)		do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0)
+#define journal_entry_get_sector(je)		le64_to_cpu((je)->u.sector)
+#else
+#define journal_entry_set_sector(je, x)		do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0)
+#define journal_entry_get_sector(je)		le32_to_cpu((je)->u.s.sector_lo)
+#endif
+/* State markers kept in sector_hi: -1 means "unused", -2 means "in progress". */
+#define journal_entry_is_unused(je)		((je)->u.s.sector_hi == cpu_to_le32(-1))
+#define journal_entry_set_unused(je)		do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
+#define journal_entry_is_inprogress(je)		((je)->u.s.sector_hi == cpu_to_le32(-2))
+#define journal_entry_set_inprogress(je)	do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0)
+
+/*
+ * Journal geometry.
+ *
+ * The first JOURNAL_BLOCK_SECTORS sectors of every journal section hold the
+ * entries; data sectors follow them (see access_journal_data() and
+ * copy_from_journal()).  JOURNAL_SECTOR_DATA is what remains of a sector
+ * after its trailing commit id.  The section MAC is JOURNAL_MAC_SIZE bytes
+ * long and is spread over the entry sectors, JOURNAL_MAC_PER_SECTOR bytes
+ * in each (see rw_section_mac()).
+ */
+#define JOURNAL_BLOCK_SECTORS		8
+#define JOURNAL_SECTOR_DATA		((1 << SECTOR_SHIFT) - sizeof(commit_id_t))
+#define JOURNAL_MAC_SIZE		(JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS)
+
+/*
+ * Layout of one entry sector of the journal: entry area, a slice of the
+ * section MAC and the commit id.  The three members add up to
+ * 1 << SECTOR_SHIFT bytes.
+ */
+struct journal_sector {
+	__u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR];
+	__u8 mac[JOURNAL_MAC_PER_SECTOR];
+	commit_id_t commit_id;
+};
+
+/*
+ * Largest tag for which an entry with MAX_SECTORS_PER_BLOCK last_bytes[]
+ * slots still fits into the entries[] area of a single journal sector.
+ */
+#define MAX_TAG_SIZE			(JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK]))
+
+/* NOTE(review): the user of this constant is outside this part of the file. */
+#define METADATA_PADDING_SECTORS	8
+
+/* Number of commit ids (dm_integrity_c.commit_ids[]) and modulus of the commit sequence. */
+#define N_COMMIT_IDS			4
+
+/*
+ * Return the commit sequence number preceding @seq, wrapping around
+ * modulo N_COMMIT_IDS.
+ */
+static unsigned char prev_commit_seq(unsigned char seq)
+{
+	unsigned shifted = seq;
+
+	/* adding N_COMMIT_IDS - 1 is a decrement modulo N_COMMIT_IDS */
+	shifted += N_COMMIT_IDS - 1;
+
+	return shifted % N_COMMIT_IDS;
+}
+
+/*
+ * Return the commit sequence number following @seq, wrapping around
+ * modulo N_COMMIT_IDS.
+ */
+static unsigned char next_commit_seq(unsigned char seq)
+{
+	unsigned successor = seq;
+
+	successor++;
+
+	return successor % N_COMMIT_IDS;
+}
+
+/*
+ * In-memory structures
+ */
+
+/*
+ * In-memory index node of one journal entry.  Nodes live in the
+ * ic->journal_tree array (the array index identifies the entry, see
+ * find_journal_node()) and are linked into ic->journal_tree_root ordered by
+ * sector.  A node that is not in the tree has sector == (sector_t)-1
+ * (see init_journal_node()).
+ */
+struct journal_node {
+	struct rb_node node;
+	sector_t sector;
+};
+
+/*
+ * Description of one crypto algorithm choice; dm_integrity_c embeds three of
+ * them (internal hash, journal crypt, journal mac).
+ * NOTE(review): the code that fills and frees these fields is outside this
+ * part of the file.
+ */
+struct alg_spec {
+	char *alg_string;
+	char *key_string;
+	__u8 *key;
+	unsigned key_size;
+};
+
+/*
+ * Per-target state of a dm-integrity device.
+ */
+struct dm_integrity_c {
+	struct dm_dev *dev;		/* underlying device; all I/O goes to dev->bdev */
+	unsigned tag_size;
+	__s8 log2_tag_size;		/* negative: tag_size is used with a multiplication instead of a shift */
+	sector_t start;			/* sector of the superblock on dev; the journal follows at start + SB_SECTORS */
+	mempool_t *journal_io_mempool;
+	struct dm_io_client *io;
+	struct dm_bufio_client *bufio;	/* buffered access to the tag area, see dm_integrity_rw_tag() */
+	struct workqueue_struct *metadata_wq;
+	struct superblock *sb;		/* buffer transferred by sync_rw_sb() */
+	unsigned journal_pages;
+	/*
+	 * Journal page lists, all indexed the same way (page_list_location()).
+	 * "journal" is the in-memory journal; when "journal_io" is set it is
+	 * the buffer used for device I/O (rw_journal()); when "journal_xor" is
+	 * set the journal is transformed by xoring with it (encrypt_journal()).
+	 */
+	struct page_list *journal;
+	struct page_list *journal_io;
+	struct page_list *journal_xor;
+
+	/* per-section cipher state used by crypt_journal() */
+	struct crypto_skcipher *journal_crypt;
+	struct scatterlist **journal_scatterlist;
+	struct scatterlist **journal_io_scatterlist;
+	struct skcipher_request **sk_requests;
+
+	struct crypto_shash *journal_mac;	/* NULL: rw_section_mac() does nothing */
+
+	/* array of nodes, one per journal entry, and the tree ordering them by sector */
+	struct journal_node *journal_tree;
+	struct rb_root journal_tree_root;
+
+	sector_t provided_data_sectors;
+
+	unsigned short journal_entry_size;
+	unsigned char journal_entries_per_sector;
+	unsigned char journal_section_entries;
+	unsigned short journal_section_sectors;
+	unsigned journal_sections;
+	unsigned journal_entries;
+	sector_t device_sectors;
+	unsigned initial_sectors;
+	unsigned metadata_run;
+	__s8 log2_metadata_run;		/* negative: metadata_run is used with a multiplication instead of a shift */
+	__u8 log2_buffer_sectors;
+	__u8 sectors_per_block;
+
+	unsigned char mode;
+	bool suspending;
+
+	int failed;			/* first error recorded by dm_integrity_io_error(), 0 if none */
+
+	struct crypto_shash *internal_hash;
+
+	/* these variables are locked with endio_wait.lock */
+	struct rb_root in_progress;
+	wait_queue_head_t endio_wait;
+	struct workqueue_struct *wait_wq;
+
+	unsigned char commit_seq;
+	commit_id_t commit_ids[N_COMMIT_IDS];
+
+	unsigned committed_section;
+	unsigned n_committed_sections;
+
+	unsigned uncommitted_section;
+	unsigned n_uncommitted_sections;
+
+	unsigned free_section;
+	unsigned char free_section_entry;
+	unsigned free_sectors;
+
+	unsigned free_sectors_threshold;
+
+	struct workqueue_struct *commit_wq;
+	struct work_struct commit_work;
+
+	struct workqueue_struct *writer_wq;
+	struct work_struct writer_work;
+
+	struct bio_list flush_bio_list;	/* appended under endio_wait.lock, see submit_flush_bio() */
+
+	/* autocommit timer, armed by schedule_autocommit() */
+	unsigned long autocommit_jiffies;
+	struct timer_list autocommit_timer;
+	unsigned autocommit_msec;
+
+	wait_queue_head_t copy_to_journal_wait;
+
+	struct completion crypto_backoff;	/* signalled by complete_journal_encrypt(), waited on in do_crypt() */
+
+	bool journal_uptodate;
+	bool just_formatted;
+
+	struct alg_spec internal_hash_alg;
+	struct alg_spec journal_crypt_alg;
+	struct alg_spec journal_mac_alg;
+};
+
+/*
+ * A range of logical sectors that is currently being processed.  Ranges are
+ * kept in the ic->in_progress tree, which never contains two overlapping
+ * ranges (see add_new_range()).
+ */
+struct dm_integrity_range {
+	sector_t logical_sector;
+	unsigned n_sectors;
+	struct rb_node node;
+};
+
+/*
+ * Per-bio state; the bio is recovered from it with
+ * dm_bio_from_per_bio_data().
+ */
+struct dm_integrity_io {
+	struct work_struct work;
+
+	struct dm_integrity_c *ic;
+	bool write;
+	bool fua;		/* when set, completion goes through submit_flush_bio() */
+
+	struct dm_integrity_range range;	/* the part of the bio currently being processed */
+
+	sector_t metadata_block;
+	unsigned metadata_offset;
+
+	atomic_t in_flight;	/* the bio step is finished when this drops to zero, see dec_in_flight() */
+	int bi_error;		/* copied into bio->bi_error on completion if the bio has no error yet */
+
+	struct completion *completion;
+
+	/* NOTE(review): presumably values saved from the bio for later restoration - users not in this part */
+	struct block_device *orig_bi_bdev;
+	bio_end_io_t *orig_bi_end_io;
+	struct bio_integrity_payload *orig_bi_integrity;
+	struct bvec_iter orig_bi_iter;
+};
+
+/*
+ * Completion shared by several journal operations: "comp" is completed by
+ * complete_journal_op() when "in_flight" drops to zero.
+ */
+struct journal_completion {
+	struct dm_integrity_c *ic;
+	atomic_t in_flight;
+	struct completion comp;
+};
+
+/*
+ * A sector range paired with the journal completion to signal.
+ * NOTE(review): the users of this structure are outside this part of the file.
+ */
+struct journal_io {
+	struct dm_integrity_range range;
+	struct journal_completion *comp;
+};
+
+/* NOTE(review): presumably the slab cache for struct journal_io - creation is not in this part. */
+static struct kmem_cache *journal_io_cache;
+
+/* NOTE(review): presumably the reserve size of ic->journal_io_mempool - confirm at the allocation site. */
+#define JOURNAL_IO_MEMPOOL	32
+
+/*
+ * Debug helpers.  With DEBUG_PRINT defined, DEBUG_print() is a KERN_DEBUG
+ * printk and DEBUG_bytes() prints a formatted message followed by a hex dump
+ * of @len bytes.  Without it both expand to empty statements and their
+ * arguments are not evaluated.
+ */
+#ifdef DEBUG_PRINT
+#define DEBUG_print(x, ...)	printk(KERN_DEBUG x, ##__VA_ARGS__)
+static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
+{
+	va_list args;
+	va_start(args, msg);
+	vprintk(msg, args);
+	va_end(args);
+	/* the separator is only printed when there is something to dump */
+	if (len)
+		pr_cont(":");
+	while (len) {
+		pr_cont(" %02x", *bytes);
+		bytes++;
+		len--;
+	}
+	pr_cont("\n");
+}
+#define DEBUG_bytes(bytes, len, msg, ...)	__DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
+#else
+#define DEBUG_print(x, ...)			do { } while (0)
+#define DEBUG_bytes(bytes, len, msg, ...)	do { } while (0)
+#endif
+
+/*
+ * DM integrity profile.  Both the generate and the verify callback are NULL:
+ * the protection itself is performed by the layer above (dm-crypt).
+ */
+static struct blk_integrity_profile dm_integrity_profile = {
+	.name			= "DM-DIF-EXT-TAG",
+	.generate_fn		= NULL,
+	.verify_fn		= NULL,
+};
+
+/* Forward declarations of functions that are referenced before their definitions. */
+static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
+static void integrity_bio_wait(struct work_struct *w);
+static void dm_integrity_dtr(struct dm_target *ti);
+
+/*
+ * Record an error on the target.
+ *
+ * ic->failed is switched from 0 to @err atomically.  Only the caller that
+ * performs this switch logs the message; once an error is stored, later
+ * calls neither overwrite it nor print anything.
+ */
+static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
+{
+	int previous = cmpxchg(&ic->failed, 0, err);
+
+	if (previous)
+		return;
+
+	DMERR("Error on %s: %d", msg, err);
+}
+
+/*
+ * Return the error recorded by dm_integrity_io_error(), or 0 if the target
+ * has not failed.  The value is read exactly once.
+ */
+static int dm_integrity_failed(struct dm_integrity_c *ic)
+{
+	int err = ACCESS_ONCE(ic->failed);
+
+	return err;
+}
+
+/*
+ * Return the commit id expected in sector @j of section @i for commit
+ * sequence @seq.
+ *
+ * The base id ic->commit_ids[@seq] is xored with the position (section in
+ * the upper 32 bits, sector in the lower ones), so a piece of journal that
+ * was written to the wrong place is detected.
+ */
+static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
+					  unsigned j, unsigned char seq)
+{
+	__u64 position = ((__u64)i << 32) ^ j;
+
+	return ic->commit_ids[seq] ^ cpu_to_le64(position);
+}
+
+/*
+ * Split @data_sector into the index of its interleave area (*@area) and the
+ * sector offset inside that area (*@offset).  An area spans
+ * 1 << sb->log2_interleave_sectors sectors.
+ */
+static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
+				sector_t *area, sector_t *offset)
+{
+	const __u8 shift = ic->sb->log2_interleave_sectors;
+	const unsigned in_area_mask = (1U << shift) - 1;
+
+	*area = data_sector >> shift;
+	*offset = (unsigned)data_sector & in_area_mask;
+}
+
+/*
+ * Convert the sector count or offset @n to blocks, in place.
+ *
+ * @n must be an lvalue: it is modified by the macro.  It is a BUG if @n is
+ * not a multiple of ic->sectors_per_block.
+ */
+#define sector_to_block(ic, n)						\
+do {									\
+	BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1));		\
+	(n) >>= (ic)->sb->log2_sectors_per_block;			\
+} while (0)
+
+/*
+ * Locate the tag that belongs to sector @offset of interleave area @area.
+ *
+ * Returns the position of the tag in units of (1 << log2_buffer_sectors)
+ * sectors and stores the byte offset inside that unit in *@metadata_offset.
+ * @offset must be a multiple of ic->sectors_per_block (enforced by
+ * sector_to_block()).
+ */
+static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area,
+					    sector_t offset, unsigned *metadata_offset)
+{
+	__u64 ms;
+	unsigned mo;
+
+	/* sectors preceding this area: @area data areas and @area metadata runs */
+	ms = area << ic->sb->log2_interleave_sectors;
+	if (likely(ic->log2_metadata_run >= 0))
+		ms += area << ic->log2_metadata_run;
+	else
+		ms += area * ic->metadata_run;
+	/* from sectors to buffer-sized units */
+	ms >>= ic->log2_buffer_sectors;
+
+	/* from here on @offset counts blocks, not sectors */
+	sector_to_block(ic, offset);
+
+	/* byte position of the tag = block index * tag size, split into unit and remainder */
+	if (likely(ic->log2_tag_size >= 0)) {
+		ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size);
+		mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
+	} else {
+		ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors);
+		mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
+	}
+	*metadata_offset = mo;
+	return ms;
+}
+
+/*
+ * Translate (@area, @offset) into a sector number relative to ic->start.
+ *
+ * The data of area N is preceded by N complete data areas, N + 1 metadata
+ * runs and ic->initial_sectors.
+ */
+static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset)
+{
+	sector_t sector = area << ic->sb->log2_interleave_sectors;
+	sector_t metadata_runs = area + 1;
+
+	if (unlikely(ic->log2_metadata_run < 0))
+		sector += metadata_runs * ic->metadata_run;
+	else
+		sector += metadata_runs << ic->log2_metadata_run;
+
+	sector += (sector_t)ic->initial_sectors + offset;
+
+	return sector;
+}
+
+/*
+ * Bring a section index that ran past the end of the journal back into
+ * range by subtracting ic->journal_sections once.
+ */
+static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
+{
+	if (likely(*sec_ptr < ic->journal_sections))
+		return;
+
+	*sec_ptr -= ic->journal_sections;
+}
+
+/*
+ * Transfer the superblock buffer (ic->sb, SB_SECTORS sectors located at
+ * ic->start) with the given operation and flags.  No notify callback is
+ * installed.  Returns the result of dm_io().
+ */
+static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
+{
+	struct dm_io_region sb_region;
+	struct dm_io_request sb_req;
+
+	sb_region.bdev = ic->dev->bdev;
+	sb_region.sector = ic->start;
+	sb_region.count = SB_SECTORS;
+
+	sb_req.client = ic->io;
+	sb_req.bi_op = op;
+	sb_req.bi_op_flags = op_flags;
+	sb_req.mem.type = DM_IO_KMEM;
+	sb_req.mem.ptr.addr = ic->sb;
+	sb_req.notify.fn = NULL;
+
+	return dm_io(&sb_req, 1, &sb_region, NULL);
+}
+
+/*
+ * Debug-only bounds check of a journal access (compiled in with
+ * CONFIG_DM_DEBUG or INTERNAL_VERIFY, otherwise a no-op).
+ *
+ * @offset is checked against the number of entries per section when @e is
+ * true and against the number of sectors per section otherwise.  An invalid
+ * access is reported together with @function and ends in BUG().
+ */
+static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
+				 bool e, const char *function)
+{
+#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY)
+	unsigned limit;
+
+	if (e)
+		limit = ic->journal_section_entries;
+	else
+		limit = ic->journal_section_sectors;
+
+	if (likely(section < ic->journal_sections) && likely(offset < limit))
+		return;
+
+	printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n",
+		function, section, offset, ic->journal_sections, limit);
+	BUG();
+#endif
+}
+
+/*
+ * Translate sector @offset of journal section @section into a position in
+ * the journal page lists: *@pl_index receives the page index and
+ * *@pl_offset the byte offset inside that page.
+ */
+static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset,
+			       unsigned *pl_index, unsigned *pl_offset)
+{
+	unsigned journal_sector, page_nr, byte_in_page;
+
+	access_journal_check(ic, section, offset, false, "page_list_location");
+
+	journal_sector = section * ic->journal_section_sectors + offset;
+	page_nr = journal_sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+	byte_in_page = (journal_sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+	*pl_index = page_nr;
+	*pl_offset = byte_in_page;
+}
+
+/*
+ * Return the address of sector @offset of section @section inside the page
+ * list @pl.  If @n_sectors is not NULL it receives the number of sectors
+ * from that position to the end of the page.
+ */
+static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl,
+					       unsigned section, unsigned offset, unsigned *n_sectors)
+{
+	unsigned page_nr, byte_in_page;
+	char *page_base;
+
+	page_list_location(ic, section, offset, &page_nr, &byte_in_page);
+
+	if (n_sectors != NULL)
+		*n_sectors = (PAGE_SIZE - byte_in_page) >> SECTOR_SHIFT;
+
+	page_base = lowmem_page_address(pl[page_nr].page);
+
+	return (struct journal_sector *)(page_base + byte_in_page);
+}
+
+/*
+ * Return the address of sector @offset of section @section in the in-memory
+ * journal (ic->journal).
+ */
+static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset)
+{
+	struct page_list *journal_pages = ic->journal;
+
+	return access_page_list(ic, journal_pages, section, offset, NULL);
+}
+
+/*
+ * Return entry @n of section @section.
+ *
+ * Entries are striped over the JOURNAL_BLOCK_SECTORS entry sectors of the
+ * section: entry @n lives in sector n % JOURNAL_BLOCK_SECTORS at slot
+ * n / JOURNAL_BLOCK_SECTORS, each slot being ic->journal_entry_size bytes.
+ */
+static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n)
+{
+	struct journal_sector *js;
+	unsigned entry_sector, slot;
+
+	access_journal_check(ic, section, n, true, "access_journal_entry");
+
+	entry_sector = n % JOURNAL_BLOCK_SECTORS;
+	slot = n / JOURNAL_BLOCK_SECTORS;
+
+	js = access_journal(ic, section, entry_sector);
+
+	return (struct journal_entry *)((char *)js + slot * ic->journal_entry_size);
+}
+
+/*
+ * Return the first data sector of block @n in section @section.  The data
+ * sectors start after the JOURNAL_BLOCK_SECTORS entry sectors.
+ */
+static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n)
+{
+	unsigned data_sector = n << ic->sb->log2_sectors_per_block;
+
+	data_sector += JOURNAL_BLOCK_SECTORS;
+
+	access_journal_check(ic, section, data_sector, false, "access_journal_data");
+
+	return access_journal(ic, section, data_sector);
+}
+
+/*
+ * Compute the MAC of journal section @section into @result.
+ *
+ * The hash ic->journal_mac is fed the u.sector field of every entry of the
+ * section.  A digest shorter than JOURNAL_MAC_SIZE is padded with zeroes, a
+ * longer one is truncated.  On a crypto error the failure is recorded with
+ * dm_integrity_io_error() and @result is set to all zeroes.
+ */
+static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE])
+{
+	SHASH_DESC_ON_STACK(desc, ic->journal_mac);
+	int r;
+	unsigned j, size;
+
+	desc->tfm = ic->journal_mac;
+	desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	r = crypto_shash_init(desc);
+	if (unlikely(r)) {
+		dm_integrity_io_error(ic, "crypto_shash_init", r);
+		goto err;
+	}
+
+	/* only the sector numbers of the entries are authenticated here */
+	for (j = 0; j < ic->journal_section_entries; j++) {
+		struct journal_entry *je = access_journal_entry(ic, section, j);
+		r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector);
+		if (unlikely(r)) {
+			dm_integrity_io_error(ic, "crypto_shash_update", r);
+			goto err;
+		}
+	}
+
+	size = crypto_shash_digestsize(ic->journal_mac);
+
+	if (likely(size <= JOURNAL_MAC_SIZE)) {
+		/* the digest fits: write it directly and zero the rest */
+		r = crypto_shash_final(desc, result);
+		if (unlikely(r)) {
+			dm_integrity_io_error(ic, "crypto_shash_final", r);
+			goto err;
+		}
+		memset(result + size, 0, JOURNAL_MAC_SIZE - size);
+	} else {
+		/*
+		 * The digest is too long: finalize into a temporary and keep
+		 * the first JOURNAL_MAC_SIZE bytes.
+		 * NOTE(review): this is a variable-length array on the stack,
+		 * sized by the digest size of the configured algorithm.
+		 */
+		__u8 digest[size];
+		r = crypto_shash_final(desc, digest);
+		if (unlikely(r)) {
+			dm_integrity_io_error(ic, "crypto_shash_final", r);
+			goto err;
+		}
+		memcpy(result, digest, JOURNAL_MAC_SIZE);
+	}
+
+	return;
+err:
+	memset(result, 0, JOURNAL_MAC_SIZE);
+}
+
+/*
+ * Store (@wr true) or verify (@wr false) the MAC of journal section
+ * @section.  Nothing is done when no journal MAC is configured.
+ *
+ * The JOURNAL_MAC_SIZE bytes computed by section_mac() are distributed over
+ * the entry sectors of the section, JOURNAL_MAC_PER_SECTOR bytes in each.
+ * A mismatch during verification is reported as -EILSEQ through
+ * dm_integrity_io_error().
+ */
+static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr)
+{
+	__u8 mac[JOURNAL_MAC_SIZE];
+	unsigned sec;
+
+	if (!ic->journal_mac)
+		return;
+
+	section_mac(ic, section, mac);
+
+	for (sec = 0; sec < JOURNAL_BLOCK_SECTORS; sec++) {
+		struct journal_sector *js = access_journal(ic, section, sec);
+		__u8 *slice = mac + (sec * JOURNAL_MAC_PER_SECTOR);
+
+		if (likely(wr)) {
+			memcpy(&js->mac, slice, JOURNAL_MAC_PER_SECTOR);
+			continue;
+		}
+
+		if (memcmp(&js->mac, slice, JOURNAL_MAC_PER_SECTOR))
+			dm_integrity_io_error(ic, "journal mac", -EILSEQ);
+	}
+}
+
+/*
+ * Drop one reference of the journal completion passed as @context and
+ * complete it when this was the last one.  It is a BUG to call this with no
+ * reference outstanding.
+ */
+static void complete_journal_op(void *context)
+{
+	struct journal_completion *jc = context;
+
+	BUG_ON(!atomic_read(&jc->in_flight));
+
+	if (unlikely(!atomic_dec_and_test(&jc->in_flight)))
+		return;
+
+	complete(&jc->comp);
+}
+
+/*
+ * Transform @n_sections journal sections starting at @section by xoring
+ * them with the ic->journal_xor pages.
+ *
+ * With @encrypt set the source is ic->journal and the destination
+ * ic->journal_io, and the MAC of every section is stored before the first
+ * page of that section is submitted; otherwise the direction is reversed
+ * and no MAC is touched.  One reference on @comp is taken per page;
+ * complete_journal_op() drops it when the xor of that page finishes.
+ */
+static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
+			unsigned n_sections, struct journal_completion *comp)
+{
+	struct async_submit_ctl submit;
+	size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT;
+	unsigned pl_index, pl_offset, section_index;
+	struct page_list *source_pl, *target_pl;
+
+	if (likely(encrypt)) {
+		source_pl = ic->journal;
+		target_pl = ic->journal_io;
+	} else {
+		source_pl = ic->journal_io;
+		target_pl = ic->journal;
+	}
+
+	page_list_location(ic, section, 0, &pl_index, &pl_offset);
+
+	/* one in-flight reference for every page touched by the range */
+	atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight);
+
+	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL);
+
+	/* page index at which the next not yet handled section starts */
+	section_index = pl_index;
+
+	do {
+		size_t this_step;
+		struct page *src_pages[2];
+		struct page *dst_page;
+
+		/*
+		 * Handle every section that starts in the current page: store
+		 * its MAC (encryption only) and look up where the following
+		 * section starts.
+		 */
+		while (unlikely(pl_index == section_index)) {
+			unsigned dummy;
+			if (likely(encrypt))
+				rw_section_mac(ic, section, true);
+			section++;
+			n_sections--;
+			if (!n_sections)
+				break;
+			page_list_location(ic, section, 0, &section_index, &dummy);
+		}
+
+		/* xor the part of the range that lies in this page */
+		this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset);
+		dst_page = target_pl[pl_index].page;
+		src_pages[0] = source_pl[pl_index].page;
+		src_pages[1] = ic->journal_xor[pl_index].page;
+
+		async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit);
+
+		/* only the first page can start at a non-zero offset */
+		pl_index++;
+		pl_offset = 0;
+		n_bytes -= this_step;
+	} while (n_bytes);
+
+	/* all sections must have been visited by the loop above */
+	BUG_ON(n_sections);
+
+	async_tx_issue_pending_all();
+}
+
+/*
+ * Completion callback of the journal cipher requests.
+ *
+ * -EINPROGRESS only signals ic->crypto_backoff (do_crypt() waits on it after
+ * receiving -EBUSY) and does not finish the operation.  Any other error is
+ * recorded on the target; in that case, as well as on success, one reference
+ * of the journal completion is dropped.
+ */
+static void complete_journal_encrypt(struct crypto_async_request *req, int err)
+{
+	struct journal_completion *jc = req->data;
+
+	if (likely(!err)) {
+		complete_journal_op(jc);
+		return;
+	}
+
+	if (likely(err == -EINPROGRESS)) {
+		complete(&jc->ic->crypto_backoff);
+		return;
+	}
+
+	dm_integrity_io_error(jc->ic, "asynchronous encrypt", err);
+	complete_journal_op(jc);
+}
+
+/*
+ * Submit one cipher request (encryption when @encrypt is set, decryption
+ * otherwise) with complete_journal_encrypt() as its callback.
+ *
+ * Returns true when the request will be finished by the callback
+ * (-EINPROGRESS, or -EBUSY after waiting for ic->crypto_backoff) and false
+ * when it is already finished, either successfully or with an error that
+ * has been recorded on the target.
+ */
+static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
+{
+	int r;
+
+	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+				      complete_journal_encrypt, comp);
+
+	r = encrypt ? crypto_skcipher_encrypt(req) : crypto_skcipher_decrypt(req);
+
+	switch (r) {
+	case 0:
+		return false;
+	case -EINPROGRESS:
+		return true;
+	case -EBUSY:
+		/* wait until the callback signals the back-off completion */
+		wait_for_completion(&comp->ic->crypto_backoff);
+		reinit_completion(&comp->ic->crypto_backoff);
+		return true;
+	default:
+		dm_integrity_io_error(comp->ic, "encrypt", r);
+		return false;
+	}
+}
+
+/*
+ * Encrypt (@encrypt set: ic->journal -> ic->journal_io) or decrypt (the
+ * reverse direction) @n_sections journal sections starting at @section with
+ * the per-section cipher requests in ic->sk_requests.  When encrypting, the
+ * MAC of each section is stored first.
+ *
+ * @comp is held by two extra references while requests are being submitted,
+ * so that it cannot complete early; every request that finishes
+ * asynchronously adds one more, which its callback drops.
+ */
+static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
+			  unsigned n_sections, struct journal_completion *comp)
+{
+	struct scatterlist **source_sg;
+	struct scatterlist **target_sg;
+
+	atomic_add(2, &comp->in_flight);
+
+	if (likely(encrypt)) {
+		source_sg = ic->journal_scatterlist;
+		target_sg = ic->journal_io_scatterlist;
+	} else {
+		source_sg = ic->journal_io_scatterlist;
+		target_sg = ic->journal_scatterlist;
+	}
+
+	do {
+		struct skcipher_request *req;
+		unsigned ivsize;
+		char *iv;
+
+		if (likely(encrypt))
+			rw_section_mac(ic, section, true);
+
+		req = ic->sk_requests[section];
+		ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
+		iv = req->iv;
+
+		/*
+		 * Reload the IV from the copy stored right behind it.
+		 * NOTE(review): presumably a pristine copy placed there by the
+		 * setup code, which is not in this part of the file.
+		 */
+		memcpy(iv, iv + ivsize, ivsize);
+
+		req->src = source_sg[section];
+		req->dst = target_sg[section];
+
+		/* an asynchronous request keeps its own reference on @comp */
+		if (unlikely(do_crypt(encrypt, req, comp)))
+			atomic_inc(&comp->in_flight);
+
+		section++;
+		n_sections--;
+	} while (n_sections);
+
+	/* drop the two submission references; the second drop may complete @comp */
+	atomic_dec(&comp->in_flight);
+	complete_journal_op(comp);
+}
+
+/*
+ * Transform @n_sections journal sections starting at @section between
+ * ic->journal and ic->journal_io (@encrypt selects the direction).
+ *
+ * The xor based transformation is used when ic->journal_xor is set,
+ * otherwise the sections are processed with the skcipher requests.
+ * Completion is reported through @comp.
+ *
+ * Both helpers return void, so they are called as plain statements:
+ * "return <expression>;" is a constraint violation in a void function.
+ */
+static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
+			    unsigned n_sections, struct journal_completion *comp)
+{
+	if (ic->journal_xor)
+		xor_journal(ic, encrypt, section, n_sections, comp);
+	else
+		crypt_journal(ic, encrypt, section, n_sections, comp);
+}
+
+/*
+ * dm_io notify callback for journal I/O.  A non-zero @error is recorded on
+ * the target as -EIO; in any case one reference of the journal completion
+ * passed as @context is dropped.
+ */
+static void complete_journal_io(unsigned long error, void *context)
+{
+	struct journal_completion *jc = context;
+
+	if (unlikely(error))
+		dm_integrity_io_error(jc->ic, "writing journal", -EIO);
+
+	complete_journal_op(jc);
+}
+
+/*
+ * Read or write @n_sections journal sections starting at @section.
+ *
+ * The memory side is ic->journal_io when it exists and ic->journal
+ * otherwise; on the device the journal starts SB_SECTORS after ic->start.
+ * With a non-NULL @comp the completion is reported through
+ * complete_journal_io(); with a NULL @comp no notify callback is installed.
+ * NOTE(review): dm_io() is expected to run synchronously in the latter
+ * case - confirm against the dm-io API.
+ *
+ * If the target has already failed nothing is submitted and @comp, if any,
+ * is completed with an error.
+ */
+static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
+		       unsigned n_sections, struct journal_completion *comp)
+{
+	struct dm_io_request io_req;
+	struct dm_io_region io_loc;
+	unsigned sector, n_sectors, pl_index, pl_offset;
+	int r;
+
+	if (unlikely(dm_integrity_failed(ic))) {
+		if (comp)
+			complete_journal_io(-1UL, comp);
+		return;
+	}
+
+	/* position and length of the range in journal sectors */
+	sector = section * ic->journal_section_sectors;
+	n_sectors = n_sections * ic->journal_section_sectors;
+
+	/* same translation as page_list_location() with offset 0 */
+	pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+	pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+	io_req.bi_op = op;
+	io_req.bi_op_flags = op_flags;
+	io_req.mem.type = DM_IO_PAGE_LIST;
+	if (ic->journal_io)
+		io_req.mem.ptr.pl = &ic->journal_io[pl_index];
+	else
+		io_req.mem.ptr.pl = &ic->journal[pl_index];
+	io_req.mem.offset = pl_offset;
+	if (likely(comp != NULL)) {
+		io_req.notify.fn = complete_journal_io;
+		io_req.notify.context = comp;
+	} else {
+		io_req.notify.fn = NULL;
+	}
+	io_req.client = ic->io;
+	io_loc.bdev = ic->dev->bdev;
+	io_loc.sector = ic->start + SB_SECTORS + sector;
+	io_loc.count = n_sectors;
+
+	r = dm_io(&io_req, 1, &io_loc, NULL);
+	if (unlikely(r)) {
+		dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r);
+		/* the callback will not run for a failed submission: drop the reference here */
+		if (comp) {
+			WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
+			complete_journal_io(-1UL, comp);
+		}
+	}
+}
+
+/*
+ * Write @commit_sections journal sections starting at @commit_start to the
+ * device with REQ_FUA and wait for the writes to finish.
+ *
+ * Before a section is written it is either transformed into ic->journal_io
+ * by encrypt_journal() (which also stores the section MAC) or, when there
+ * is no ic->journal_io, only gets its MAC stored.
+ *
+ * A range that wraps around the end of the journal is written as two
+ * pieces; with ic->journal_io the transformation of the second piece is
+ * overlapped with the write (or with the transformation) of the first one.
+ */
+static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
+{
+	struct journal_completion io_comp;
+	struct journal_completion crypt_comp_1;
+	struct journal_completion crypt_comp_2;
+	unsigned i;
+
+	io_comp.ic = ic;
+	io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp);
+
+	if (commit_start + commit_sections <= ic->journal_sections) {
+		/* contiguous range: a single write */
+		io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
+		if (ic->journal_io) {
+			crypt_comp_1.ic = ic;
+			crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+			crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
+			encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
+			wait_for_completion_io(&crypt_comp_1.comp);
+		} else {
+			for (i = 0; i < commit_sections; i++)
+				rw_section_mac(ic, commit_start + i, true);
+		}
+		rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp);
+	} else {
+		/* wrapped range: [commit_start, end of journal) and [0, rest) - two writes */
+		unsigned to_end;
+		io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
+		to_end = ic->journal_sections - commit_start;
+		if (ic->journal_io) {
+			crypt_comp_1.ic = ic;
+			crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+			crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
+			encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
+			if (try_wait_for_completion(&crypt_comp_1.comp)) {
+				/*
+				 * The first piece is already transformed: start
+				 * writing it and reuse crypt_comp_1 for the
+				 * second piece.
+				 */
+				rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
+				crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+				crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
+				encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
+				wait_for_completion_io(&crypt_comp_1.comp);
+			} else {
+				/*
+				 * The first piece is still being transformed:
+				 * start the second one with its own completion,
+				 * then wait for and write the first piece.
+				 */
+				crypt_comp_2.ic = ic;
+				crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp);
+				crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
+				encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
+				wait_for_completion_io(&crypt_comp_1.comp);
+				rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
+				wait_for_completion_io(&crypt_comp_2.comp);
+			}
+		} else {
+			for (i = 0; i < to_end; i++)
+				rw_section_mac(ic, commit_start + i, true);
+			rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
+			for (i = 0; i < commit_sections - to_end; i++)
+				rw_section_mac(ic, i, true);
+		}
+		/* the second piece is ready on every path above */
+		rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp);
+	}
+
+	wait_for_completion_io(&io_comp.comp);
+}
+
+/*
+ * Write @n_sectors data sectors of the in-memory journal - starting at data
+ * sector @offset of section @section - to sector @target of the device
+ * (relative to ic->start).
+ *
+ * @target, @n_sectors and @offset must be multiples of
+ * ic->sectors_per_block.  @fn is called with @data when the write finishes;
+ * it is called directly with an error when the target has already failed or
+ * the submission fails.
+ */
+static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset,
+			      unsigned n_sectors, sector_t target, io_notify_fn fn, void *data)
+{
+	struct dm_io_request io_req;
+	struct dm_io_region io_loc;
+	int r;
+	unsigned sector, pl_index, pl_offset;
+
+	BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1));
+
+	if (unlikely(dm_integrity_failed(ic))) {
+		fn(-1UL, data);
+		return;
+	}
+
+	/* data sectors follow the JOURNAL_BLOCK_SECTORS entry sectors of the section */
+	sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset;
+
+	pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+	pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+	io_req.bi_op = REQ_OP_WRITE;
+	io_req.bi_op_flags = 0;
+	io_req.mem.type = DM_IO_PAGE_LIST;
+	/* the source is always ic->journal, never ic->journal_io */
+	io_req.mem.ptr.pl = &ic->journal[pl_index];
+	io_req.mem.offset = pl_offset;
+	io_req.notify.fn = fn;
+	io_req.notify.context = data;
+	io_req.client = ic->io;
+	io_loc.bdev = ic->dev->bdev;
+	io_loc.sector = ic->start + target;
+	io_loc.count = n_sectors;
+
+	r = dm_io(&io_req, 1, &io_loc, NULL);
+	if (unlikely(r)) {
+		WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
+		fn(-1UL, data);
+	}
+}
+
+/*
+ * Try to insert @new_range into the tree of in-progress ranges.
+ *
+ * Returns false, leaving the tree untouched, when the range overlaps a
+ * range that is already in the tree; returns true after a successful
+ * insertion.  Start and length must be multiples of ic->sectors_per_block.
+ * The tree is documented as protected by endio_wait.lock.
+ */
+static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+{
+	struct rb_node **link = &ic->in_progress.rb_node;
+	struct rb_node *parent = NULL;
+	sector_t new_start, new_end;
+
+	BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
+
+	new_start = new_range->logical_sector;
+	new_end = new_range->logical_sector + new_range->n_sectors;
+
+	while (*link) {
+		struct dm_integrity_range *cur = container_of(*link, struct dm_integrity_range, node);
+
+		parent = *link;
+
+		if (new_end <= cur->logical_sector)
+			link = &cur->node.rb_left;	/* entirely before cur */
+		else if (new_start >= cur->logical_sector + cur->n_sectors)
+			link = &cur->node.rb_right;	/* entirely after cur */
+		else
+			return false;			/* overlap */
+	}
+
+	rb_link_node(&new_range->node, parent, link);
+	rb_insert_color(&new_range->node, &ic->in_progress);
+
+	return true;
+}
+
+/*
+ * Remove @range from the in-progress tree and wake the waiters on
+ * ic->endio_wait.  The function takes no lock itself: the caller must hold
+ * ic->endio_wait.lock (see remove_range()).
+ */
+static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
+{
+	rb_erase(&range->node, &ic->in_progress);
+	wake_up_locked(&ic->endio_wait);
+}
+
+/*
+ * Remove @range from the in-progress tree and wake the waiters, taking
+ * ic->endio_wait.lock around the operation.  Interrupt state is saved and
+ * restored.
+ */
+static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ic->endio_wait.lock, flags);
+	remove_range_unlocked(ic, range);
+	spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
+}
+
+/*
+ * Put @node into the "not in the tree" state: no valid sector and a cleared
+ * rb-tree linkage, so that RB_EMPTY_NODE() is true for it.
+ */
+static void init_journal_node(struct journal_node *node)
+{
+	node->sector = (sector_t)-1;
+	RB_CLEAR_NODE(&node->node);
+}
+
+/*
+ * Insert @node into the journal tree under the key @sector.
+ *
+ * The node must not be in the tree yet.  Equal keys are allowed: the search
+ * descends to the right on equality, so the new node ends up after all
+ * existing nodes with the same sector.
+ */
+static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector)
+{
+	struct rb_node **slot = &ic->journal_tree_root.rb_node;
+	struct rb_node *parent = NULL;
+
+	node->sector = sector;
+	BUG_ON(!RB_EMPTY_NODE(&node->node));
+
+	while (*slot) {
+		struct journal_node *cur = container_of(*slot, struct journal_node, node);
+
+		parent = *slot;
+		slot = sector < cur->sector ? &cur->node.rb_left : &cur->node.rb_right;
+	}
+
+	rb_link_node(&node->node, parent, slot);
+	rb_insert_color(&node->node, &ic->journal_tree_root);
+}
+
+/*
+ * Remove @node from the journal tree and reset it to the "not in the tree"
+ * state.  It is a BUG to call this for a node that is not in the tree.
+ */
+static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node)
+{
+	BUG_ON(RB_EMPTY_NODE(&node->node));
+	rb_erase(&node->node, &ic->journal_tree_root);
+	init_journal_node(node);
+}
+
+/* Returned by find_journal_node() when no entry matches. */
+#define NOT_FOUND	(-1U)
+
+/*
+ * Look up @sector in the journal tree.
+ *
+ * Returns the index (into ic->journal_tree) of a node with that sector, or
+ * NOT_FOUND.  The search keeps descending to the right after a match, so
+ * among several nodes with the same sector the last match on the search
+ * path is the one returned.  *@next_sector receives the sector of the last
+ * node on the path whose sector is greater than @sector, or (sector_t)-1
+ * when there is none.
+ */
+static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector)
+{
+	struct rb_node *cur = ic->journal_tree_root.rb_node;
+	unsigned index = NOT_FOUND;
+
+	*next_sector = (sector_t)-1;
+
+	while (cur) {
+		struct journal_node *jn = container_of(cur, struct journal_node, node);
+
+		if (sector < jn->sector) {
+			*next_sector = jn->sector;
+			cur = jn->node.rb_left;
+			continue;
+		}
+
+		if (sector == jn->sector)
+			index = jn - ic->journal_tree;
+
+		cur = jn->node.rb_right;
+	}
+
+	return index;
+}
+
+/*
+ * Check that journal entry @pos is in the tree under @sector and that the
+ * node following it in tree order does not carry the same sector.
+ *
+ * Returns false when @pos is out of range, the node is not in the tree, the
+ * sector differs or the following node has the same sector.
+ */
+static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector)
+{
+	struct journal_node *node;
+	struct rb_node *succ;
+
+	if (unlikely(pos >= ic->journal_entries))
+		return false;
+
+	node = &ic->journal_tree[pos];
+
+	if (unlikely(RB_EMPTY_NODE(&node->node)) || unlikely(node->sector != sector))
+		return false;
+
+	succ = rb_next(&node->node);
+	if (unlikely(!succ))
+		return true;
+
+	return container_of(succ, struct journal_node, node)->sector != sector;
+}
+
+/*
+ * Tell whether the node following @node in tree order describes the same
+ * sector and belongs to a committed section.
+ *
+ * The committed sections are the n_committed_sections sections starting at
+ * committed_section; the second test covers the part of that window that
+ * wraps around the end of the journal.  @node must be in the tree.
+ */
+static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node)
+{
+	struct journal_node *succ_node;
+	struct rb_node *succ;
+	unsigned succ_section, committed_end;
+
+	BUG_ON(RB_EMPTY_NODE(&node->node));
+
+	succ = rb_next(&node->node);
+	if (unlikely(!succ))
+		return false;
+
+	succ_node = container_of(succ, struct journal_node, node);
+	if (succ_node->sector != node->sector)
+		return false;
+
+	succ_section = (unsigned)(succ_node - ic->journal_tree) / ic->journal_section_entries;
+	committed_end = ic->committed_section + ic->n_committed_sections;
+
+	if (succ_section >= ic->committed_section && succ_section < committed_end)
+		return true;
+
+	return succ_section + ic->journal_sections < committed_end;
+}
+
+/* Operations of dm_integrity_rw_tag(). */
+#define TAG_READ	0
+#define TAG_WRITE	1
+#define TAG_CMP		2
+
+/*
+ * Read, write or compare @total_size tag bytes through the bufio cache.
+ *
+ * The position is given by *@metadata_block (buffer number) and
+ * *@metadata_offset (byte offset in that buffer); both are advanced past the
+ * processed bytes, moving to the next buffer when the end of one is reached.
+ *
+ * Returns 0 on success and a negative error when the target has failed or a
+ * buffer cannot be read.  For TAG_CMP a mismatch returns a positive value:
+ * the number of bytes from the first differing byte to the end of the
+ * request; the position is not advanced over the mismatching buffer part.
+ */
+static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block,
+			       unsigned *metadata_offset, unsigned total_size, int op)
+{
+	do {
+		unsigned char *data, *dp;
+		struct dm_buffer *b;
+		unsigned to_copy;
+		int r;
+
+		r = dm_integrity_failed(ic);
+		if (unlikely(r))
+			return r;
+
+		data = dm_bufio_read(ic->bufio, *metadata_block, &b);
+		if (unlikely(IS_ERR(data)))
+			return PTR_ERR(data);
+
+		/* process up to the end of this buffer or the end of the request */
+		to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size);
+		dp = data + *metadata_offset;
+		if (op == TAG_READ) {
+			memcpy(tag, dp, to_copy);
+		} else if (op == TAG_WRITE) {
+			memcpy(dp, tag, to_copy);
+			dm_bufio_mark_buffer_dirty(b);
+		} else  {
+			/* e.g.: op == TAG_CMP */
+			if (unlikely(memcmp(dp, tag, to_copy))) {
+				unsigned i;
+
+				/* discount the bytes that match before the first difference */
+				for (i = 0; i < to_copy; i++) {
+					if (dp[i] != tag[i])
+						break;
+					total_size--;
+				}
+				dm_bufio_release(b);
+				return total_size;
+			}
+		}
+		dm_bufio_release(b);
+
+		tag += to_copy;
+		*metadata_offset += to_copy;
+		/* end of the buffer reached: continue at the start of the next one */
+		if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) {
+			(*metadata_block)++;
+			*metadata_offset = 0;
+		}
+		total_size -= to_copy;
+	} while (unlikely(total_size));
+
+	return 0;
+}
+
+/*
+ * Write out the dirty tag buffers of the bufio client.  A failure is
+ * recorded on the target.
+ */
+static void dm_integrity_flush_buffers(struct dm_integrity_c *ic)
+{
+	int err = dm_bufio_write_dirty_buffers(ic->bufio);
+
+	if (likely(!err))
+		return;
+
+	dm_integrity_io_error(ic, "writing tags", err);
+}
+
+/*
+ * Sleep until ic->endio_wait is woken up.
+ *
+ * Must be called with ic->endio_wait.lock held and interrupts disabled: the
+ * lock is released only for the sleep itself and is held again on return.
+ * The task is queued and marked uninterruptible before the lock is dropped,
+ * so a wake-up issued under the lock cannot be missed.
+ */
+static void sleep_on_endio_wait(struct dm_integrity_c *ic)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	__add_wait_queue(&ic->endio_wait, &wait);
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock_irq(&ic->endio_wait.lock);
+	io_schedule();
+	spin_lock_irq(&ic->endio_wait.lock);
+	__remove_wait_queue(&ic->endio_wait, &wait);
+}
+
+/*
+ * Timer callback (@data is the struct dm_integrity_c pointer): queue the
+ * commit work unless the target has failed.
+ */
+static void autocommit_fn(unsigned long data)
+{
+	struct dm_integrity_c *ic = (struct dm_integrity_c *)data;
+
+	if (unlikely(dm_integrity_failed(ic)))
+		return;
+
+	queue_work(ic->commit_wq, &ic->commit_work);
+}
+
+/*
+ * Arm the autocommit timer to fire ic->autocommit_jiffies from now.  A timer
+ * that is already pending is left alone, so its deadline is not pushed back.
+ */
+static void schedule_autocommit(struct dm_integrity_c *ic)
+{
+	if (timer_pending(&ic->autocommit_timer))
+		return;
+
+	mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies);
+}
+
+/*
+ * Append the bio that owns @dio to ic->flush_bio_list (under
+ * ic->endio_wait.lock) and queue the commit work.
+ */
+static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
+{
+	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+
+	spin_lock_irq(&ic->endio_wait.lock);
+	bio_list_add(&ic->flush_bio_list, bio);
+	spin_unlock_irq(&ic->endio_wait.lock);
+
+	queue_work(ic->commit_wq, &ic->commit_work);
+}
+
+/*
+ * Complete @bio.  If the target has failed and the bio carries no error of
+ * its own, the target's error is reported on the bio.
+ */
+static void do_endio(struct dm_integrity_c *ic, struct bio *bio)
+{
+	int target_err = dm_integrity_failed(ic);
+
+	if (unlikely(target_err)) {
+		if (!bio->bi_error)
+			bio->bi_error = target_err;
+	}
+
+	bio_endio(bio);
+}
+
+/*
+ * Finish the bio that owns @dio.  A FUA request that has no error on a
+ * target that has not failed is handed to submit_flush_bio(); everything
+ * else is completed right away.
+ */
+static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
+{
+	struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+	bool needs_flush;
+
+	needs_flush = unlikely(dio->fua) && likely(!bio->bi_error) &&
+		      likely(!dm_integrity_failed(ic));
+
+	if (needs_flush) {
+		submit_flush_bio(ic, dio);
+		return;
+	}
+
+	do_endio(ic, bio);
+}
+
+/*
+ * Drop one in-flight reference of @dio.  When it was the last one:
+ *  - the range is removed from the in-progress tree,
+ *  - the autocommit timer is armed for writes,
+ *  - an error stored in @dio is propagated to a bio that has none,
+ *  - a bio without error whose remaining size differs from the processed
+ *    range is advanced past that range and queued on ic->wait_wq
+ *    (integrity_bio_wait) for further processing,
+ *  - otherwise the bio is finished through do_endio_flush().
+ */
+static void dec_in_flight(struct dm_integrity_io *dio)
+{
+	struct dm_integrity_c *ic;
+	struct bio *bio;
+
+	if (!atomic_dec_and_test(&dio->in_flight))
+		return;
+
+	ic = dio->ic;
+
+	remove_range(ic, &dio->range);
+
+	if (unlikely(dio->write))
+		schedule_autocommit(ic);
+
+	bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
+
+	if (unlikely(dio->bi_error) && !bio->bi_error)
+		bio->bi_error = dio->bi_error;
+
+	if (unlikely(bio->bi_error) || likely(bio_sectors(bio) == dio->range.n_sectors)) {
+		do_endio_flush(ic, dio);
+		return;
+	}
+
+	/* only a part of the bio was processed: continue with the rest */
+	dio->range.logical_sector += dio->range.n_sectors;
+	bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT);
+	INIT_WORK(&dio->work, integrity_bio_wait);
+	queue_work(ic->wait_wq, &dio->work);
+}
+