-rw-r--r--   Documentation/device-mapper/dm-integrity.txt |  189
-rw-r--r--   drivers/md/Kconfig                            |   10
-rw-r--r--   drivers/md/Makefile                           |    1
-rw-r--r--   drivers/md/dm-integrity.c                     | 3085
4 files changed, 3285 insertions, 0 deletions
diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt
new file mode 100644
index 000000000000..2406f56501dc
--- /dev/null
+++ b/Documentation/device-mapper/dm-integrity.txt
@@ -0,0 +1,189 @@
+The dm-integrity target emulates a block device that has additional
+per-sector tags that can be used for storing integrity information.
+
+A general problem with storing integrity tags with every sector is that
+writing the sector and the integrity tag must be atomic - i.e. in case
+of crash, either both the sector and the integrity tag are written or
+neither is.
+
+To guarantee write atomicity, the dm-integrity target uses a journal: it
+writes sector data and integrity tags into the journal, commits the
+journal and then copies the data and integrity tags to their respective
+locations.
+
+The dm-integrity target can be used with the dm-crypt target - in this
+situation the dm-crypt target creates the integrity data and passes it
+to the dm-integrity target via the bio_integrity_payload attached to the
+bio. In this mode, the dm-crypt and dm-integrity targets provide
+authenticated disk encryption - if an attacker modifies the encrypted
+device, an I/O error is returned instead of random data.
+
+The dm-integrity target can also be used as a standalone target; in this
+mode it calculates and verifies the integrity tags internally, and it
+can be used to detect silent data corruption on the disk or in the I/O
+path.
+
+
+When loading the target for the first time, the kernel driver will format
+the device - but only if the superblock contains zeroes. If the
+superblock is neither valid nor zeroed, the dm-integrity target can't be
+loaded.
+
+To use the target for the first time (an example session is shown below):
+1. overwrite the superblock with zeroes
+2. load the dm-integrity target with a one-sector size; the kernel driver
+   will format the device
+3. unload the dm-integrity target
+4. read the "provided_data_sectors" value from the superblock
+5. load the dm-integrity target with the target size
+   "provided_data_sectors"
+6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
+   with the size "provided_data_sectors"
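+
+An example session (hypothetical - the device name and the resulting size
+of 1953125 sectors are illustrative; the table syntax follows the "Target
+arguments" described below):
+
+	# step 1: zero the superblock (the first 4kiB, assuming zero
+	# reserved sectors)
+	dd if=/dev/zero of=/dev/sdb bs=4k count=1 oflag=direct
+	# steps 2 and 3: load with a one-sector size so that the kernel
+	# driver formats the device, then unload
+	dmsetup create tmp --table "0 1 integrity /dev/sdb 0 - J 0"
+	dmsetup remove tmp
+	# step 4: read "provided_data_sectors" - a little-endian 64-bit
+	# value at byte offset 16 of the superblock (see the sketch at the
+	# end of this document)
+	# steps 5 and 6: load the target(s) with that size
+	dmsetup create integ --table "0 1953125 integrity /dev/sdb 0 - J 0"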
+
+
+Target arguments:
+
+1. the underlying block device
+
+2. the number of reserved sectors at the beginning of the device - the
+   dm-integrity target won't read or write these sectors
+
+3. the size of the integrity tag (if "-" is used, the size is taken from
+ the internal-hash algorithm)
+
+4. mode:
+	D - direct writes (without journal) - in this mode, journaling is
+	    not used and data sectors and integrity tags are written
+	    separately. In case of crash, it is possible that the data
+	    and integrity tag don't match.
+	J - journaled writes - data and integrity tags are written to the
+	    journal and atomicity is guaranteed. In case of crash,
+	    either both data and tag or none of them are written. The
+	    journaled mode halves write throughput because the data
+	    have to be written twice.
+
+5. the number of additional arguments
+
+Additional arguments:
+
+journal-sectors:number
+	The size of the journal. This argument is used only when
+	formatting the device. If the device is already formatted, the
+	value from the superblock is used.
+
+interleave-sectors:number
+	The number of interleaved sectors. This value is rounded down to
+	a power of two. If the device is already formatted, the value
+	from the superblock is used.
+
+buffer-sectors:number
+	The number of sectors in one buffer. The value is rounded down
+	to a power of two.
+
+	The tag area is accessed using buffers of this configurable
+	size. A larger buffer size means larger I/Os, but fewer of them
+	are issued.
+
+journal-watermark:number
+	The journal watermark as a percentage. When the amount of used
+	journal space exceeds this watermark, the thread that flushes
+	the journal is started.
+
+commit-time:number
+	Commit time in milliseconds. When this time passes, the journal
+	is written. The journal is also written immediately when a FLUSH
+	request is received.
+
+internal-hash:algorithm(:key) (the key is optional)
+	Use an internal hash or crc.
+	When this argument is used, the dm-integrity target won't accept
+	integrity tags from the upper target, but it will automatically
+	generate and verify the integrity tags.
+
+	You can use a crc algorithm (such as crc32); the dm-integrity
+	target will then protect the data against accidental corruption.
+	You can also use an hmac algorithm (for example
+	"hmac(sha256):0123456789abcdef"); in this mode, it provides
+	cryptographic authentication of the data without encryption.
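+
+	For example (hypothetical table lines; the device and the size of
+	1953125 sectors are illustrative):
+
+	0 1953125 integrity /dev/sdb 0 - J 1 internal-hash:crc32c
+	0 1953125 integrity /dev/sdb 0 - J 1 internal-hash:hmac(sha256):0123456789abcdef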
+
+ When this argument is not used, the integrity tags are accepted
+ from an upper layer target, such as dm-crypt. The upper layer
+ target should check the validity of the integrity tags.
+
+journal-crypt:algorithm(:key) (the key is optional)
+	Encrypt the journal using the given algorithm so that an
+	attacker can't read it. You can use a block cipher here (such
+	as "cbc(aes)") or a stream cipher (for example "chacha20",
+	"salsa20", "ctr(aes)" or "ecb(arc4)").
+
+	The journal contains a history of recent writes to the block
+	device; an attacker reading the journal could see the last
+	sector numbers that were written. From the sector numbers, the
+	attacker can infer the size of files that were written. To
+	protect against this situation, you can encrypt the journal.
+
+journal-mac:algorithm(:key) (the key is optional)
+	Protect sector numbers in the journal from accidental or
+	malicious modification. To protect against accidental
+	modification, use a crc algorithm; to protect against malicious
+	modification, use an hmac algorithm with a key.
+
+	This option is not needed when using internal-hash, because in
+	that mode the integrity of journal entries is checked when
+	replaying the journal; thus, a modified sector number would be
+	detected at this stage.
+
+
+The journal mode (D/J), buffer-sectors, journal-watermark and commit-time
+can be changed when reloading the target (load an inactive table and swap
+the tables with suspend and resume), as shown below. The other arguments
+should not be changed when reloading the target, because the layout of the
+disk data depends on them and the reloaded target would be non-functional.
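+
+For example, to switch a loaded target named "integ" from journaled to
+direct mode (a hypothetical session - the table line must repeat the
+original arguments except the mode):
+
+	dmsetup reload integ --table "0 1953125 integrity /dev/sdb 0 - D 0"
+	dmsetup suspend integ
+	dmsetup resume integ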
+
+
+The layout of the formatted block device:
+* reserved sectors (they are not used by this target; they can be used
+  for storing LUKS metadata or for other purposes); the size of the
+  reserved area is specified in the target arguments
+* superblock (4kiB)
+ * magic string - identifies that the device was formatted
+ * version
+ * log2(interleave sectors)
+ * integrity tag size
+ * the number of journal sections
+ * provided data sectors - the number of sectors that this target
+ provides (i.e. the size of the device minus the size of all
+ metadata and padding). The user of this target should not send
+ bios that access data beyond the "provided data sectors" limit.
+ * flags - a flag is set if journal-mac is used
+* journal
+  The journal is divided into sections, each section contains:
+  * metadata area (4kiB), it contains journal entries
+    every journal entry contains:
+    * logical sector (specifies where the data and tag should
+      be written)
+    * last 8 bytes of data
+    * integrity tag (the size is specified in the superblock)
+    every metadata sector ends with
+    * mac (8 bytes); all the macs in the 8 metadata sectors of a
+      section form a 64-byte value that stores the hmac of the
+      sector numbers in the journal section, to protect against
+      the possibility that an attacker tampers with the sector
+      numbers in the journal
+    * commit id
+  * data area (the size is variable; it depends on how many journal
+    entries fit into the metadata area)
+    every sector in the data area contains:
+    * data (504 bytes of data; the last 8 bytes are stored in
+      the journal entry)
+    * commit id
+  To test whether a whole journal section was written correctly, every
+  512-byte sector of the journal ends with an 8-byte commit id. If the
+  commit id matches on all sectors in a journal section, it is assumed
+  that the section was written correctly. If the commit id doesn't
+  match, the section was only partially written and it must not be
+  replayed.
+* one or more runs of interleaved tags and data. Each run contains:
+ * tag area - it contains integrity tags. There is one tag for each
+ sector in the data area
+ * data area - it contains data sectors. The number of data sectors
+ in one run must be a power of two. log2 of this value is stored
+ in the superblock.
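+
+A minimal userspace sketch for step 4 of the first-use procedure -
+reading "provided_data_sectors" from a freshly formatted device. It
+assumes zero reserved sectors, a little-endian host and an illustrative
+device path:
+
+	#include <stdio.h>
+	#include <stdint.h>
+	#include <string.h>
+
+	/* the on-disk superblock layout described above (little-endian) */
+	struct superblock {
+		uint8_t  magic[8];	/* "integrt" followed by a NUL */
+		uint8_t  version;
+		uint8_t  log2_interleave_sectors;
+		uint16_t integrity_tag_size;
+		uint32_t journal_sections;
+		uint64_t provided_data_sectors;
+		uint32_t flags;
+	};
+
+	int main(void)
+	{
+		struct superblock sb;
+		FILE *f = fopen("/dev/sdb", "rb");	/* illustrative */
+		if (!f || fread(&sb, sizeof(sb), 1, f) != 1)
+			return 1;
+		if (memcmp(sb.magic, "integrt", 8))
+			return 1;	/* the device is not formatted */
+		printf("%llu\n", (unsigned long long)sb.provided_data_sectors);
+		return 0;
+	}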
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 982cd0626bc7..5c5ed97c9fda 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -500,4 +500,14 @@ config DM_LOG_WRITES
If unsure, say N.
+config DM_INTEGRITY
+ tristate "Integrity target"
+ depends on BLK_DEV_DM
+ select BLK_DEV_INTEGRITY
+ select DM_BUFIO
+ select CRYPTO
+ select ASYNC_XOR
+	---help---
+	  This device-mapper target emulates a block device that has
+	  additional per-sector tags that can be used for storing
+	  integrity information.
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 2801b2fb452d..39cf2a1b5f90 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -59,6 +59,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
+obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
new file mode 100644
index 000000000000..ea779cca8b45
--- /dev/null
+++ b/drivers/md/dm-integrity.c
@@ -0,0 +1,3085 @@
+/*
+ * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2016-2017 Milan Broz
+ * Copyright (C) 2016-2017 Mikulas Patocka
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/module.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/vmalloc.h>
+#include <linux/sort.h>
+#include <linux/rbtree.h>
+#include <linux/delay.h>
+#include <linux/random.h>
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
+#include <linux/async_tx.h>
+#include "dm-bufio.h"
+
+#define DM_MSG_PREFIX "integrity"
+
+#define DEFAULT_INTERLEAVE_SECTORS 32768
+#define DEFAULT_JOURNAL_SIZE_FACTOR 7
+#define DEFAULT_BUFFER_SECTORS 128
+#define DEFAULT_JOURNAL_WATERMARK 50
+#define DEFAULT_SYNC_MSEC 10000
+#define DEFAULT_MAX_JOURNAL_SECTORS 131072
+#define MIN_INTERLEAVE_SECTORS 3
+#define MAX_INTERLEAVE_SECTORS 31
+#define METADATA_WORKQUEUE_MAX_ACTIVE 16
+
+/*
+ * Warning - DEBUG_PRINT prints security-sensitive data to the log,
+ * so it should not be enabled in the official kernel
+ */
+//#define DEBUG_PRINT
+//#define INTERNAL_VERIFY
+
+/*
+ * On disk structures
+ */
+
+#define SB_MAGIC "integrt"
+#define SB_VERSION 1
+#define SB_SECTORS 8
+
+struct superblock {
+ __u8 magic[8];
+ __u8 version;
+ __u8 log2_interleave_sectors;
+ __u16 integrity_tag_size;
+ __u32 journal_sections;
+ __u64 provided_data_sectors; /* userspace uses this value */
+ __u32 flags;
+};
+
+#define SB_FLAG_HAVE_JOURNAL_MAC 0x1
+
+#define JOURNAL_ENTRY_ROUNDUP 8
+
+typedef __u64 commit_id_t;
+#define JOURNAL_MAC_PER_SECTOR 8
+
+struct journal_entry {
+ union {
+ struct {
+ __u32 sector_lo;
+ __u32 sector_hi;
+ } s;
+ __u64 sector;
+ } u;
+ commit_id_t last_bytes;
+ __u8 tag[0];
+};
+
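+/*
+ * On 32-bit configurations, the two halves of the sector number are
+ * written separately; the write barrier orders the low half before the
+ * high half. The values -1 and -2 in sector_hi mark a journal entry as
+ * unused or in-progress, respectively.
+ */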
+#if BITS_PER_LONG == 64
+#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0)
+#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
+#elif defined(CONFIG_LBDAF)
+#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0)
+#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector)
+#else
+#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0)
+#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo)
+#endif
+#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1))
+#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0)
+#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2))
+#define journal_entry_set_inprogress(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0)
+
+#define JOURNAL_BLOCK_SECTORS 8
+#define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(commit_id_t))
+#define JOURNAL_MAC_SIZE (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS)
+
+struct journal_sector {
+ __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR];
+ __u8 mac[JOURNAL_MAC_PER_SECTOR];
+ commit_id_t commit_id;
+};
+
+#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, tag))
+
+#define METADATA_PADDING_SECTORS 8
+
+#define N_COMMIT_IDS 4
+
+static unsigned char prev_commit_seq(unsigned char seq)
+{
+ return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS;
+}
+
+static unsigned char next_commit_seq(unsigned char seq)
+{
+ return (seq + 1) % N_COMMIT_IDS;
+}
+
+/*
+ * In-memory structures
+ */
+
+struct journal_node {
+ struct rb_node node;
+ sector_t sector;
+};
+
+struct alg_spec {
+ char *alg_string;
+ char *key_string;
+ __u8 *key;
+ unsigned key_size;
+};
+
+struct dm_integrity_c {
+ struct dm_dev *dev;
+ unsigned tag_size;
+ __s8 log2_tag_size;
+ sector_t start;
+ mempool_t *journal_io_mempool;
+ struct dm_io_client *io;
+ struct dm_bufio_client *bufio;
+ struct workqueue_struct *metadata_wq;
+ struct superblock *sb;
+ unsigned journal_pages;
+ struct page_list *journal;
+ struct page_list *journal_io;
+ struct page_list *journal_xor;
+
+ struct crypto_skcipher *journal_crypt;
+ struct scatterlist **journal_scatterlist;
+ struct scatterlist **journal_io_scatterlist;
+ struct skcipher_request **sk_requests;
+
+ struct crypto_shash *journal_mac;
+
+ struct journal_node *journal_tree;
+ struct rb_root journal_tree_root;
+
+ sector_t provided_data_sectors;
+
+ unsigned short journal_entry_size;
+ unsigned char journal_entries_per_sector;
+ unsigned char journal_section_entries;
+ unsigned char journal_section_sectors;
+ unsigned journal_sections;
+ unsigned journal_entries;
+ sector_t device_sectors;
+ unsigned initial_sectors;
+ unsigned metadata_run;
+ __s8 log2_metadata_run;
+ __u8 log2_buffer_sectors;
+
+ unsigned char mode;
+ bool suspending;
+
+ int failed;
+
+ struct crypto_shash *internal_hash;
+
+ /* these variables are locked with endio_wait.lock */
+ struct rb_root in_progress;
+ wait_queue_head_t endio_wait;
+ struct workqueue_struct *wait_wq;
+
+ unsigned char commit_seq;
+ commit_id_t commit_ids[N_COMMIT_IDS];
+
+ unsigned committed_section;
+ unsigned n_committed_sections;
+
+ unsigned uncommitted_section;
+ unsigned n_uncommitted_sections;
+
+ unsigned free_section;
+ unsigned char free_section_entry;
+ unsigned free_sectors;
+
+ unsigned free_sectors_threshold;
+
+ struct workqueue_struct *commit_wq;
+ struct work_struct commit_work;
+
+ struct workqueue_struct *writer_wq;
+ struct work_struct writer_work;
+
+ struct bio_list flush_bio_list;
+
+ unsigned long autocommit_jiffies;
+ struct timer_list autocommit_timer;
+ unsigned autocommit_msec;
+
+ wait_queue_head_t copy_to_journal_wait;
+
+ struct completion crypto_backoff;
+
+ bool journal_uptodate;
+ bool just_formatted;
+
+ struct alg_spec internal_hash_alg;
+ struct alg_spec journal_crypt_alg;
+ struct alg_spec journal_mac_alg;
+};
+
+struct dm_integrity_range {
+ sector_t logical_sector;
+ unsigned n_sectors;
+ struct rb_node node;
+};
+
+struct dm_integrity_io {
+ struct work_struct work;
+
+ struct dm_integrity_c *ic;
+ bool write;
+ bool fua;
+
+ struct dm_integrity_range range;
+
+ sector_t metadata_block;
+ unsigned metadata_offset;
+
+ atomic_t in_flight;
+ int bi_error;
+
+ struct completion *completion;
+
+ struct block_device *orig_bi_bdev;
+ bio_end_io_t *orig_bi_end_io;
+ struct bio_integrity_payload *orig_bi_integrity;
+ struct bvec_iter orig_bi_iter;
+};
+
+struct journal_completion {
+ struct dm_integrity_c *ic;
+ atomic_t in_flight;
+ struct completion comp;
+};
+
+struct journal_io {
+ struct dm_integrity_range range;
+ struct journal_completion *comp;
+};
+
+static struct kmem_cache *journal_io_cache;
+
+#define JOURNAL_IO_MEMPOOL 32
+
+#ifdef DEBUG_PRINT
+#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__)
+static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...)
+{
+ va_list args;
+ va_start(args, msg);
+ vprintk(msg, args);
+ va_end(args);
+ if (len)
+ pr_cont(":");
+ while (len) {
+ pr_cont(" %02x", *bytes);
+ bytes++;
+ len--;
+ }
+ pr_cont("\n");
+}
+#define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__)
+#else
+#define DEBUG_print(x, ...) do { } while (0)
+#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0)
+#endif
+
+/*
+ * DM integrity profile; protection is performed by the layer above (dm-crypt).
+ */
+static struct blk_integrity_profile dm_integrity_profile = {
+ .name = "DM-DIF-EXT-TAG",
+ .generate_fn = NULL,
+ .verify_fn = NULL,
+};
+
+static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map);
+static void integrity_bio_wait(struct work_struct *w);
+static void dm_integrity_dtr(struct dm_target *ti);
+
+static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err)
+{
+ if (!cmpxchg(&ic->failed, 0, err))
+ DMERR("Error on %s: %d", msg, err);
+}
+
+static int dm_integrity_failed(struct dm_integrity_c *ic)
+{
+ return ACCESS_ONCE(ic->failed);
+}
+
+static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i,
+ unsigned j, unsigned char seq)
+{
+	/*
+	 * Xor the number with the section and sector, so that if a piece
+	 * of journal is written at the wrong place, it is detected.
+	 */
+ return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j);
+}
+
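+/*
+ * Split a data sector into the interleave area that contains it and the
+ * offset within that area. For example, with the default interleave of
+ * 32768 sectors, data sector 100000 maps to area 3, offset 1696.
+ */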
+static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector,
+ sector_t *area, sector_t *offset)
+{
+ __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors;
+
+ *area = data_sector >> log2_interleave_sectors;
+ *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1);
+}
+
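+/*
+ * Map (area, offset) to the dm-bufio block that holds the integrity tag
+ * for that data sector, and to the byte offset of the tag within the
+ * block.
+ */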
+static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area,
+ sector_t offset, unsigned *metadata_offset)
+{
+ __u64 ms;
+ unsigned mo;
+
+ ms = area << ic->sb->log2_interleave_sectors;
+ if (likely(ic->log2_metadata_run >= 0))
+ ms += area << ic->log2_metadata_run;
+ else
+ ms += area * ic->metadata_run;
+ ms >>= ic->log2_buffer_sectors;
+
+ if (likely(ic->log2_tag_size >= 0)) {
+ ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size);
+ mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
+ } else {
+ ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors);
+ mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1);
+ }
+ *metadata_offset = mo;
+ return ms;
+}
+
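+/*
+ * Map (area, offset) to a physical sector on the device: skip the
+ * initial sectors (superblock and journal), the n preceding data areas
+ * and the n + 1 metadata runs that precede the data of area n.
+ */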
+static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset)
+{
+ sector_t result;
+
+ result = area << ic->sb->log2_interleave_sectors;
+ if (likely(ic->log2_metadata_run >= 0))
+ result += (area + 1) << ic->log2_metadata_run;
+ else
+ result += (area + 1) * ic->metadata_run;
+
+ result += (sector_t)ic->initial_sectors + offset;
+ return result;
+}
+
+static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr)
+{
+ if (unlikely(*sec_ptr >= ic->journal_sections))
+ *sec_ptr -= ic->journal_sections;
+}
+
+static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags)
+{
+ struct dm_io_request io_req;
+ struct dm_io_region io_loc;
+
+ io_req.bi_op = op;
+ io_req.bi_op_flags = op_flags;
+ io_req.mem.type = DM_IO_KMEM;
+ io_req.mem.ptr.addr = ic->sb;
+ io_req.notify.fn = NULL;
+ io_req.client = ic->io;
+ io_loc.bdev = ic->dev->bdev;
+ io_loc.sector = ic->start;
+ io_loc.count = SB_SECTORS;
+
+ return dm_io(&io_req, 1, &io_loc, NULL);
+}
+
+static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset,
+ bool e, const char *function)
+{
+#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY)
+ unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors;
+
+ if (unlikely(section >= ic->journal_sections) ||
+ unlikely(offset >= limit)) {
+ printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n",
+ function, section, offset, ic->journal_sections, limit);
+ BUG();
+ }
+#endif
+}
+
+static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset,
+ unsigned *pl_index, unsigned *pl_offset)
+{
+ unsigned sector;
+
+ access_journal_check(ic, section, offset, false, "access_journal");
+
+ sector = section * ic->journal_section_sectors + offset;
+
+ *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+ *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+}
+
+static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl,
+ unsigned section, unsigned offset, unsigned *n_sectors)
+{
+ unsigned pl_index, pl_offset;
+ char *va;
+
+ page_list_location(ic, section, offset, &pl_index, &pl_offset);
+
+ if (n_sectors)
+ *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT;
+
+ va = lowmem_page_address(pl[pl_index].page);
+
+ return (struct journal_sector *)(va + pl_offset);
+}
+
+static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset)
+{
+ return access_page_list(ic, ic->journal, section, offset, NULL);
+}
+
+static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n)
+{
+ unsigned rel_sector, offset;
+ struct journal_sector *js;
+
+ access_journal_check(ic, section, n, true, "access_journal_entry");
+
+ rel_sector = n % JOURNAL_BLOCK_SECTORS;
+ offset = n / JOURNAL_BLOCK_SECTORS;
+
+ js = access_journal(ic, section, rel_sector);
+ return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size);
+}
+
+static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n)
+{
+ access_journal_check(ic, section, n, true, "access_journal_data");
+
+ return access_journal(ic, section, n + JOURNAL_BLOCK_SECTORS);
+}
+
+static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE])
+{
+ SHASH_DESC_ON_STACK(desc, ic->journal_mac);
+ int r;
+ unsigned j, size;
+
+ desc->tfm = ic->journal_mac;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ r = crypto_shash_init(desc);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "crypto_shash_init", r);
+ goto err;
+ }
+
+ for (j = 0; j < ic->journal_section_entries; j++) {
+ struct journal_entry *je = access_journal_entry(ic, section, j);
+ r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "crypto_shash_update", r);
+ goto err;
+ }
+ }
+
+ size = crypto_shash_digestsize(ic->journal_mac);
+
+ if (likely(size <= JOURNAL_MAC_SIZE)) {
+ r = crypto_shash_final(desc, result);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "crypto_shash_final", r);
+ goto err;
+ }
+ memset(result + size, 0, JOURNAL_MAC_SIZE - size);
+ } else {
+ __u8 digest[size];
+ r = crypto_shash_final(desc, digest);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, "crypto_shash_final", r);
+ goto err;
+ }
+ memcpy(result, digest, JOURNAL_MAC_SIZE);
+ }
+
+ return;
+err:
+ memset(result, 0, JOURNAL_MAC_SIZE);
+}
+
+static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr)
+{
+ __u8 result[JOURNAL_MAC_SIZE];
+ unsigned j;
+
+ if (!ic->journal_mac)
+ return;
+
+ section_mac(ic, section, result);
+
+ for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) {
+ struct journal_sector *js = access_journal(ic, section, j);
+
+ if (likely(wr))
+ memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR);
+ else {
+ if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR))
+ dm_integrity_io_error(ic, "journal mac", -EILSEQ);
+ }
+ }
+}
+
+static void complete_journal_op(void *context)
+{
+ struct journal_completion *comp = context;
+ BUG_ON(!atomic_read(&comp->in_flight));
+ if (likely(atomic_dec_and_test(&comp->in_flight)))
+ complete(&comp->comp);
+}
+
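+/*
+ * Encrypt or decrypt journal sections by XORing them with the keystream
+ * pages in ic->journal_xor, using the async XOR engine. When encrypting,
+ * each section's MAC is written out just before its pages are XORed.
+ */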
+static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
+ unsigned n_sections, struct journal_completion *comp)
+{
+ struct async_submit_ctl submit;
+ size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT;
+ unsigned pl_index, pl_offset, section_index;
+ struct page_list *source_pl, *target_pl;
+
+ if (likely(encrypt)) {
+ source_pl = ic->journal;
+ target_pl = ic->journal_io;
+ } else {
+ source_pl = ic->journal_io;
+ target_pl = ic->journal;
+ }
+
+ page_list_location(ic, section, 0, &pl_index, &pl_offset);
+
+ atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight);
+
+ init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL);
+
+ section_index = pl_index;
+
+ do {
+ size_t this_step;
+ struct page *src_pages[2];
+ struct page *dst_page;
+
+ while (unlikely(pl_index == section_index)) {
+ unsigned dummy;
+ if (likely(encrypt))
+ rw_section_mac(ic, section, true);
+ section++;
+ n_sections--;
+ if (!n_sections)
+ break;
+ page_list_location(ic, section, 0, &section_index, &dummy);
+ }
+
+ this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset);
+ dst_page = target_pl[pl_index].page;
+ src_pages[0] = source_pl[pl_index].page;
+ src_pages[1] = ic->journal_xor[pl_index].page;
+
+ async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit);
+
+ pl_index++;
+ pl_offset = 0;
+ n_bytes -= this_step;
+ } while (n_bytes);
+
+ BUG_ON(n_sections);
+
+ async_tx_issue_pending_all();
+}
+
+static void complete_journal_encrypt(struct crypto_async_request *req, int err)
+{
+ struct journal_completion *comp = req->data;
+ if (unlikely(err)) {
+ if (likely(err == -EINPROGRESS)) {
+ complete(&comp->ic->crypto_backoff);
+ return;
+ }
+ dm_integrity_io_error(comp->ic, "asynchronous encrypt", err);
+ }
+ complete_journal_op(comp);
+}
+
+static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp)
+{
+ int r;
+ skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ complete_journal_encrypt, comp);
+ if (likely(encrypt))
+ r = crypto_skcipher_encrypt(req);
+ else
+ r = crypto_skcipher_decrypt(req);
+ if (likely(!r))
+ return false;
+ if (likely(r == -EINPROGRESS))
+ return true;
+ if (likely(r == -EBUSY)) {
+ wait_for_completion(&comp->ic->crypto_backoff);
+ reinit_completion(&comp->ic->crypto_backoff);
+ return true;
+ }
+ dm_integrity_io_error(comp->ic, "encrypt", r);
+ return false;
+}
+
+static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
+ unsigned n_sections, struct journal_completion *comp)
+{
+ struct scatterlist **source_sg;
+ struct scatterlist **target_sg;
+
+ atomic_add(2, &comp->in_flight);
+
+ if (likely(encrypt)) {
+ source_sg = ic->journal_scatterlist;
+ target_sg = ic->journal_io_scatterlist;
+ } else {
+ source_sg = ic->journal_io_scatterlist;
+ target_sg = ic->journal_scatterlist;
+ }
+
+ do {
+ struct skcipher_request *req;
+ unsigned ivsize;
+ char *iv;
+
+ if (likely(encrypt))
+ rw_section_mac(ic, section, true);
+
+ req = ic->sk_requests[section];
+ ivsize = crypto_skcipher_ivsize(ic->journal_crypt);
+ iv = req->iv;
+
+ memcpy(iv, iv + ivsize, ivsize);
+
+ req->src = source_sg[section];
+ req->dst = target_sg[section];
+
+ if (unlikely(do_crypt(encrypt, req, comp)))
+ atomic_inc(&comp->in_flight);
+
+ section++;
+ n_sections--;
+ } while (n_sections);
+
+ atomic_dec(&comp->in_flight);
+ complete_journal_op(comp);
+}
+
+static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section,
+ unsigned n_sections, struct journal_completion *comp)
+{
+ if (ic->journal_xor)
+ return xor_journal(ic, encrypt, section, n_sections, comp);
+ else
+ return crypt_journal(ic, encrypt, section, n_sections, comp);
+}
+
+static void complete_journal_io(unsigned long error, void *context)
+{
+ struct journal_completion *comp = context;
+ if (unlikely(error != 0))
+ dm_integrity_io_error(comp->ic, "writing journal", -EIO);
+ complete_journal_op(comp);
+}
+
+static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section,
+ unsigned n_sections, struct journal_completion *comp)
+{
+ struct dm_io_request io_req;
+ struct dm_io_region io_loc;
+ unsigned sector, n_sectors, pl_index, pl_offset;
+ int r;
+
+ if (unlikely(dm_integrity_failed(ic))) {
+ if (comp)
+ complete_journal_io(-1UL, comp);
+ return;
+ }
+
+ sector = section * ic->journal_section_sectors;
+ n_sectors = n_sections * ic->journal_section_sectors;
+
+ pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+ pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+ io_req.bi_op = op;
+ io_req.bi_op_flags = op_flags;
+ io_req.mem.type = DM_IO_PAGE_LIST;
+ if (ic->journal_io)
+ io_req.mem.ptr.pl = &ic->journal_io[pl_index];
+ else
+ io_req.mem.ptr.pl = &ic->journal[pl_index];
+ io_req.mem.offset = pl_offset;
+ if (likely(comp != NULL)) {
+ io_req.notify.fn = complete_journal_io;
+ io_req.notify.context = comp;
+ } else {
+ io_req.notify.fn = NULL;
+ }
+ io_req.client = ic->io;
+ io_loc.bdev = ic->dev->bdev;
+ io_loc.sector = ic->start + SB_SECTORS + sector;
+ io_loc.count = n_sectors;
+
+ r = dm_io(&io_req, 1, &io_loc, NULL);
+ if (unlikely(r)) {
+ dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r);
+ if (comp) {
+ WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
+ complete_journal_io(-1UL, comp);
+ }
+ }
+}
+
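+/*
+ * Write a range of committed journal sections to disk, encrypting and/or
+ * MACing them first if configured, and splitting the write in two when
+ * the range wraps around the end of the journal.
+ */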
+static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections)
+{
+ struct journal_completion io_comp;
+ struct journal_completion crypt_comp_1;
+ struct journal_completion crypt_comp_2;
+ unsigned i;
+
+ io_comp.ic = ic;
+ io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp);
+
+ if (commit_start + commit_sections <= ic->journal_sections) {
+ io_comp.in_flight = (atomic_t)ATOMIC_INIT(1);
+ if (ic->journal_io) {
+ crypt_comp_1.ic = ic;
+ crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+ crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
+ encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1);
+ wait_for_completion_io(&crypt_comp_1.comp);
+ } else {
+ for (i = 0; i < commit_sections; i++)
+ rw_section_mac(ic, commit_start + i, true);
+ }
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp);
+ } else {
+ unsigned to_end;
+ io_comp.in_flight = (atomic_t)ATOMIC_INIT(2);
+ to_end = ic->journal_sections - commit_start;
+ if (ic->journal_io) {
+ crypt_comp_1.ic = ic;
+ crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+ crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
+ encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1);
+ if (try_wait_for_completion(&crypt_comp_1.comp)) {
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
+ crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp);
+ crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0);
+ encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1);
+ wait_for_completion_io(&crypt_comp_1.comp);
+ } else {
+ crypt_comp_2.ic = ic;
+ crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp);
+ crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0);
+ encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2);
+ wait_for_completion_io(&crypt_comp_1.comp);
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
+ wait_for_completion_io(&crypt_comp_2.comp);
+ }
+ } else {
+ for (i = 0; i < to_end; i++)
+ rw_section_mac(ic, commit_start + i, true);
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp);
+ for (i = 0; i < commit_sections - to_end; i++)
+ rw_section_mac(ic, i, true);
+ }
+ rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp);
+ }
+
+ wait_for_completion_io(&io_comp.comp);
+}
+
+static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset,
+ unsigned n_sectors, sector_t target, io_notify_fn fn, void *data)
+{
+ struct dm_io_request io_req;
+ struct dm_io_region io_loc;
+ int r;
+ unsigned sector, pl_index, pl_offset;
+
+ if (unlikely(dm_integrity_failed(ic))) {
+ fn(-1UL, data);
+ return;
+ }
+
+ sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset;
+
+ pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT);
+ pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1);
+
+ io_req.bi_op = REQ_OP_WRITE;
+ io_req.bi_op_flags = 0;
+ io_req.mem.type = DM_IO_PAGE_LIST;
+ io_req.mem.ptr.pl = &ic->journal[pl_index];
+ io_req.mem.offset = pl_offset;
+ io_req.notify.fn = fn;
+ io_req.notify.context = data;
+ io_req.client = ic->io;
+ io_loc.bdev = ic->dev->bdev;
+ io_loc.sector = ic->start + target;
+ io_loc.count = n_sectors;
+
+ r = dm_io(&io_req, 1, &io_loc, NULL);
+ if (unlikely(r)) {
+ WARN_ONCE(1, "asynchronous dm_io failed: %d", r);
+ fn(-1UL, data);
+ }
+}
+
+static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+{
+ struct rb_node **n = &ic->in_progress.rb_node;
+ struct rb_node *parent;
+
+ parent = NULL;
+
+ while (*n) {
+ struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node);
+
+ parent = *n;
+ if (new_range->logical_sector + new_range->n_s