diff options
Diffstat (limited to 'drivers/md/dm-integrity.c')
-rw-r--r-- | drivers/md/dm-integrity.c | 3238 |
1 files changed, 3238 insertions, 0 deletions
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c new file mode 100644 index 000000000000..c7f7c8d76576 --- /dev/null +++ b/drivers/md/dm-integrity.c @@ -0,0 +1,3238 @@ +/* + * Copyright (C) 2016-2017 Red Hat, Inc. All rights reserved. + * Copyright (C) 2016-2017 Milan Broz + * Copyright (C) 2016-2017 Mikulas Patocka + * + * This file is released under the GPL. + */ + +#include <linux/module.h> +#include <linux/device-mapper.h> +#include <linux/dm-io.h> +#include <linux/vmalloc.h> +#include <linux/sort.h> +#include <linux/rbtree.h> +#include <linux/delay.h> +#include <linux/random.h> +#include <crypto/hash.h> +#include <crypto/skcipher.h> +#include <linux/async_tx.h> +#include "dm-bufio.h" + +#define DM_MSG_PREFIX "integrity" + +#define DEFAULT_INTERLEAVE_SECTORS 32768 +#define DEFAULT_JOURNAL_SIZE_FACTOR 7 +#define DEFAULT_BUFFER_SECTORS 128 +#define DEFAULT_JOURNAL_WATERMARK 50 +#define DEFAULT_SYNC_MSEC 10000 +#define DEFAULT_MAX_JOURNAL_SECTORS 131072 +#define MIN_LOG2_INTERLEAVE_SECTORS 3 +#define MAX_LOG2_INTERLEAVE_SECTORS 31 +#define METADATA_WORKQUEUE_MAX_ACTIVE 16 + +/* + * Warning - DEBUG_PRINT prints security-sensitive data to the log, + * so it should not be enabled in the official kernel + */ +//#define DEBUG_PRINT +//#define INTERNAL_VERIFY + +/* + * On disk structures + */ + +#define SB_MAGIC "integrt" +#define SB_VERSION 1 +#define SB_SECTORS 8 +#define MAX_SECTORS_PER_BLOCK 8 + +struct superblock { + __u8 magic[8]; + __u8 version; + __u8 log2_interleave_sectors; + __u16 integrity_tag_size; + __u32 journal_sections; + __u64 provided_data_sectors; /* userspace uses this value */ + __u32 flags; + __u8 log2_sectors_per_block; +}; + +#define SB_FLAG_HAVE_JOURNAL_MAC 0x1 + +#define JOURNAL_ENTRY_ROUNDUP 8 + +typedef __u64 commit_id_t; +#define JOURNAL_MAC_PER_SECTOR 8 + +struct journal_entry { + union { + struct { + __u32 sector_lo; + __u32 sector_hi; + } s; + __u64 sector; + } u; + commit_id_t last_bytes[0]; + /* __u8 tag[0]; */ +}; + +#define journal_entry_tag(ic, je) ((__u8 *)&(je)->last_bytes[(ic)->sectors_per_block]) + +#if BITS_PER_LONG == 64 +#define journal_entry_set_sector(je, x) do { smp_wmb(); ACCESS_ONCE((je)->u.sector) = cpu_to_le64(x); } while (0) +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) +#elif defined(CONFIG_LBDAF) +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32((x) >> 32); } while (0) +#define journal_entry_get_sector(je) le64_to_cpu((je)->u.sector) +#else +#define journal_entry_set_sector(je, x) do { (je)->u.s.sector_lo = cpu_to_le32(x); smp_wmb(); ACCESS_ONCE((je)->u.s.sector_hi) = cpu_to_le32(0); } while (0) +#define journal_entry_get_sector(je) le32_to_cpu((je)->u.s.sector_lo) +#endif +#define journal_entry_is_unused(je) ((je)->u.s.sector_hi == cpu_to_le32(-1)) +#define journal_entry_set_unused(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-1)); } while (0) +#define journal_entry_is_inprogress(je) ((je)->u.s.sector_hi == cpu_to_le32(-2)) +#define journal_entry_set_inprogress(je) do { ((je)->u.s.sector_hi = cpu_to_le32(-2)); } while (0) + +#define JOURNAL_BLOCK_SECTORS 8 +#define JOURNAL_SECTOR_DATA ((1 << SECTOR_SHIFT) - sizeof(commit_id_t)) +#define JOURNAL_MAC_SIZE (JOURNAL_MAC_PER_SECTOR * JOURNAL_BLOCK_SECTORS) + +struct journal_sector { + __u8 entries[JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR]; + __u8 mac[JOURNAL_MAC_PER_SECTOR]; + commit_id_t commit_id; +}; + +#define MAX_TAG_SIZE (JOURNAL_SECTOR_DATA - JOURNAL_MAC_PER_SECTOR - offsetof(struct journal_entry, last_bytes[MAX_SECTORS_PER_BLOCK])) + +#define METADATA_PADDING_SECTORS 8 + +#define N_COMMIT_IDS 4 + +static unsigned char prev_commit_seq(unsigned char seq) +{ + return (seq + N_COMMIT_IDS - 1) % N_COMMIT_IDS; +} + +static unsigned char next_commit_seq(unsigned char seq) +{ + return (seq + 1) % N_COMMIT_IDS; +} + +/* + * In-memory structures + */ + +struct journal_node { + struct rb_node node; + sector_t sector; +}; + +struct alg_spec { + char *alg_string; + char *key_string; + __u8 *key; + unsigned key_size; +}; + +struct dm_integrity_c { + struct dm_dev *dev; + unsigned tag_size; + __s8 log2_tag_size; + sector_t start; + mempool_t *journal_io_mempool; + struct dm_io_client *io; + struct dm_bufio_client *bufio; + struct workqueue_struct *metadata_wq; + struct superblock *sb; + unsigned journal_pages; + struct page_list *journal; + struct page_list *journal_io; + struct page_list *journal_xor; + + struct crypto_skcipher *journal_crypt; + struct scatterlist **journal_scatterlist; + struct scatterlist **journal_io_scatterlist; + struct skcipher_request **sk_requests; + + struct crypto_shash *journal_mac; + + struct journal_node *journal_tree; + struct rb_root journal_tree_root; + + sector_t provided_data_sectors; + + unsigned short journal_entry_size; + unsigned char journal_entries_per_sector; + unsigned char journal_section_entries; + unsigned short journal_section_sectors; + unsigned journal_sections; + unsigned journal_entries; + sector_t device_sectors; + unsigned initial_sectors; + unsigned metadata_run; + __s8 log2_metadata_run; + __u8 log2_buffer_sectors; + __u8 sectors_per_block; + + unsigned char mode; + bool suspending; + + int failed; + + struct crypto_shash *internal_hash; + + /* these variables are locked with endio_wait.lock */ + struct rb_root in_progress; + wait_queue_head_t endio_wait; + struct workqueue_struct *wait_wq; + + unsigned char commit_seq; + commit_id_t commit_ids[N_COMMIT_IDS]; + + unsigned committed_section; + unsigned n_committed_sections; + + unsigned uncommitted_section; + unsigned n_uncommitted_sections; + + unsigned free_section; + unsigned char free_section_entry; + unsigned free_sectors; + + unsigned free_sectors_threshold; + + struct workqueue_struct *commit_wq; + struct work_struct commit_work; + + struct workqueue_struct *writer_wq; + struct work_struct writer_work; + + struct bio_list flush_bio_list; + + unsigned long autocommit_jiffies; + struct timer_list autocommit_timer; + unsigned autocommit_msec; + + wait_queue_head_t copy_to_journal_wait; + + struct completion crypto_backoff; + + bool journal_uptodate; + bool just_formatted; + + struct alg_spec internal_hash_alg; + struct alg_spec journal_crypt_alg; + struct alg_spec journal_mac_alg; +}; + +struct dm_integrity_range { + sector_t logical_sector; + unsigned n_sectors; + struct rb_node node; +}; + +struct dm_integrity_io { + struct work_struct work; + + struct dm_integrity_c *ic; + bool write; + bool fua; + + struct dm_integrity_range range; + + sector_t metadata_block; + unsigned metadata_offset; + + atomic_t in_flight; + int bi_error; + + struct completion *completion; + + struct block_device *orig_bi_bdev; + bio_end_io_t *orig_bi_end_io; + struct bio_integrity_payload *orig_bi_integrity; + struct bvec_iter orig_bi_iter; +}; + +struct journal_completion { + struct dm_integrity_c *ic; + atomic_t in_flight; + struct completion comp; +}; + +struct journal_io { + struct dm_integrity_range range; + struct journal_completion *comp; +}; + +static struct kmem_cache *journal_io_cache; + +#define JOURNAL_IO_MEMPOOL 32 + +#ifdef DEBUG_PRINT +#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__) +static void __DEBUG_bytes(__u8 *bytes, size_t len, const char *msg, ...) +{ + va_list args; + va_start(args, msg); + vprintk(msg, args); + va_end(args); + if (len) + pr_cont(":"); + while (len) { + pr_cont(" %02x", *bytes); + bytes++; + len--; + } + pr_cont("\n"); +} +#define DEBUG_bytes(bytes, len, msg, ...) __DEBUG_bytes(bytes, len, KERN_DEBUG msg, ##__VA_ARGS__) +#else +#define DEBUG_print(x, ...) do { } while (0) +#define DEBUG_bytes(bytes, len, msg, ...) do { } while (0) +#endif + +/* + * DM Integrity profile, protection is performed layer above (dm-crypt) + */ +static struct blk_integrity_profile dm_integrity_profile = { + .name = "DM-DIF-EXT-TAG", + .generate_fn = NULL, + .verify_fn = NULL, +}; + +static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map); +static void integrity_bio_wait(struct work_struct *w); +static void dm_integrity_dtr(struct dm_target *ti); + +static void dm_integrity_io_error(struct dm_integrity_c *ic, const char *msg, int err) +{ + if (!cmpxchg(&ic->failed, 0, err)) + DMERR("Error on %s: %d", msg, err); +} + +static int dm_integrity_failed(struct dm_integrity_c *ic) +{ + return ACCESS_ONCE(ic->failed); +} + +static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, + unsigned j, unsigned char seq) +{ + /* + * Xor the number with section and sector, so that if a piece of + * journal is written at wrong place, it is detected. + */ + return ic->commit_ids[seq] ^ cpu_to_le64(((__u64)i << 32) ^ j); +} + +static void get_area_and_offset(struct dm_integrity_c *ic, sector_t data_sector, + sector_t *area, sector_t *offset) +{ + __u8 log2_interleave_sectors = ic->sb->log2_interleave_sectors; + + *area = data_sector >> log2_interleave_sectors; + *offset = (unsigned)data_sector & ((1U << log2_interleave_sectors) - 1); +} + +#define sector_to_block(ic, n) \ +do { \ + BUG_ON((n) & (unsigned)((ic)->sectors_per_block - 1)); \ + (n) >>= (ic)->sb->log2_sectors_per_block; \ +} while (0) + +static __u64 get_metadata_sector_and_offset(struct dm_integrity_c *ic, sector_t area, + sector_t offset, unsigned *metadata_offset) +{ + __u64 ms; + unsigned mo; + + ms = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + ms += area << ic->log2_metadata_run; + else + ms += area * ic->metadata_run; + ms >>= ic->log2_buffer_sectors; + + sector_to_block(ic, offset); + + if (likely(ic->log2_tag_size >= 0)) { + ms += offset >> (SECTOR_SHIFT + ic->log2_buffer_sectors - ic->log2_tag_size); + mo = (offset << ic->log2_tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } else { + ms += (__u64)offset * ic->tag_size >> (SECTOR_SHIFT + ic->log2_buffer_sectors); + mo = (offset * ic->tag_size) & ((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - 1); + } + *metadata_offset = mo; + return ms; +} + +static sector_t get_data_sector(struct dm_integrity_c *ic, sector_t area, sector_t offset) +{ + sector_t result; + + result = area << ic->sb->log2_interleave_sectors; + if (likely(ic->log2_metadata_run >= 0)) + result += (area + 1) << ic->log2_metadata_run; + else + result += (area + 1) * ic->metadata_run; + + result += (sector_t)ic->initial_sectors + offset; + return result; +} + +static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) +{ + if (unlikely(*sec_ptr >= ic->journal_sections)) + *sec_ptr -= ic->journal_sections; +} + +static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + + io_req.bi_op = op; + io_req.bi_op_flags = op_flags; + io_req.mem.type = DM_IO_KMEM; + io_req.mem.ptr.addr = ic->sb; + io_req.notify.fn = NULL; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start; + io_loc.count = SB_SECTORS; + + return dm_io(&io_req, 1, &io_loc, NULL); +} + +static void access_journal_check(struct dm_integrity_c *ic, unsigned section, unsigned offset, + bool e, const char *function) +{ +#if defined(CONFIG_DM_DEBUG) || defined(INTERNAL_VERIFY) + unsigned limit = e ? ic->journal_section_entries : ic->journal_section_sectors; + + if (unlikely(section >= ic->journal_sections) || + unlikely(offset >= limit)) { + printk(KERN_CRIT "%s: invalid access at (%u,%u), limit (%u,%u)\n", + function, section, offset, ic->journal_sections, limit); + BUG(); + } +#endif +} + +static void page_list_location(struct dm_integrity_c *ic, unsigned section, unsigned offset, + unsigned *pl_index, unsigned *pl_offset) +{ + unsigned sector; + + access_journal_check(ic, section, offset, false, "page_list_location"); + + sector = section * ic->journal_section_sectors + offset; + + *pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + *pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); +} + +static struct journal_sector *access_page_list(struct dm_integrity_c *ic, struct page_list *pl, + unsigned section, unsigned offset, unsigned *n_sectors) +{ + unsigned pl_index, pl_offset; + char *va; + + page_list_location(ic, section, offset, &pl_index, &pl_offset); + + if (n_sectors) + *n_sectors = (PAGE_SIZE - pl_offset) >> SECTOR_SHIFT; + + va = lowmem_page_address(pl[pl_index].page); + + return (struct journal_sector *)(va + pl_offset); +} + +static struct journal_sector *access_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset) +{ + return access_page_list(ic, ic->journal, section, offset, NULL); +} + +static struct journal_entry *access_journal_entry(struct dm_integrity_c *ic, unsigned section, unsigned n) +{ + unsigned rel_sector, offset; + struct journal_sector *js; + + access_journal_check(ic, section, n, true, "access_journal_entry"); + + rel_sector = n % JOURNAL_BLOCK_SECTORS; + offset = n / JOURNAL_BLOCK_SECTORS; + + js = access_journal(ic, section, rel_sector); + return (struct journal_entry *)((char *)js + offset * ic->journal_entry_size); +} + +static struct journal_sector *access_journal_data(struct dm_integrity_c *ic, unsigned section, unsigned n) +{ + n <<= ic->sb->log2_sectors_per_block; + + n += JOURNAL_BLOCK_SECTORS; + + access_journal_check(ic, section, n, false, "access_journal_data"); + + return access_journal(ic, section, n); +} + +static void section_mac(struct dm_integrity_c *ic, unsigned section, __u8 result[JOURNAL_MAC_SIZE]) +{ + SHASH_DESC_ON_STACK(desc, ic->journal_mac); + int r; + unsigned j, size; + + desc->tfm = ic->journal_mac; + desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + r = crypto_shash_init(desc); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_init", r); + goto err; + } + + for (j = 0; j < ic->journal_section_entries; j++) { + struct journal_entry *je = access_journal_entry(ic, section, j); + r = crypto_shash_update(desc, (__u8 *)&je->u.sector, sizeof je->u.sector); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_update", r); + goto err; + } + } + + size = crypto_shash_digestsize(ic->journal_mac); + + if (likely(size <= JOURNAL_MAC_SIZE)) { + r = crypto_shash_final(desc, result); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memset(result + size, 0, JOURNAL_MAC_SIZE - size); + } else { + __u8 digest[size]; + r = crypto_shash_final(desc, digest); + if (unlikely(r)) { + dm_integrity_io_error(ic, "crypto_shash_final", r); + goto err; + } + memcpy(result, digest, JOURNAL_MAC_SIZE); + } + + return; +err: + memset(result, 0, JOURNAL_MAC_SIZE); +} + +static void rw_section_mac(struct dm_integrity_c *ic, unsigned section, bool wr) +{ + __u8 result[JOURNAL_MAC_SIZE]; + unsigned j; + + if (!ic->journal_mac) + return; + + section_mac(ic, section, result); + + for (j = 0; j < JOURNAL_BLOCK_SECTORS; j++) { + struct journal_sector *js = access_journal(ic, section, j); + + if (likely(wr)) + memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); + else { + if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) + dm_integrity_io_error(ic, "journal mac", -EILSEQ); + } + } +} + +static void complete_journal_op(void *context) +{ + struct journal_completion *comp = context; + BUG_ON(!atomic_read(&comp->in_flight)); + if (likely(atomic_dec_and_test(&comp->in_flight))) + complete(&comp->comp); +} + +static void xor_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct async_submit_ctl submit; + size_t n_bytes = (size_t)(n_sections * ic->journal_section_sectors) << SECTOR_SHIFT; + unsigned pl_index, pl_offset, section_index; + struct page_list *source_pl, *target_pl; + + if (likely(encrypt)) { + source_pl = ic->journal; + target_pl = ic->journal_io; + } else { + source_pl = ic->journal_io; + target_pl = ic->journal; + } + + page_list_location(ic, section, 0, &pl_index, &pl_offset); + + atomic_add(roundup(pl_offset + n_bytes, PAGE_SIZE) >> PAGE_SHIFT, &comp->in_flight); + + init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL, complete_journal_op, comp, NULL); + + section_index = pl_index; + + do { + size_t this_step; + struct page *src_pages[2]; + struct page *dst_page; + + while (unlikely(pl_index == section_index)) { + unsigned dummy; + if (likely(encrypt)) + rw_section_mac(ic, section, true); + section++; + n_sections--; + if (!n_sections) + break; + page_list_location(ic, section, 0, §ion_index, &dummy); + } + + this_step = min(n_bytes, (size_t)PAGE_SIZE - pl_offset); + dst_page = target_pl[pl_index].page; + src_pages[0] = source_pl[pl_index].page; + src_pages[1] = ic->journal_xor[pl_index].page; + + async_xor(dst_page, src_pages, pl_offset, 2, this_step, &submit); + + pl_index++; + pl_offset = 0; + n_bytes -= this_step; + } while (n_bytes); + + BUG_ON(n_sections); + + async_tx_issue_pending_all(); +} + +static void complete_journal_encrypt(struct crypto_async_request *req, int err) +{ + struct journal_completion *comp = req->data; + if (unlikely(err)) { + if (likely(err == -EINPROGRESS)) { + complete(&comp->ic->crypto_backoff); + return; + } + dm_integrity_io_error(comp->ic, "asynchronous encrypt", err); + } + complete_journal_op(comp); +} + +static bool do_crypt(bool encrypt, struct skcipher_request *req, struct journal_completion *comp) +{ + int r; + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + complete_journal_encrypt, comp); + if (likely(encrypt)) + r = crypto_skcipher_encrypt(req); + else + r = crypto_skcipher_decrypt(req); + if (likely(!r)) + return false; + if (likely(r == -EINPROGRESS)) + return true; + if (likely(r == -EBUSY)) { + wait_for_completion(&comp->ic->crypto_backoff); + reinit_completion(&comp->ic->crypto_backoff); + return true; + } + dm_integrity_io_error(comp->ic, "encrypt", r); + return false; +} + +static void crypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct scatterlist **source_sg; + struct scatterlist **target_sg; + + atomic_add(2, &comp->in_flight); + + if (likely(encrypt)) { + source_sg = ic->journal_scatterlist; + target_sg = ic->journal_io_scatterlist; + } else { + source_sg = ic->journal_io_scatterlist; + target_sg = ic->journal_scatterlist; + } + + do { + struct skcipher_request *req; + unsigned ivsize; + char *iv; + + if (likely(encrypt)) + rw_section_mac(ic, section, true); + + req = ic->sk_requests[section]; + ivsize = crypto_skcipher_ivsize(ic->journal_crypt); + iv = req->iv; + + memcpy(iv, iv + ivsize, ivsize); + + req->src = source_sg[section]; + req->dst = target_sg[section]; + + if (unlikely(do_crypt(encrypt, req, comp))) + atomic_inc(&comp->in_flight); + + section++; + n_sections--; + } while (n_sections); + + atomic_dec(&comp->in_flight); + complete_journal_op(comp); +} + +static void encrypt_journal(struct dm_integrity_c *ic, bool encrypt, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + if (ic->journal_xor) + return xor_journal(ic, encrypt, section, n_sections, comp); + else + return crypt_journal(ic, encrypt, section, n_sections, comp); +} + +static void complete_journal_io(unsigned long error, void *context) +{ + struct journal_completion *comp = context; + if (unlikely(error != 0)) + dm_integrity_io_error(comp->ic, "writing journal", -EIO); + complete_journal_op(comp); +} + +static void rw_journal(struct dm_integrity_c *ic, int op, int op_flags, unsigned section, + unsigned n_sections, struct journal_completion *comp) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + unsigned sector, n_sectors, pl_index, pl_offset; + int r; + + if (unlikely(dm_integrity_failed(ic))) { + if (comp) + complete_journal_io(-1UL, comp); + return; + } + + sector = section * ic->journal_section_sectors; + n_sectors = n_sections * ic->journal_section_sectors; + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_op = op; + io_req.bi_op_flags = op_flags; + io_req.mem.type = DM_IO_PAGE_LIST; + if (ic->journal_io) + io_req.mem.ptr.pl = &ic->journal_io[pl_index]; + else + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + if (likely(comp != NULL)) { + io_req.notify.fn = complete_journal_io; + io_req.notify.context = comp; + } else { + io_req.notify.fn = NULL; + } + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start + SB_SECTORS + sector; + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + dm_integrity_io_error(ic, op == REQ_OP_READ ? "reading journal" : "writing journal", r); + if (comp) { + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + complete_journal_io(-1UL, comp); + } + } +} + +static void write_journal(struct dm_integrity_c *ic, unsigned commit_start, unsigned commit_sections) +{ + struct journal_completion io_comp; + struct journal_completion crypt_comp_1; + struct journal_completion crypt_comp_2; + unsigned i; + + io_comp.ic = ic; + io_comp.comp = COMPLETION_INITIALIZER_ONSTACK(io_comp.comp); + + if (commit_start + commit_sections <= ic->journal_sections) { + io_comp.in_flight = (atomic_t)ATOMIC_INIT(1); + if (ic->journal_io) { + crypt_comp_1.ic = ic; + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, commit_sections, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + for (i = 0; i < commit_sections; i++) + rw_section_mac(ic, commit_start + i, true); + } + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, commit_sections, &io_comp); + } else { + unsigned to_end; + io_comp.in_flight = (atomic_t)ATOMIC_INIT(2); + to_end = ic->journal_sections - commit_start; + if (ic->journal_io) { + crypt_comp_1.ic = ic; + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, commit_start, to_end, &crypt_comp_1); + if (try_wait_for_completion(&crypt_comp_1.comp)) { + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + crypt_comp_1.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_1.comp); + crypt_comp_1.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_1); + wait_for_completion_io(&crypt_comp_1.comp); + } else { + crypt_comp_2.ic = ic; + crypt_comp_2.comp = COMPLETION_INITIALIZER_ONSTACK(crypt_comp_2.comp); + crypt_comp_2.in_flight = (atomic_t)ATOMIC_INIT(0); + encrypt_journal(ic, true, 0, commit_sections - to_end, &crypt_comp_2); + wait_for_completion_io(&crypt_comp_1.comp); + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + wait_for_completion_io(&crypt_comp_2.comp); + } + } else { + for (i = 0; i < to_end; i++) + rw_section_mac(ic, commit_start + i, true); + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, commit_start, to_end, &io_comp); + for (i = 0; i < commit_sections - to_end; i++) + rw_section_mac(ic, i, true); + } + rw_journal(ic, REQ_OP_WRITE, REQ_FUA, 0, commit_sections - to_end, &io_comp); + } + + wait_for_completion_io(&io_comp.comp); +} + +static void copy_from_journal(struct dm_integrity_c *ic, unsigned section, unsigned offset, + unsigned n_sectors, sector_t target, io_notify_fn fn, void *data) +{ + struct dm_io_request io_req; + struct dm_io_region io_loc; + int r; + unsigned sector, pl_index, pl_offset; + + BUG_ON((target | n_sectors | offset) & (unsigned)(ic->sectors_per_block - 1)); + + if (unlikely(dm_integrity_failed(ic))) { + fn(-1UL, data); + return; + } + + sector = section * ic->journal_section_sectors + JOURNAL_BLOCK_SECTORS + offset; + + pl_index = sector >> (PAGE_SHIFT - SECTOR_SHIFT); + pl_offset = (sector << SECTOR_SHIFT) & (PAGE_SIZE - 1); + + io_req.bi_op = REQ_OP_WRITE; + io_req.bi_op_flags = 0; + io_req.mem.type = DM_IO_PAGE_LIST; + io_req.mem.ptr.pl = &ic->journal[pl_index]; + io_req.mem.offset = pl_offset; + io_req.notify.fn = fn; + io_req.notify.context = data; + io_req.client = ic->io; + io_loc.bdev = ic->dev->bdev; + io_loc.sector = ic->start + target; + io_loc.count = n_sectors; + + r = dm_io(&io_req, 1, &io_loc, NULL); + if (unlikely(r)) { + WARN_ONCE(1, "asynchronous dm_io failed: %d", r); + fn(-1UL, data); + } +} + +static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range) +{ + struct rb_node **n = &ic->in_progress.rb_node; + struct rb_node *parent; + + BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1)); + + parent = NULL; + + while (*n) { + struct dm_integrity_range *range = container_of(*n, struct dm_integrity_range, node); + + parent = *n; + if (new_range->logical_sector + new_range->n_sectors <= range->logical_sector) { + n = &range->node.rb_left; + } else if (new_range->logical_sector >= range->logical_sector + range->n_sectors) { + n = &range->node.rb_right; + } else { + return false; + } + } + + rb_link_node(&new_range->node, parent, n); + rb_insert_color(&new_range->node, &ic->in_progress); + + return true; +} + +static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + rb_erase(&range->node, &ic->in_progress); + wake_up_locked(&ic->endio_wait); +} + +static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range) +{ + unsigned long flags; + + spin_lock_irqsave(&ic->endio_wait.lock, flags); + remove_range_unlocked(ic, range); + spin_unlock_irqrestore(&ic->endio_wait.lock, flags); +} + +static void init_journal_node(struct journal_node *node) +{ + RB_CLEAR_NODE(&node->node); + node->sector = (sector_t)-1; +} + +static void add_journal_node(struct dm_integrity_c *ic, struct journal_node *node, sector_t sector) +{ + struct rb_node **link; + struct rb_node *parent; + + node->sector = sector; + BUG_ON(!RB_EMPTY_NODE(&node->node)); + + link = &ic->journal_tree_root.rb_node; + parent = NULL; + + while (*link) { + struct journal_node *j; + parent = *link; + j = container_of(parent, struct journal_node, node); + if (sector < j->sector) + link = &j->node.rb_left; + else + link = &j->node.rb_right; + } + + rb_link_node(&node->node, parent, link); + rb_insert_color(&node->node, &ic->journal_tree_root); +} + +static void remove_journal_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + BUG_ON(RB_EMPTY_NODE(&node->node)); + rb_erase(&node->node, &ic->journal_tree_root); + init_journal_node(node); +} + +#define NOT_FOUND (-1U) + +static unsigned find_journal_node(struct dm_integrity_c *ic, sector_t sector, sector_t *next_sector) +{ + struct rb_node *n = ic->journal_tree_root.rb_node; + unsigned found = NOT_FOUND; + *next_sector = (sector_t)-1; + while (n) { + struct journal_node *j = container_of(n, struct journal_node, node); + if (sector == j->sector) { + found = j - ic->journal_tree; + } + if (sector < j->sector) { + *next_sector = j->sector; + n = j->node.rb_left; + } else { + n = j->node.rb_right; + } + } + + return found; +} + +static bool test_journal_node(struct dm_integrity_c *ic, unsigned pos, sector_t sector) +{ + struct journal_node *node, *next_node; + struct rb_node *next; + + if (unlikely(pos >= ic->journal_entries)) + return false; + node = &ic->journal_tree[pos]; + if (unlikely(RB_EMPTY_NODE(&node->node))) + return false; + if (unlikely(node->sector != sector)) + return false; + + next = rb_next(&node->node); + if (unlikely(!next)) + return true; + + next_node = container_of(next, struct journal_node, node); + return next_node->sector != sector; +} + +static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_node *node) +{ + struct rb_node *next; + struct journal_node *next_node; + unsigned next_section; + + BUG_ON(RB_EMPTY_NODE(&node->node)); + + next = rb_next(&node->node); + if (unlikely(!next)) + return false; + + next_node = container_of(next, struct journal_node, node); + + if (next_node->sector != node->sector) + return false; + + next_section = (unsigned)(next_node - ic->journal_tree) / ic->journal_section_entries; + if (next_section >= ic->committed_section && + next_section < ic->committed_section + ic->n_committed_sections) + return true; + if (next_section + ic->journal_sections < ic->committed_section + ic->n_committed_sections) + return true; + + return false; +} + +#define TAG_READ 0 +#define TAG_WRITE 1 +#define TAG_CMP 2 + +static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, + unsigned *metadata_offset, unsigned total_size, int op) +{ + do { + unsigned char *data, *dp; + struct dm_buffer *b; + unsigned to_copy; + int r; + + r = dm_integrity_failed(ic); + if (unlikely(r)) + return r; + + data = dm_bufio_read(ic->bufio, *metadata_block, &b); + if (unlikely(IS_ERR(data))) + return PTR_ERR(data); + + to_copy = min((1U << SECTOR_SHIFT << ic->log2_buffer_sectors) - *metadata_offset, total_size); + dp = data + *metadata_offset; + if (op == TAG_READ) { + memcpy(tag, dp, to_copy); + } else if (op == TAG_WRITE) { + memcpy(dp, tag, to_copy); + dm_bufio_mark_buffer_dirty(b); + } else { + /* e.g.: op == TAG_CMP */ + if (unlikely(memcmp(dp, tag, to_copy))) { + unsigned i; + + for (i = 0; i < to_copy; i++) { + if (dp[i] != tag[i]) + break; + total_size--; + } + dm_bufio_release(b); + return total_size; + } + } + dm_bufio_release(b); + + tag += to_copy; + *metadata_offset += to_copy; + if (unlikely(*metadata_offset == 1U << SECTOR_SHIFT << ic->log2_buffer_sectors)) { + (*metadata_block)++; + *metadata_offset = 0; + } + total_size -= to_copy; + } while (unlikely(total_size)); + + return 0; +} + +static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) +{ + int r; + r = dm_bufio_write_dirty_buffers(ic->bufio); + if (unlikely(r)) + dm_integrity_io_error(ic, "writing tags", r); +} + +static void sleep_on_endio_wait(struct dm_integrity_c *ic) +{ + DECLARE_WAITQUEUE(wait, current); + __add_wait_queue(&ic->endio_wait, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&ic->endio_wait.lock); + io_schedule(); + spin_lock_irq(&ic->endio_wait.lock); + __remove_wait_queue(&ic->endio_wait, &wait); +} + +static void autocommit_fn(unsigned long data) +{ + struct dm_integrity_c *ic = (struct dm_integrity_c *)data; + + if (likely(!dm_integrity_failed(ic))) + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void schedule_autocommit(struct dm_integrity_c *ic) +{ + if (!timer_pending(&ic->autocommit_timer)) + mod_timer(&ic->autocommit_timer, jiffies + ic->autocommit_jiffies); +} + +static void submit_flush_bio(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio; + spin_lock_irq(&ic->endio_wait.lock); + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + bio_list_add(&ic->flush_bio_list, bio); + spin_unlock_irq(&ic->endio_wait.lock); + queue_work(ic->commit_wq, &ic->commit_work); +} + +static void do_endio(struct dm_integrity_c *ic, struct bio *bio) +{ + int r = dm_integrity_failed(ic); + if (unlikely(r) && !bio->bi_error) + bio->bi_error = r; + bio_endio(bio); +} + +static void do_endio_flush(struct dm_integrity_c *ic, struct dm_integrity_io *dio) +{ + struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (unlikely(dio->fua) && likely(!bio->bi_error) && likely(!dm_integrity_failed(ic))) + submit_flush_bio(ic, dio); + else + do_endio(ic, bio); +} + +static void dec_in_flight(struct dm_integrity_io *dio) +{ + if (atomic_dec_and_test(&dio->in_flight)) { + struct dm_integrity_c *ic = dio->ic; + struct bio *bio; + + remove_range(ic, &dio->range); + + if (unlikely(dio->write)) + schedule_autocommit(ic); + + bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); + + if (unlikely(dio->bi_error) && !bio->bi_error) + bio->bi_error = dio->bi_error; + if (likely(!bio->bi_error) && unlikely(bio_sectors(bio) != dio->range.n_sectors)) { + dio->range.logical_sector += dio->range.n_sectors; + bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); + INIT_WORK(&dio->work, integrity_bio_wait); + queue_work(ic->wait_wq, &dio->work); + return; + } + do_endio_flush(ic, dio); + } +} + |