diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-05 15:45:03 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-05 15:45:03 -0700 |
commit | b25c6644bfd3affd7d0127ce95c5c96c155a7515 (patch) | |
tree | 9ef9c0fe74a08b7baf3a3c3753368b0f481b581f /drivers | |
parent | 818dbde78e0f4f11c9f804c36913a7ccfc2e87ad (diff) | |
parent | 64611a15ca9da91ff532982429c44686f4593b5f (diff) |
Merge tag 'for-5.8/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- The largest change for this cycle is the DM zoned target's metadata
version 2 feature that adds support for pairing regular block devices
with a zoned device to ease the performance impact associated with
the finite number of random zones on a zoned device.
The changes came in three batches: the first prepared for and then
added the ability to pair a single regular block device, the second
was a batch of fixes to improve zoned's reclaim heuristic, and the
third removed the limitation of only adding a single additional
regular block device to allow many devices.
Testing has shown linear scaling as more devices are added.
- Add new emulated block size (ebs) target that emulates a smaller
logical_block_size than a block device supports.
The primary use-case is to emulate "512e" devices that have 512 byte
logical_block_size and 4KB physical_block_size. This is useful to
some legacy applications that otherwise wouldn't be able to be used
on 4K devices because they depend on issuing IO in 512 byte
granularity.
- Add discard interfaces to DM bufio. First consumer of the interface
is the dm-ebs target that makes heavy use of dm-bufio.
- Fix DM crypt's block queue_limits stacking to not truncate
logical_block_size.
- Add Documentation for DM integrity's status line.
- Switch DMDEBUG from a compile time config option to instead use
dynamic debug via pr_debug.
- Fix DM multipath target's heuristic for how it manages
"queue_if_no_path" state internally.
DM multipath now avoids disabling "queue_if_no_path" unless it is
actually needed (e.g. in response to a configured timeout or an explicit
"fail_if_no_path" message).
This fixes spurious -EIO errors being returned to userspace
applications during fault tolerance testing with an NVMe backend.
Added various dynamic DMDEBUG messages to assist with debugging
queue_if_no_path in the future.
- Add a new DM multipath "Historical Service Time" Path Selector.
- Fix DM multipath's dm_blk_ioctl() to switch paths on IO error.
- Improve DM writecache target performance by using explicit cache
flushing for target's single-threaded usecase and a small cleanup to
remove unnecessary test in persistent_memory_claim.
- Other small cleanups in DM core, dm-persistent-data, and DM
integrity.
* tag 'for-5.8/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (62 commits)
dm crypt: avoid truncating the logical block size
dm mpath: add DM device name to Failing/Reinstating path log messages
dm mpath: enhance queue_if_no_path debugging
dm mpath: restrict queue_if_no_path state machine
dm mpath: simplify __must_push_back
dm zoned: check superblock location
dm zoned: prefer full zones for reclaim
dm zoned: select reclaim zone based on device index
dm zoned: allocate zone by device index
dm zoned: support arbitrary number of devices
dm zoned: move random and sequential zones into struct dmz_dev
dm zoned: per-device reclaim
dm zoned: add metadata pointer to struct dmz_dev
dm zoned: add device pointer to struct dm_zone
dm zoned: allocate temporary superblock for tertiary devices
dm zoned: convert to xarray
dm zoned: add a 'reserved' zone flag
dm zoned: improve logging messages for reclaim
dm zoned: avoid unnecessary device recalulation for secondary superblock
dm zoned: add debugging message for reading superblocks
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/Kconfig | 20 | ||||
-rw-r--r-- | drivers/md/Makefile | 3 | ||||
-rw-r--r-- | drivers/md/dm-bufio.c | 109 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 80 | ||||
-rw-r--r-- | drivers/md/dm-ebs-target.c | 471 | ||||
-rw-r--r-- | drivers/md/dm-historical-service-time.c | 561 | ||||
-rw-r--r-- | drivers/md/dm-integrity.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-log-writes.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 123 | ||||
-rw-r--r-- | drivers/md/dm-path-selector.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-queue-length.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-service-time.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-stats.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-switch.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-writecache.c | 42 | ||||
-rw-r--r-- | drivers/md/dm-zoned-metadata.c | 1046 | ||||
-rw-r--r-- | drivers/md/dm-zoned-reclaim.c | 210 | ||||
-rw-r--r-- | drivers/md/dm-zoned-target.c | 463 | ||||
-rw-r--r-- | drivers/md/dm-zoned.h | 113 | ||||
-rw-r--r-- | drivers/md/dm.c | 11 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-internal.h | 4 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-spine.c | 6 |
25 files changed, 2650 insertions, 636 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index d6d5ab23c088..6665b56865b7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -269,6 +269,7 @@ config DM_UNSTRIPED config DM_CRYPT tristate "Crypt target support" depends on BLK_DEV_DM + depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV @@ -336,6 +337,14 @@ config DM_WRITECACHE The writecache target doesn't cache reads because reads are supposed to be cached in standard RAM. +config DM_EBS + tristate "Emulated block size target (EXPERIMENTAL)" + depends on BLK_DEV_DM + select DM_BUFIO + help + dm-ebs emulates smaller logical block size on backing devices + with larger ones (e.g. 512 byte sectors on 4K native disks). + config DM_ERA tristate "Era target (EXPERIMENTAL)" depends on BLK_DEV_DM @@ -443,6 +452,17 @@ config DM_MULTIPATH_ST If unsure, say N. +config DM_MULTIPATH_HST + tristate "I/O Path Selector based on historical service time" + depends on DM_MULTIPATH + help + This path selector is a dynamic load balancer which selects + the path expected to complete the incoming I/O in the shortest + time by comparing estimated service time (based on historical + service time). + + If unsure, say N. 
+ config DM_DELAY tristate "I/O delaying target" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index d91a7edcd2ab..31840f95cd40 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -17,6 +17,7 @@ dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-background-tracker.o dm-cache-smq-y += dm-cache-policy-smq.o +dm-ebs-y += dm-ebs-target.o dm-era-y += dm-era-target.o dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o @@ -54,6 +55,7 @@ obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o +obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o obj-$(CONFIG_DM_SWITCH) += dm-switch.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ @@ -65,6 +67,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o +obj-$(CONFIG_DM_EBS) += dm-ebs.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_CLONE) += dm-clone.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index d1786cfd7f22..6d1565021d74 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -256,12 +256,35 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) if (b->block == block) return b; - n = (b->block < block) ? n->rb_left : n->rb_right; + n = block < b->block ? 
n->rb_left : n->rb_right; } return NULL; } +static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block) +{ + struct rb_node *n = c->buffer_tree.rb_node; + struct dm_buffer *b; + struct dm_buffer *best = NULL; + + while (n) { + b = container_of(n, struct dm_buffer, node); + + if (b->block == block) + return b; + + if (block <= b->block) { + n = n->rb_left; + best = b; + } else { + n = n->rb_right; + } + } + + return best; +} + static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) { struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL; @@ -276,8 +299,8 @@ static void __insert(struct dm_bufio_client *c, struct dm_buffer *b) } parent = *new; - new = (found->block < b->block) ? - &((*new)->rb_left) : &((*new)->rb_right); + new = b->block < found->block ? + &found->node.rb_left : &found->node.rb_right; } rb_link_node(&b->node, parent, new); @@ -631,6 +654,19 @@ dmio: submit_bio(bio); } +static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block) +{ + sector_t sector; + + if (likely(c->sectors_per_block_bits >= 0)) + sector = block << c->sectors_per_block_bits; + else + sector = block * (c->block_size >> SECTOR_SHIFT); + sector += c->start; + + return sector; +} + static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t)) { unsigned n_sectors; @@ -639,11 +675,7 @@ static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buff b->end_io = end_io; - if (likely(b->c->sectors_per_block_bits >= 0)) - sector = b->block << b->c->sectors_per_block_bits; - else - sector = b->block * (b->c->block_size >> SECTOR_SHIFT); - sector += b->c->start; + sector = block_to_sector(b->c, b->block); if (rw != REQ_OP_WRITE) { n_sectors = b->c->block_size >> SECTOR_SHIFT; @@ -1326,6 +1358,30 @@ int dm_bufio_issue_flush(struct dm_bufio_client *c) EXPORT_SYMBOL_GPL(dm_bufio_issue_flush); /* + * Use dm-io to send a discard request to flush the device. 
+ */ +int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count) +{ + struct dm_io_request io_req = { + .bi_op = REQ_OP_DISCARD, + .bi_op_flags = REQ_SYNC, + .mem.type = DM_IO_KMEM, + .mem.ptr.addr = NULL, + .client = c->dm_io, + }; + struct dm_io_region io_reg = { + .bdev = c->bdev, + .sector = block_to_sector(c, block), + .count = block_to_sector(c, count), + }; + + BUG_ON(dm_bufio_in_request()); + + return dm_io(&io_req, 1, &io_reg, NULL); +} +EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); + +/* * We first delete any other buffer that may be at that new location. * * Then, we write the buffer to the original location if it was dirty. @@ -1401,6 +1457,14 @@ retry: } EXPORT_SYMBOL_GPL(dm_bufio_release_move); +static void forget_buffer_locked(struct dm_buffer *b) +{ + if (likely(!b->hold_count) && likely(!b->state)) { + __unlink_buffer(b); + __free_buffer_wake(b); + } +} + /* * Free the given buffer. * @@ -1414,15 +1478,36 @@ void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) dm_bufio_lock(c); b = __find(c, block); - if (b && likely(!b->hold_count) && likely(!b->state)) { - __unlink_buffer(b); - __free_buffer_wake(b); - } + if (b) + forget_buffer_locked(b); dm_bufio_unlock(c); } EXPORT_SYMBOL_GPL(dm_bufio_forget); +void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks) +{ + struct dm_buffer *b; + sector_t end_block = block + n_blocks; + + while (block < end_block) { + dm_bufio_lock(c); + + b = __find_next(c, block); + if (b) { + block = b->block + 1; + forget_buffer_locked(b); + } + + dm_bufio_unlock(c); + + if (!b) + break; + } + +} +EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers); + void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) { c->minimum_buffers = n; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3df90daba89e..000ddfab5ba0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -34,7 +34,9 @@ #include <crypto/aead.h> #include 
<crypto/authenc.h> #include <linux/rtnetlink.h> /* for struct rtattr and RTA macros only */ +#include <linux/key-type.h> #include <keys/user-type.h> +#include <keys/encrypted-type.h> #include <linux/device-mapper.h> @@ -212,7 +214,7 @@ struct crypt_config { struct mutex bio_alloc_lock; u8 *authenc_key; /* space for keys in authenc() format (if used) */ - u8 key[0]; + u8 key[]; }; #define MIN_IOS 64 @@ -2215,12 +2217,47 @@ static bool contains_whitespace(const char *str) return false; } +static int set_key_user(struct crypt_config *cc, struct key *key) +{ + const struct user_key_payload *ukp; + + ukp = user_key_payload_locked(key); + if (!ukp) + return -EKEYREVOKED; + + if (cc->key_size != ukp->datalen) + return -EINVAL; + + memcpy(cc->key, ukp->data, cc->key_size); + + return 0; +} + +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) +static int set_key_encrypted(struct crypt_config *cc, struct key *key) +{ + const struct encrypted_key_payload *ekp; + + ekp = key->payload.data[0]; + if (!ekp) + return -EKEYREVOKED; + + if (cc->key_size != ekp->decrypted_datalen) + return -EINVAL; + + memcpy(cc->key, ekp->decrypted_data, cc->key_size); + + return 0; +} +#endif /* CONFIG_ENCRYPTED_KEYS */ + static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) { char *new_key_string, *key_desc; int ret; + struct key_type *type; struct key *key; - const struct user_key_payload *ukp; + int (*set_key)(struct crypt_config *cc, struct key *key); /* * Reject key_string with whitespace. 
dm core currently lacks code for @@ -2236,16 +2273,26 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string if (!key_desc || key_desc == key_string || !strlen(key_desc + 1)) return -EINVAL; - if (strncmp(key_string, "logon:", key_desc - key_string + 1) && - strncmp(key_string, "user:", key_desc - key_string + 1)) + if (!strncmp(key_string, "logon:", key_desc - key_string + 1)) { + type = &key_type_logon; + set_key = set_key_user; + } else if (!strncmp(key_string, "user:", key_desc - key_string + 1)) { + type = &key_type_user; + set_key = set_key_user; +#if defined(CONFIG_ENCRYPTED_KEYS) || defined(CONFIG_ENCRYPTED_KEYS_MODULE) + } else if (!strncmp(key_string, "encrypted:", key_desc - key_string + 1)) { + type = &key_type_encrypted; + set_key = set_key_encrypted; +#endif + } else { return -EINVAL; + } new_key_string = kstrdup(key_string, GFP_KERNEL); if (!new_key_string) return -ENOMEM; - key = request_key(key_string[0] == 'l' ? &key_type_logon : &key_type_user, - key_desc + 1, NULL); + key = request_key(type, key_desc + 1, NULL); if (IS_ERR(key)) { kzfree(new_key_string); return PTR_ERR(key); @@ -2253,23 +2300,14 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string down_read(&key->sem); - ukp = user_key_payload_locked(key); - if (!ukp) { - up_read(&key->sem); - key_put(key); - kzfree(new_key_string); - return -EKEYREVOKED; - } - - if (cc->key_size != ukp->datalen) { + ret = set_key(cc, key); + if (ret < 0) { up_read(&key->sem); key_put(key); kzfree(new_key_string); - return -EINVAL; + return ret; } - memcpy(cc->key, ukp->data, cc->key_size); - up_read(&key->sem); key_put(key); @@ -2323,7 +2361,7 @@ static int get_key_size(char **key_string) return (*key_string[0] == ':') ? 
-EINVAL : strlen(*key_string) >> 1; } -#endif +#endif /* CONFIG_KEYS */ static int crypt_set_key(struct crypt_config *cc, char *key) { @@ -3274,7 +3312,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_segment_size = PAGE_SIZE; limits->logical_block_size = - max_t(unsigned short, limits->logical_block_size, cc->sector_size); + max_t(unsigned, limits->logical_block_size, cc->sector_size); limits->physical_block_size = max_t(unsigned, limits->physical_block_size, cc->sector_size); limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size); @@ -3282,7 +3320,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 20, 0}, + .version = {1, 21, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c new file mode 100644 index 000000000000..44451276f128 --- /dev/null +++ b/drivers/md/dm-ebs-target.c @@ -0,0 +1,471 @@ +/* + * Copyright (C) 2020 Red Hat GmbH + * + * This file is released under the GPL. + * + * Device-mapper target to emulate smaller logical block + * size on backing devices exposing (natively) larger ones. + * + * E.g. 512 byte sector emulation on 4K native disks. + */ + +#include "dm.h" +#include <linux/module.h> +#include <linux/workqueue.h> +#include <linux/dm-bufio.h> + +#define DM_MSG_PREFIX "ebs" + +static void ebs_dtr(struct dm_target *ti); + +/* Emulated block size context. */ +struct ebs_c { + struct dm_dev *dev; /* Underlying device to emulate block size on. */ + struct dm_bufio_client *bufio; /* Use dm-bufio for read and read-modify-write processing. */ + struct workqueue_struct *wq; /* Workqueue for ^ processing of bios. */ + struct work_struct ws; /* Work item used for ^. */ + struct bio_list bios_in; /* Worker bios input list. */ + spinlock_t lock; /* Guard bios input list above. 
*/ + sector_t start; /* <start> table line argument, see ebs_ctr below. */ + unsigned int e_bs; /* Emulated block size in sectors exposed to upper layer. */ + unsigned int u_bs; /* Underlying block size in sectors retrievd from/set on lower layer device. */ + unsigned char block_shift; /* bitshift sectors -> blocks used in dm-bufio API. */ + bool u_bs_set:1; /* Flag to indicate underlying block size is set on table line. */ +}; + +static inline sector_t __sector_to_block(struct ebs_c *ec, sector_t sector) +{ + return sector >> ec->block_shift; +} + +static inline sector_t __block_mod(sector_t sector, unsigned int bs) +{ + return sector & (bs - 1); +} + +/* Return number of blocks for a bio, accounting for misalignement of start and end sectors. */ +static inline unsigned int __nr_blocks(struct ebs_c *ec, struct bio *bio) +{ + sector_t end_sector = __block_mod(bio->bi_iter.bi_sector, ec->u_bs) + bio_sectors(bio); + + return __sector_to_block(ec, end_sector) + (__block_mod(end_sector, ec->u_bs) ? 1 : 0); +} + +static inline bool __ebs_check_bs(unsigned int bs) +{ + return bs && is_power_of_2(bs); +} + +/* + * READ/WRITE: + * + * copy blocks between bufio blocks and bio vector's (partial/overlapping) pages. + */ +static int __ebs_rw_bvec(struct ebs_c *ec, int rw, struct bio_vec *bv, struct bvec_iter *iter) +{ + int r = 0; + unsigned char *ba, *pa; + unsigned int cur_len; + unsigned int bv_len = bv->bv_len; + unsigned int buf_off = to_bytes(__block_mod(iter->bi_sector, ec->u_bs)); + sector_t block = __sector_to_block(ec, iter->bi_sector); + struct dm_buffer *b; + + if (unlikely(!bv->bv_page || !bv_len)) + return -EIO; + + pa = page_address(bv->bv_page) + bv->bv_offset; + + /* Handle overlapping page <-> blocks */ + while (bv_len) { + cur_len = min(dm_bufio_get_block_size(ec->bufio) - buf_off, bv_len); + + /* Avoid reading for writes in case bio vector's page overwrites block completely. 
*/ + if (rw == READ || buf_off || bv_len < dm_bufio_get_block_size(ec->bufio)) + ba = dm_bufio_read(ec->bufio, block, &b); + else + ba = dm_bufio_new(ec->bufio, block, &b); + + if (unlikely(IS_ERR(ba))) { + /* + * Carry on with next buffer, if any, to issue all possible + * data but return error. + */ + r = PTR_ERR(ba); + } else { + /* Copy data to/from bio to buffer if read/new was successful above. */ + ba += buf_off; + if (rw == READ) { + memcpy(pa, ba, cur_len); + flush_dcache_page(bv->bv_page); + } else { + flush_dcache_page(bv->bv_page); + memcpy(ba, pa, cur_len); + dm_bufio_mark_partial_buffer_dirty(b, buf_off, buf_off + cur_len); + } + + dm_bufio_release(b); + } + + pa += cur_len; + bv_len -= cur_len; + buf_off = 0; + block++; + } + + return r; +} + +/* READ/WRITE: iterate bio vector's copying between (partial) pages and bufio blocks. */ +static int __ebs_rw_bio(struct ebs_c *ec, int rw, struct bio *bio) +{ + int r = 0, rr; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_bvec(bv, bio, iter) { + rr = __ebs_rw_bvec(ec, rw, &bv, &iter); + if (rr) + r = rr; + } + + return r; +} + +/* + * Discard bio's blocks, i.e. pass discards down. + * + * Avoid discarding partial blocks at beginning and end; + * return 0 in case no blocks can be discarded as a result. + */ +static int __ebs_discard_bio(struct ebs_c *ec, struct bio *bio) +{ + sector_t block, blocks, sector = bio->bi_iter.bi_sector; + + block = __sector_to_block(ec, sector); + blocks = __nr_blocks(ec, bio); + + /* + * Partial first underlying block (__nr_blocks() may have + * resulted in one block). + */ + if (__block_mod(sector, ec->u_bs)) { + block++; + blocks--; + } + + /* Partial last underlying block if any. */ + if (blocks && __block_mod(bio_end_sector(bio), ec->u_bs)) + blocks--; + + return blocks ? dm_bufio_issue_discard(ec->bufio, block, blocks) : 0; +} + +/* Release blocks them from the bufio cache. 
*/ +static void __ebs_forget_bio(struct ebs_c *ec, struct bio *bio) +{ + sector_t blocks, sector = bio->bi_iter.bi_sector; + + blocks = __nr_blocks(ec, bio); + + dm_bufio_forget_buffers(ec->bufio, __sector_to_block(ec, sector), blocks); +} + +/* Worker funtion to process incoming bios. */ +static void __ebs_process_bios(struct work_struct *ws) +{ + int r; + bool write = false; + sector_t block1, block2; + struct ebs_c *ec = container_of(ws, struct ebs_c, ws); + struct bio *bio; + struct bio_list bios; + + bio_list_init(&bios); + + spin_lock_irq(&ec->lock); + bios = ec->bios_in; + bio_list_init(&ec->bios_in); + spin_unlock_irq(&ec->lock); + + /* Prefetch all read and any mis-aligned write buffers */ + bio_list_for_each(bio, &bios) { + block1 = __sector_to_block(ec, bio->bi_iter.bi_sector); + if (bio_op(bio) == REQ_OP_READ) + dm_bufio_prefetch(ec->bufio, block1, __nr_blocks(ec, bio)); + else if (bio_op(bio) == REQ_OP_WRITE && !(bio->bi_opf & REQ_PREFLUSH)) { + block2 = __sector_to_block(ec, bio_end_sector(bio)); + if (__block_mod(bio->bi_iter.bi_sector, ec->u_bs)) + dm_bufio_prefetch(ec->bufio, block1, 1); + if (__block_mod(bio_end_sector(bio), ec->u_bs) && block2 != block1) + dm_bufio_prefetch(ec->bufio, block2, 1); + } + } + + bio_list_for_each(bio, &bios) { + r = -EIO; + if (bio_op(bio) == REQ_OP_READ) + r = __ebs_rw_bio(ec, READ, bio); + else if (bio_op(bio) == REQ_OP_WRITE) { + write = true; + r = __ebs_rw_bio(ec, WRITE, bio); + } else if (bio_op(bio) == REQ_OP_DISCARD) { + __ebs_forget_bio(ec, bio); + r = __ebs_discard_bio(ec, bio); + } + + if (r < 0) + bio->bi_status = errno_to_blk_status(r); + } + + /* + * We write dirty buffers after processing I/O on them + * but before we endio thus addressing REQ_FUA/REQ_SYNC. + */ + r = write ? dm_bufio_write_dirty_buffers(ec->bufio) : 0; + + while ((bio = bio_list_pop(&bios))) { + /* Any other request is endioed. 
*/ + if (unlikely(r && bio_op(bio) == REQ_OP_WRITE)) + bio_io_error(bio); + else + bio_endio(bio); + } +} + +/* + * Construct an emulated block size mapping: <dev_path> <offset> <ebs> [<ubs>] + * + * <dev_path>: path of the underlying device + * <offset>: offset in 512 bytes sectors into <dev_path> + * <ebs>: emulated block size in units of 512 bytes exposed to the upper layer + * [<ubs>]: underlying block size in units of 512 bytes imposed on the lower layer; + * optional, if not supplied, retrieve logical block size from underlying device + */ +static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int r; + unsigned short tmp1; + unsigned long long tmp; + char dummy; + struct ebs_c *ec; + + if (argc < 3 || argc > 4) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL); + if (!ec) { + ti->error = "Cannot allocate ebs context"; + return -ENOMEM; + } + + r = -EINVAL; + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || + tmp != (sector_t)tmp || + (sector_t)tmp >= ti->len) { + ti->error = "Invalid device offset sector"; + goto bad; + } + ec->start = tmp; + + if (sscanf(argv[2], "%hu%c", &tmp1, &dummy) != 1 || + !__ebs_check_bs(tmp1) || + to_bytes(tmp1) > PAGE_SIZE) { + ti->error = "Invalid emulated block size"; + goto bad; + } + ec->e_bs = tmp1; + + if (argc > 3) { + if (sscanf(argv[3], "%hu%c", &tmp1, &dummy) != 1 || !__ebs_check_bs(tmp1)) { + ti->error = "Invalid underlying block size"; + goto bad; + } + ec->u_bs = tmp1; + ec->u_bs_set = true; + } else + ec->u_bs_set = false; + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &ec->dev); + if (r) { + ti->error = "Device lookup failed"; + ec->dev = NULL; + goto bad; + } + + r = -EINVAL; + if (!ec->u_bs_set) { + ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev)); + if (!__ebs_check_bs(ec->u_bs)) { + ti->error = "Invalid retrieved underlying block size"; + goto bad; + } + } + + if (!ec->u_bs_set && 
ec->e_bs == ec->u_bs) + DMINFO("Emulation superfluous: emulated equal to underlying block size"); + + if (__block_mod(ec->start, ec->u_bs)) { + ti->error = "Device offset must be multiple of underlying block size"; + goto bad; + } + + ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL); + if (IS_ERR(ec->bufio)) { + ti->error = "Cannot create dm bufio client"; + r = PTR_ERR(ec->bufio); + ec->bufio = NULL; + goto bad; + } + + ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + if (!ec->wq) { + ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue"; + r = -ENOMEM; + goto bad; + } + + ec->block_shift = __ffs(ec->u_bs); + INIT_WORK(&ec->ws, &__ebs_process_bios); + bio_list_init(&ec->bios_in); + spin_lock_init(&ec->lock); + + ti->num_flush_bios = 1; + ti->num_discard_bios = 1; + ti->num_secure_erase_bios = 0; + ti->num_write_same_bios = 0; + ti->num_write_zeroes_bios = 0; + return 0; +bad: + ebs_dtr(ti); + return r; +} + +static void ebs_dtr(struct dm_target *ti) +{ + struct ebs_c *ec = ti->private; + + if (ec->wq) + destroy_workqueue(ec->wq); + if (ec->bufio) + dm_bufio_client_destroy(ec->bufio); + if (ec->dev) + dm_put_device(ti, ec->dev); + kfree(ec); +} + +static int ebs_map(struct dm_target *ti, struct bio *bio) +{ + struct ebs_c *ec = ti->private; + + bio_set_dev(bio, ec->dev->bdev); + bio->bi_iter.bi_sector = ec->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + + if (unlikely(bio->bi_opf & REQ_OP_FLUSH)) + return DM_MAPIO_REMAPPED; + /* + * Only queue for bufio processing in case of partial or overlapping buffers + * -or- + * emulation with ebs == ubs aiming for tests of dm-bufio overhead. 
+ */ + if (likely(__block_mod(bio->bi_iter.bi_sector, ec->u_bs) || + __block_mod(bio_end_sector(bio), ec->u_bs) || + ec->e_bs == ec->u_bs)) { + spin_lock_irq(&ec->lock); + bio_list_add(&ec->bios_in, bio); + spin_unlock_irq(&ec->lock); + + queue_work(ec->wq, &ec->ws); + + return DM_MAPIO_SUBMITTED; + } + + /* Forget any buffer content relative to this direct backing device I/O. */ + __ebs_forget_bio(ec, bio); + + return DM_MAPIO_REMAPPED; +} + +static void ebs_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, unsigned maxlen) +{ + struct ebs_c *ec = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + *result = '\0'; + break; + case STATUSTYPE_TABLE: + snprintf(result, maxlen, ec->u_bs_set ? "%s %llu %u %u" : "%s %llu %u", + ec->dev->name, (unsigned long long) ec->start, ec->e_bs, ec->u_bs); + break; + } +} + +static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +{ + struct ebs_c *ec = ti->private; + struct dm_dev *dev = ec->dev; + + /* + * Only pass ioctls through if the device sizes match exactly. 
+ */ + *bdev = dev->bdev; + return !!(ec->start || ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT); +} + +static void ebs_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct ebs_c *ec = ti->private; + + limits->logical_block_size = to_bytes(ec->e_bs); + limits->physical_block_size = to_bytes(ec->u_bs); + limits->alignment_offset = limits->physical_block_size; + blk_limits_io_min(limits, limits->logical_block_size); +} + +static int ebs_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, void *data) +{ + struct ebs_c *ec = ti->private; + + return fn(ti, ec->dev, ec->start, ti->len, data); +} + +static struct target_type ebs_target = { + .name = "ebs", + .version = {1, 0, 1}, + .features = DM_TARGET_PASSES_INTEGRITY, + .module = THIS_MODULE, + .ctr = ebs_ctr, + .dtr = ebs_dtr, + .map = ebs_map, + .status = ebs_status, + .io_hints = ebs_io_hints, + .prepare_ioctl = ebs_prepare_ioctl, + .iterate_devices = ebs_iterate_devices, +}; + +static int __init dm_ebs_init(void) +{ + int r = dm_register_target(&ebs_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void dm_ebs_exit(void) +{ + dm_unregister_target(&ebs_target); +} + +module_init(dm_ebs_init); +module_exit(dm_ebs_exit); + +MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>"); +MODUL |