summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/device-mapper/dm-crypt.txt53
-rw-r--r--Documentation/device-mapper/dm-integrity.txt199
-rw-r--r--Documentation/device-mapper/dm-raid.txt14
-rw-r--r--drivers/md/Kconfig19
-rw-r--r--drivers/md/Makefile7
-rw-r--r--drivers/md/dm-bio-prison-v1.c (renamed from drivers/md/dm-bio-prison.c)46
-rw-r--r--drivers/md/dm-bio-prison-v1.h (renamed from drivers/md/dm-bio-prison.h)2
-rw-r--r--drivers/md/dm-bio-prison-v2.c369
-rw-r--r--drivers/md/dm-bio-prison-v2.h152
-rw-r--r--drivers/md/dm-bufio.c70
-rw-r--r--drivers/md/dm-bufio.h7
-rw-r--r--drivers/md/dm-cache-background-tracker.c238
-rw-r--r--drivers/md/dm-cache-background-tracker.h46
-rw-r--r--drivers/md/dm-cache-metadata.c3
-rw-r--r--drivers/md/dm-cache-metadata.h2
-rw-r--r--drivers/md/dm-cache-policy-cleaner.c469
-rw-r--r--drivers/md/dm-cache-policy-internal.h76
-rw-r--r--drivers/md/dm-cache-policy-smq.c821
-rw-r--r--drivers/md/dm-cache-policy.h187
-rw-r--r--drivers/md/dm-cache-target.c2475
-rw-r--r--drivers/md/dm-core.h2
-rw-r--r--drivers/md/dm-crypt.c1253
-rw-r--r--drivers/md/dm-delay.c1
-rw-r--r--drivers/md/dm-era-target.c10
-rw-r--r--drivers/md/dm-integrity.c3238
-rw-r--r--drivers/md/dm-ioctl.c16
-rw-r--r--drivers/md/dm-linear.c1
-rw-r--r--drivers/md/dm-mpath.c171
-rw-r--r--drivers/md/dm-raid.c164
-rw-r--r--drivers/md/dm-rq.c8
-rw-r--r--drivers/md/dm-stripe.c1
-rw-r--r--drivers/md/dm-table.c99
-rw-r--r--drivers/md/dm-thin-metadata.c2
-rw-r--r--drivers/md/dm-thin.c3
-rw-r--r--drivers/md/dm-verity-fec.c4
-rw-r--r--drivers/md/dm-verity-target.c201
-rw-r--r--drivers/md/dm-verity.h23
-rw-r--r--drivers/md/dm.c35
-rw-r--r--drivers/md/dm.h8
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c1
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h2
-rw-r--r--drivers/md/persistent-data/dm-btree.c8
-rw-r--r--drivers/md/raid5-cache.c62
-rw-r--r--drivers/md/raid5.h11
-rw-r--r--include/linux/device-mapper.h26
45 files changed, 7610 insertions, 2995 deletions
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index ff1f87bf26e8..3b3e1de21c9c 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -11,14 +11,31 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
<offset> [<#opt_params> <opt_params>]
<cipher>
- Encryption cipher and an optional IV generation mode.
- (In format cipher[:keycount]-chainmode-ivmode[:ivopts]).
+ Encryption cipher, encryption mode and Initial Vector (IV) generator.
+
+ The cipher specifications format is:
+ cipher[:keycount]-chainmode-ivmode[:ivopts]
Examples:
- des
aes-cbc-essiv:sha256
- twofish-ecb
+ aes-xts-plain64
+ serpent-xts-plain64
+
+ Cipher format also supports direct specification with kernel crypt API
+ format (selected by capi: prefix). The IV specification is the same
+ as for the first format type.
+ This format is mainly used for specification of authenticated modes.
- /proc/crypto contains supported crypto modes
+ The crypto API cipher specifications format is:
+ capi:cipher_api_spec-ivmode[:ivopts]
+ Examples:
+ capi:cbc(aes)-essiv:sha256
+ capi:xts(aes)-plain64
+ Examples of authenticated modes:
+ capi:gcm(aes)-random
+ capi:authenc(hmac(sha256),xts(aes))-random
+ capi:rfc7539(chacha20,poly1305)-random
+
+ The /proc/crypto contains a list of curently loaded crypto modes.
<key>
Key used for encryption. It is encoded either as a hexadecimal number
@@ -93,6 +110,32 @@ submit_from_crypt_cpus
thread because it benefits CFQ to have writes submitted using the
same context.
+integrity:<bytes>:<type>
+ The device requires additional <bytes> metadata per-sector stored
+ in per-bio integrity structure. This metadata must by provided
+ by underlying dm-integrity target.
+
+ The <type> can be "none" if metadata is used only for persistent IV.
+
+ For Authenticated Encryption with Additional Data (AEAD)
+ the <type> is "aead". An AEAD mode additionally calculates and verifies
+ integrity for the encrypted device. The additional space is then
+ used for storing authentication tag (and persistent IV if needed).
+
+sector_size:<bytes>
+ Use <bytes> as the encryption unit instead of 512 bytes sectors.
+ This option can be in range 512 - 4096 bytes and must be power of two.
+ Virtual device will announce this size as a minimal IO and logical sector.
+
+iv_large_sectors
+ IV generators will use sector number counted in <sector_size> units
+ instead of default 512 bytes sectors.
+
+ For example, if <sector_size> is 4096 bytes, plain64 IV for the second
+ sector will be 8 (without flag) and 1 if iv_large_sectors is present.
+ The <iv_offset> must be multiple of <sector_size> (in 512 bytes units)
+ if this flag is specified.
+
Example scripts
===============
LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt
new file mode 100644
index 000000000000..f33e3ade7a09
--- /dev/null
+++ b/Documentation/device-mapper/dm-integrity.txt
@@ -0,0 +1,199 @@
+The dm-integrity target emulates a block device that has additional
+per-sector tags that can be used for storing integrity information.
+
+A general problem with storing integrity tags with every sector is that
+writing the sector and the integrity tag must be atomic - i.e. in case of
+crash, either both sector and integrity tag or none of them is written.
+
+To guarantee write atomicity, the dm-integrity target uses journal, it
+writes sector data and integrity tags into a journal, commits the journal
+and then copies the data and integrity tags to their respective location.
+
+The dm-integrity target can be used with the dm-crypt target - in this
+situation the dm-crypt target creates the integrity data and passes them
+to the dm-integrity target via bio_integrity_payload attached to the bio.
+In this mode, the dm-crypt and dm-integrity targets provide authenticated
+disk encryption - if the attacker modifies the encrypted device, an I/O
+error is returned instead of random data.
+
+The dm-integrity target can also be used as a standalone target, in this
+mode it calculates and verifies the integrity tag internally. In this
+mode, the dm-integrity target can be used to detect silent data
+corruption on the disk or in the I/O path.
+
+
+When loading the target for the first time, the kernel driver will format
+the device. But it will only format the device if the superblock contains
+zeroes. If the superblock is neither valid nor zeroed, the dm-integrity
+target can't be loaded.
+
+To use the target for the first time:
+1. overwrite the superblock with zeroes
+2. load the dm-integrity target with one-sector size, the kernel driver
+ will format the device
+3. unload the dm-integrity target
+4. read the "provided_data_sectors" value from the superblock
+5. load the dm-integrity target with the the target size
+ "provided_data_sectors"
+6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
+ with the size "provided_data_sectors"
+
+
+Target arguments:
+
+1. the underlying block device
+
+2. the number of reserved sector at the beginning of the device - the
+ dm-integrity won't read of write these sectors
+
+3. the size of the integrity tag (if "-" is used, the size is taken from
+ the internal-hash algorithm)
+
+4. mode:
+ D - direct writes (without journal) - in this mode, journaling is
+ not used and data sectors and integrity tags are written
+ separately. In case of crash, it is possible that the data
+ and integrity tag doesn't match.
+ J - journaled writes - data and integrity tags are written to the
+ journal and atomicity is guaranteed. In case of crash,
+ either both data and tag or none of them are written. The
+ journaled mode degrades write throughput twice because the
+ data have to be written twice.
+ R - recovery mode - in this mode, journal is not replayed,
+ checksums are not checked and writes to the device are not
+ allowed. This mode is useful for data recovery if the
+ device cannot be activated in any of the other standard
+ modes.
+
+5. the number of additional arguments
+
+Additional arguments:
+
+journal_sectors:number
+ The size of journal, this argument is used only if formatting the
+ device. If the device is already formatted, the value from the
+ superblock is used.
+
+interleave_sectors:number
+ The number of interleaved sectors. This values is rounded down to
+ a power of two. If the device is already formatted, the value from
+ the superblock is used.
+
+buffer_sectors:number
+ The number of sectors in one buffer. The value is rounded down to
+ a power of two.
+
+ The tag area is accessed using buffers, the buffer size is
+ configurable. The large buffer size means that the I/O size will
+ be larger, but there could be less I/Os issued.
+
+journal_watermark:number
+ The journal watermark in percents. When the size of the journal
+ exceeds this watermark, the thread that flushes the journal will
+ be started.
+
+commit_time:number
+ Commit time in milliseconds. When this time passes, the journal is
+ written. The journal is also written immediatelly if the FLUSH
+ request is received.
+
+internal_hash:algorithm(:key) (the key is optional)
+ Use internal hash or crc.
+ When this argument is used, the dm-integrity target won't accept
+ integrity tags from the upper target, but it will automatically
+ generate and verify the integrity tags.
+
+ You can use a crc algorithm (such as crc32), then integrity target
+ will protect the data against accidental corruption.
+ You can also use a hmac algorithm (for example
+ "hmac(sha256):0123456789abcdef"), in this mode it will provide
+ cryptographic authentication of the data without encryption.
+
+ When this argument is not used, the integrity tags are accepted
+ from an upper layer target, such as dm-crypt. The upper layer
+ target should check the validity of the integrity tags.
+
+journal_crypt:algorithm(:key) (the key is optional)
+ Encrypt the journal using given algorithm to make sure that the
+ attacker can't read the journal. You can use a block cipher here
+ (such as "cbc(aes)") or a stream cipher (for example "chacha20",
+ "salsa20", "ctr(aes)" or "ecb(arc4)").
+
+ The journal contains history of last writes to the block device,
+ an attacker reading the journal could see the last sector nubmers
+ that were written. From the sector numbers, the attacker can infer
+ the size of files that were written. To protect against this
+ situation, you can encrypt the journal.
+
+journal_mac:algorithm(:key) (the key is optional)
+ Protect sector numbers in the journal from accidental or malicious
+ modification. To protect against accidental modification, use a
+ crc algorithm, to protect against malicious modification, use a
+ hmac algorithm with a key.
+
+ This option is not needed when using internal-hash because in this
+ mode, the integrity of journal entries is checked when replaying
+ the journal. Thus, modified sector number would be detected at
+ this stage.
+
+block_size:number
+ The size of a data block in bytes. The larger the block size the
+ less overhead there is for per-block integrity metadata.
+ Supported values are 512, 1024, 2048 and 4096 bytes. If not
+ specified the default block size is 512 bytes.
+
+The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can
+be changed when reloading the target (load an inactive table and swap the
+tables with suspend and resume). The other arguments should not be changed
+when reloading the target because the layout of disk data depend on them
+and the reloaded target would be non-functional.
+
+
+The layout of the formatted block device:
+* reserved sectors (they are not used by this target, they can be used for
+ storing LUKS metadata or for other purpose), the size of the reserved
+ area is specified in the target arguments
+* superblock (4kiB)
+ * magic string - identifies that the device was formatted
+ * version
+ * log2(interleave sectors)
+ * integrity tag size
+ * the number of journal sections
+ * provided data sectors - the number of sectors that this target
+ provides (i.e. the size of the device minus the size of all
+ metadata and padding). The user of this target should not send
+ bios that access data beyond the "provided data sectors" limit.
+ * flags - a flag is set if journal_mac is used
+* journal
+ The journal is divided into sections, each section contains:
+ * metadata area (4kiB), it contains journal entries
+ every journal entry contains:
+ * logical sector (specifies where the data and tag should
+ be written)
+ * last 8 bytes of data
+ * integrity tag (the size is specified in the superblock)
+ every metadata sector ends with
+ * mac (8-bytes), all the macs in 8 metadata sectors form a
+ 64-byte value. It is used to store hmac of sector
+ numbers in the journal section, to protect against a
+ possibility that the attacker tampers with sector
+ numbers in the journal.
+ * commit id
+ * data area (the size is variable; it depends on how many journal
+ entries fit into the metadata area)
+ every sector in the data area contains:
+ * data (504 bytes of data, the last 8 bytes are stored in
+ the journal entry)
+ * commit id
+ To test if the whole journal section was written correctly, every
+ 512-byte sector of the journal ends with 8-byte commit id. If the
+ commit id matches on all sectors in a journal section, then it is
+ assumed that the section was written correctly. If the commit id
+ doesn't match, the section was written partially and it should not
+ be replayed.
+* one or more runs of interleaved tags and data. Each run contains:
+ * tag area - it contains integrity tags. There is one tag for each
+ sector in the data area
+ * data area - it contains data sectors. The number of data sectors
+ in one run must be a power of two. log2 of this value is stored
+ in the superblock.
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index cd2cb2fc85ea..7e06e65586d4 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters:
Takeover/reshape is not possible with a raid4/5/6 journal device;
it has to be deconfigured before requesting these.
+ [journal_mode <mode>]
+ This option sets the caching mode on journaled raid4/5/6 raid sets
+ (see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'.
+ If 'writeback' is selected the journal device has to be resilient
+ and must not suffer from the 'write hole' problem itself (e.g. use
+ raid1 or raid10) to avoid a single point of failure.
+
<#raid_devs>: The number of devices composing the array.
Each device consists of two entries. The first is the device
containing the metadata (if any); the second is the one containing the
@@ -254,7 +261,8 @@ recovery. Here is a fuller description of the individual fields:
<data_offset> The current data offset to the start of the user data on
each component device of a raid set (see the respective
raid parameter to support out-of-place reshaping).
- <journal_char> 'A' - active raid4/5/6 journal device.
+ <journal_char> 'A' - active write-through journal device.
+ 'a' - active write-back journal device.
'D' - dead journal device.
'-' - no journal device.
@@ -331,3 +339,7 @@ Version History
'D' on the status line. If '- -' is passed into the constructor, emit
'- -' on the table line and '-' as the status line health character.
1.10.0 Add support for raid4/5/6 journal device
+1.10.1 Fix data corruption on reshape request
+1.11.0 Fix table line argument order
+ (wrong raid10_copies/raid10_format sequence)
+1.11.1 Add raid4/5/6 journal write-back support via journal_mode option
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index b7767da50c26..ee2c21e3d232 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -325,14 +325,6 @@ config DM_CACHE_SMQ
of less memory utilization, improved performance and increased
adaptability in the face of changing workloads.
-config DM_CACHE_CLEANER
- tristate "Cleaner Cache Policy (EXPERIMENTAL)"
- depends on DM_CACHE
- default y
- ---help---
- A simple cache policy that writes back all data to the
- origin. Used when decommissioning a dm-cache.
-
config DM_ERA
tristate "Era target (EXPERIMENTAL)"
depends on BLK_DEV_DM
@@ -365,6 +357,7 @@ config DM_LOG_USERSPACE
config DM_RAID
tristate "RAID 1/4/5/6/10 target"
depends on BLK_DEV_DM
+ select MD_RAID0
select MD_RAID1
select MD_RAID10
select MD_RAID456
@@ -508,4 +501,14 @@ config DM_LOG_WRITES
If unsure, say N.
+config DM_INTEGRITY
+ tristate "Integrity target"
+ depends on BLK_DEV_DM
+ select BLK_DEV_INTEGRITY
+ select DM_BUFIO
+ select CRYPTO
+ select ASYNC_XOR
+ ---help---
+ This is the integrity target.
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..39cf2a1b5f90 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -11,10 +11,11 @@ dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \
dm-mirror-y += dm-raid1.o
dm-log-userspace-y \
+= dm-log-userspace-base.o dm-log-userspace-transfer.o
+dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o
dm-thin-pool-y += dm-thin.o dm-thin-metadata.o
-dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
+dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
+ dm-cache-background-tracker.o
dm-cache-smq-y += dm-cache-policy-smq.o
-dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
@@ -56,9 +57,9 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
obj-$(CONFIG_DM_VERITY) += dm-verity.o
obj-$(CONFIG_DM_CACHE) += dm-cache.o
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
-obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
+obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison-v1.c
index 03af174485d3..ae7da2c30a57 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -5,7 +5,8 @@
*/
#include "dm.h"
-#include "dm-bio-prison.h"
+#include "dm-bio-prison-v1.h"
+#include "dm-bio-prison-v2.h"
#include <linux/spinlock.h>
#include <linux/mempool.h>
@@ -398,7 +399,7 @@ EXPORT_SYMBOL_GPL(dm_deferred_set_add_work);
/*----------------------------------------------------------------*/
-static int __init dm_bio_prison_init(void)
+static int __init dm_bio_prison_init_v1(void)
{
_cell_cache = KMEM_CACHE(dm_bio_prison_cell, 0);
if (!_cell_cache)
@@ -407,12 +408,51 @@ static int __init dm_bio_prison_init(void)
return 0;
}
-static void __exit dm_bio_prison_exit(void)
+static void dm_bio_prison_exit_v1(void)
{
kmem_cache_destroy(_cell_cache);
_cell_cache = NULL;
}
+static int (*_inits[])(void) __initdata = {
+ dm_bio_prison_init_v1,
+ dm_bio_prison_init_v2,
+};
+
+static void (*_exits[])(void) = {
+ dm_bio_prison_exit_v1,
+ dm_bio_prison_exit_v2,
+};
+
+static int __init dm_bio_prison_init(void)
+{
+ const int count = ARRAY_SIZE(_inits);
+
+ int r, i;
+
+ for (i = 0; i < count; i++) {
+ r = _inits[i]();
+ if (r)
+ goto bad;
+ }
+
+ return 0;
+
+ bad:
+ while (i--)
+ _exits[i]();
+
+ return r;
+}
+
+static void __exit dm_bio_prison_exit(void)
+{
+ int i = ARRAY_SIZE(_exits);
+
+ while (i--)
+ _exits[i]();
+}
+
/*
* module hooks
*/
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison-v1.h
index 54352f009bfd..cddd4ac07e2c 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison-v1.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2011-2012 Red Hat, Inc.
+ * Copyright (C) 2011-2017 Red Hat, Inc.
*
* This file is released under the GPL.
*/
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
new file mode 100644
index 000000000000..c9b11f799cd8
--- /dev/null
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (C) 2012-2017 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-bio-prison-v2.h"
+
+#include <linux/spinlock.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+
+/*----------------------------------------------------------------*/
+
+#define MIN_CELLS 1024
+
+struct dm_bio_prison_v2 {
+ struct workqueue_struct *wq;
+
+ spinlock_t lock;
+ mempool_t *cell_pool;
+ struct rb_root cells;
+};
+
+static struct kmem_cache *_cell_cache;
+
+/*----------------------------------------------------------------*/
+
+/*
+ * @nr_cells should be the number of cells you want in use _concurrently_.
+ * Don't confuse it with the number of distinct keys.
+ */
+struct dm_bio_prison_v2 *dm_bio_prison_create_v2(struct workqueue_struct *wq)
+{
+ struct dm_bio_prison_v2 *prison = kmalloc(sizeof(*prison), GFP_KERNEL);
+
+ if (!prison)
+ return NULL;
+
+ prison->wq = wq;
+ spin_lock_init(&prison->lock);
+
+ prison->cell_pool = mempool_create_slab_pool(MIN_CELLS, _cell_cache);
+ if (!prison->cell_pool) {
+ kfree(prison);
+ return NULL;
+ }
+
+ prison->cells = RB_ROOT;
+
+ return prison;
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_create_v2);
+
+void dm_bio_prison_destroy_v2(struct dm_bio_prison_v2 *prison)
+{
+ mempool_destroy(prison->cell_pool);
+ kfree(prison);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_destroy_v2);
+
+struct dm_bio_prison_cell_v2 *dm_bio_prison_alloc_cell_v2(struct dm_bio_prison_v2 *prison, gfp_t gfp)
+{
+ return mempool_alloc(prison->cell_pool, gfp);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_alloc_cell_v2);
+
+void dm_bio_prison_free_cell_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell)
+{
+ mempool_free(cell, prison->cell_pool);
+}
+EXPORT_SYMBOL_GPL(dm_bio_prison_free_cell_v2);
+
+static void __setup_new_cell(struct dm_cell_key_v2 *key,
+ struct dm_bio_prison_cell_v2 *cell)
+{
+ memset(cell, 0, sizeof(*cell));
+ memcpy(&cell->key, key, sizeof(cell->key));
+ bio_list_init(&cell->bios);
+}
+
+static int cmp_keys(struct dm_cell_key_v2 *lhs,
+ struct dm_cell_key_v2 *rhs)
+{
+ if (lhs->virtual < rhs->virtual)
+ return -1;
+
+ if (lhs->virtual > rhs->virtual)
+ return 1;
+
+ if (lhs->dev < rhs->dev)
+ return -1;
+
+ if (lhs->dev > rhs->dev)
+ return 1;
+
+ if (lhs->block_end <= rhs->block_begin)
+ return -1;
+
+ if (lhs->block_begin >= rhs->block_end)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Returns true if node found, otherwise it inserts a new one.
+ */
+static bool __find_or_insert(struct dm_bio_prison_v2 *prison,
+ struct dm_cell_key_v2 *key,
+ struct dm_bio_prison_cell_v2 *cell_prealloc,
+ struct dm_bio_prison_cell_v2 **result)
+{
+ int r;
+ struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+
+ while (*new) {
+ struct dm_bio_prison_cell_v2 *cell =
+ container_of(*new, struct dm_bio_prison_cell_v2, node);
+
+ r = cmp_keys(key, &cell->key);
+
+ parent = *new;
+ if (r < 0)
+ new = &((*new)->rb_left);
+
+ else if (r > 0)
+ new = &((*new)->rb_right);
+
+ else {
+ *result = cell;
+ return true;
+ }
+ }
+
+ __setup_new_cell(key, cell_prealloc);
+ *result = cell_prealloc;
+ rb_link_node(&cell_prealloc->node, parent, new);
+ rb_insert_color(&cell_prealloc->node, &prison->cells);
+
+ return false;
+}
+
+static bool __get(struct dm_bio_prison_v2 *prison,
+ struct dm_cell_key_v2 *key,
+ unsigned lock_level,
+ struct bio *inmate,
+ struct dm_bio_prison_cell_v2 *cell_prealloc,
+ struct dm_bio_prison_cell_v2 **cell)
+{
+ if (__find_or_insert(prison, key, cell_prealloc, cell)) {
+ if ((*cell)->exclusive_lock) {
+ if (lock_level <= (*cell)->exclusive_level) {
+ bio_list_add(&(*cell)->bios, inmate);
+ return false;
+ }
+ }
+
+ (*cell)->shared_count++;
+
+ } else
+ (*cell)->shared_count = 1;
+
+ return true;
+}
+
+bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_cell_key_v2 *key,
+ unsigned lock_level,
+ struct bio *inmate,
+ struct dm_bio_prison_cell_v2 *cell_prealloc,
+ struct dm_bio_prison_cell_v2 **cell_result)
+{
+ int r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result);
+ spin_unlock_irqrestore(&prison->lock, flags);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_get_v2);
+
+static bool __put(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell)
+{
+ BUG_ON(!cell->shared_count);
+ cell->shared_count--;
+
+ // FIXME: shared locks granted above the lock level could starve this
+ if (!cell->shared_count) {
+ if (cell->exclusive_lock){
+ if (cell->quiesce_continuation) {
+ queue_work(prison->wq, cell->quiesce_continuation);
+ cell->quiesce_continuation = NULL;
+ }
+ } else {
+ rb_erase(&cell->node, &prison->cells);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool dm_cell_put_v2(struct dm_bio_prison_v2 *prison,
+ struct dm_bio_prison_cell_v2 *cell)
+{
+ bool r;
+ unsigned long flags;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ r = __put(prison, cell);
+ spin_unlock_irqrestore(&prison->lock, flags);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_put_v2);
+
+static int __lock(struct dm_bio_prison_v2 *prison,
+ struct dm_cell_key_v2 *key,
+ unsigned lock_level,
+ struct dm_bio_prison_cell_v2 *cell_prealloc,
+ struct dm_bio_prison_cell_v2 **cell_result)
+{
+ struct dm_bio_prison_cell_v