From c217649bf2d60ac119afd71d938278cffd55962b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 13 Jan 2011 19:53:46 +0000
Subject: dm: dont take i_mutex to change device size

No longer needlessly hold md->bdev->bd_inode->i_mutex when changing the
size of a DM device.  This additional locking is unnecessary because
i_size_write() is already protected by the existing critical section in
dm_swap_table().  DM already has a reference on md->bdev so the
associated bd_inode may be changed without lifetime concerns.

A negative side-effect of having held md->bdev->bd_inode->i_mutex was
that a concurrent DM device resize and flush (via fsync) would deadlock.
Dropping md->bdev->bd_inode->i_mutex eliminates this potential for
deadlock.  The following reproducer no longer deadlocks:
  https://www.redhat.com/archives/dm-devel/2009-July/msg00284.html

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Cc: stable@kernel.org
---
 drivers/md/dm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f48a2f359ac4..4ef066a3090f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1992,13 +1992,14 @@ static void event_callback(void *context)
 	wake_up(&md->eventq);
 }
 
+/*
+ * Protected by md->suspend_lock obtained by dm_swap_table().
+ */
 static void __set_size(struct mapped_device *md, sector_t size)
 {
 	set_capacity(md->disk, size);
 
-	mutex_lock(&md->bdev->bd_inode->i_mutex);
 	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-	mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
 /*
-- 
cgit v1.2.3


From 09c9d4c9b6a2b5909ae3c6265e4cd3820b636863 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 13 Jan 2011 19:59:46 +0000
Subject: dm mpath: disable blk_abort_queue

Revert commit 224cb3e981f1b2f9f93dbd49eaef505d17d894c2
  dm: Call blk_abort_queue on failed paths

Multipath began to use blk_abort_queue() to allow for
lower latency path deactivation.  This was found to
cause list corruption:

   the cmd gets blk_abort_queued/timedout run on it and the scsi eh
   somehow is able to complete and run scsi_queue_insert while
   scsi_request_fn is still trying to process the request.

   https://www.redhat.com/archives/dm-devel/2010-November/msg00085.html

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Cc: Mike Anderson <andmike@linux.vnet.ibm.com>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: stable@kernel.org
---
 drivers/md/dm-mpath.c | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 487ecda90ad4..406091f9692b 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -33,7 +33,6 @@ struct pgpath {
 	unsigned fail_count;		/* Cumulative failure count */
 
 	struct dm_path path;
-	struct work_struct deactivate_path;
 	struct work_struct activate_path;
 };
 
@@ -116,7 +115,6 @@ static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
 static void process_queued_ios(struct work_struct *work);
 static void trigger_event(struct work_struct *work);
 static void activate_path(struct work_struct *work);
-static void deactivate_path(struct work_struct *work);
 
 
 /*-----------------------------------------------
@@ -129,7 +127,6 @@ static struct pgpath *alloc_pgpath(void)
 
 	if (pgpath) {
 		pgpath->is_active = 1;
-		INIT_WORK(&pgpath->deactivate_path, deactivate_path);
 		INIT_WORK(&pgpath->activate_path, activate_path);
 	}
 
@@ -141,14 +138,6 @@ static void free_pgpath(struct pgpath *pgpath)
 	kfree(pgpath);
 }
 
-static void deactivate_path(struct work_struct *work)
-{
-	struct pgpath *pgpath =
-		container_of(work, struct pgpath, deactivate_path);
-
-	blk_abort_queue(pgpath->path.dev->bdev->bd_disk->queue);
-}
-
 static struct priority_group *alloc_priority_group(void)
 {
 	struct priority_group *pg;
@@ -995,7 +984,6 @@ static int fail_path(struct pgpath *pgpath)
 		      pgpath->path.dev->name, m->nr_valid_paths);
 
 	schedule_work(&m->trigger_event);
-	queue_work(kmultipathd, &pgpath->deactivate_path);
 
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
-- 
cgit v1.2.3


From d9bf0b508ddfe19883b982b29a03c02ccbf53806 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 13 Jan 2011 19:59:47 +0000
Subject: dm io: remove BIO_RW_SYNCIO flag from kcopyd

Remove the REQ_SYNC flag to improve write throughput when writing
to the origin with a snapshot on the same device (using the CFQ I/O
scheduler).

Sequential write throughput (chunksize of 4k, 32k, 512k)
  unpatched:  8.5,  8.6,  9.3 MB/s
  patched:   15.2, 18.5, 17.5 MB/s

Snapshot exception reallocations are triggered by writes that are
usually async, so mark the associated dm_io_request as async as well.
This helps when using the CFQ I/O scheduler because it has separate
queues for sync and async I/O.  Async is optimized for throughput; sync
for latency.  With this change we're consciously favoring throughput over
latency.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index d8587bac5682..5ad9231c8700 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -345,7 +345,7 @@ static int run_io_job(struct kcopyd_job *job)
 {
 	int r;
 	struct dm_io_request io_req = {
-		.bi_rw = job->rw | REQ_SYNC | REQ_UNPLUG,
+		.bi_rw = job->rw | REQ_UNPLUG,
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
 		.mem.offset = job->offset,
-- 
cgit v1.2.3


From 84c89557a302e18414a011cc52b1abd034860743 Mon Sep 17 00:00:00 2001
From: Peter Jones <pjones@redhat.com>
Date: Thu, 13 Jan 2011 19:59:47 +0000
Subject: dm ioctl: allow rename to fill empty uuid

Allow the uuid of a mapped device to be set after device creation.
Previously the uuid (which is optional) could only be set by
DM_DEV_CREATE.  If no uuid was supplied it could not be set later.

Sometimes it's necessary to create the device before the uuid is known,
and in such cases the uuid must be filled in after the creation.

This patch extends DM_DEV_RENAME to accept a uuid accompanied by
a new flag DM_UUID_FLAG.  This can only be done once and if no
uuid was previously supplied.  It cannot be used to change an
existing uuid.

DM_VERSION_MINOR is also bumped to 19 to indicate this interface
extension is available.

Signed-off-by: Peter Jones <pjones@redhat.com>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c | 103 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 78 insertions(+), 25 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 4b54618b4159..f3481d922206 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -295,19 +295,55 @@ retry:
 		DMWARN("remove_all left %d open device(s)", dev_skipped);
 }
 
+/*
+ * Set the uuid of a hash_cell that isn't already set.
+ */
+static void __set_cell_uuid(struct hash_cell *hc, char *new_uuid)
+{
+	mutex_lock(&dm_hash_cells_mutex);
+	hc->uuid = new_uuid;
+	mutex_unlock(&dm_hash_cells_mutex);
+
+	list_add(&hc->uuid_list, _uuid_buckets + hash_str(new_uuid));
+}
+
+/*
+ * Changes the name of a hash_cell and returns the old name for
+ * the caller to free.
+ */
+static char *__change_cell_name(struct hash_cell *hc, char *new_name)
+{
+	char *old_name;
+
+	/*
+	 * Rename and move the name cell.
+	 */
+	list_del(&hc->name_list);
+	old_name = hc->name;
+
+	mutex_lock(&dm_hash_cells_mutex);
+	hc->name = new_name;
+	mutex_unlock(&dm_hash_cells_mutex);
+
+	list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+
+	return old_name;
+}
+
 static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 					    const char *new)
 {
-	char *new_name, *old_name;
+	char *new_data, *old_name = NULL;
 	struct hash_cell *hc;
 	struct dm_table *table;
 	struct mapped_device *md;
+	unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
 
 	/*
 	 * duplicate new.
 	 */
-	new_name = kstrdup(new, GFP_KERNEL);
-	if (!new_name)
+	new_data = kstrdup(new, GFP_KERNEL);
+	if (!new_data)
 		return ERR_PTR(-ENOMEM);
 
 	down_write(&_hash_lock);
@@ -315,13 +351,19 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	/*
 	 * Is new free ?
 	 */
-	hc = __get_name_cell(new);
+	if (change_uuid)
+		hc = __get_uuid_cell(new);
+	else
+		hc = __get_name_cell(new);
+
 	if (hc) {
-		DMWARN("asked to rename to an already-existing name %s -> %s",
+		DMWARN("Unable to change %s on mapped device %s to one that "
+		       "already exists: %s",
+		       change_uuid ? "uuid" : "name",
 		       param->name, new);
 		dm_put(hc->md);
 		up_write(&_hash_lock);
-		kfree(new_name);
+		kfree(new_data);
 		return ERR_PTR(-EBUSY);
 	}
 
@@ -330,22 +372,30 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
 	 */
 	hc = __get_name_cell(param->name);
 	if (!hc) {
-		DMWARN("asked to rename a non-existent device %s -> %s",
-		       param->name, new);
+		DMWARN("Unable to rename non-existent device, %s to %s%s",
+		       param->name, change_uuid ? "uuid " : "", new);
 		up_write(&_hash_lock);
-		kfree(new_name);
+		kfree(new_data);
 		return ERR_PTR(-ENXIO);
 	}
 
 	/*
-	 * rename and move the name cell.
+	 * Does this device already have a uuid?
 	 */
-	list_del(&hc->name_list);
-	old_name = hc->name;
-	mutex_lock(&dm_hash_cells_mutex);
-	hc->name = new_name;
-	mutex_unlock(&dm_hash_cells_mutex);
-	list_add(&hc->name_list, _name_buckets + hash_str(new_name));
+	if (change_uuid && hc->uuid) {
+		DMWARN("Unable to change uuid of mapped device %s to %s "
+		       "because uuid is already set to %s",
+		       param->name, new, hc->uuid);
+		dm_put(hc->md);
+		up_write(&_hash_lock);
+		kfree(new_data);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (change_uuid)
+		__set_cell_uuid(hc, new_data);
+	else
+		old_name = __change_cell_name(hc, new_data);
 
 	/*
 	 * Wake up any dm event waiters.
@@ -774,21 +824,24 @@ static int invalid_str(char *str, void *end)
 static int dev_rename(struct dm_ioctl *param, size_t param_size)
 {
 	int r;
-	char *new_name = (char *) param + param->data_start;
+	char *new_data = (char *) param + param->data_start;
 	struct mapped_device *md;
+	unsigned change_uuid = (param->flags & DM_UUID_FLAG) ? 1 : 0;
 
-	if (new_name < param->data ||
-	    invalid_str(new_name, (void *) param + param_size) ||
-	    strlen(new_name) > DM_NAME_LEN - 1) {
-		DMWARN("Invalid new logical volume name supplied.");
+	if (new_data < param->data ||
+	    invalid_str(new_data, (void *) param + param_size) ||
+	    strlen(new_data) > (change_uuid ? DM_UUID_LEN - 1 : DM_NAME_LEN - 1)) {
+		DMWARN("Invalid new mapped device name or uuid string supplied.");
 		return -EINVAL;
 	}
 
-	r = check_name(new_name);
-	if (r)
-		return r;
+	if (!change_uuid) {
+		r = check_name(new_data);
+		if (r)
+			return r;
+	}
 
-	md = dm_hash_rename(param, new_name);
+	md = dm_hash_rename(param, new_data);
 	if (IS_ERR(md))
 		return PTR_ERR(md);
 
-- 
cgit v1.2.3


From 5fc2ffeabb9ee0fc0e71ff16b49f34f0ed3d05b4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 13 Jan 2011 19:59:48 +0000
Subject: dm raid1: support discard

Enable discard support in the DM mirror target.
Also change an existing use of 'bvec' to 'addr' in the union.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid1.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 19a59b041c27..c87756eb7a21 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -261,7 +261,7 @@ static int mirror_flush(struct dm_target *ti)
 	struct dm_io_request io_req = {
 		.bi_rw = WRITE_FLUSH,
 		.mem.type = DM_IO_KMEM,
-		.mem.ptr.bvec = NULL,
+		.mem.ptr.addr = NULL,
 		.client = ms->io_client,
 	};
 
@@ -637,6 +637,12 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
 		.client = ms->io_client,
 	};
 
+	if (bio->bi_rw & REQ_DISCARD) {
+		io_req.bi_rw |= REQ_DISCARD;
+		io_req.mem.type = DM_IO_KMEM;
+		io_req.mem.ptr.addr = NULL;
+	}
+
 	for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
 		map_region(dest++, m, bio);
 
@@ -670,7 +676,8 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
-		if (bio->bi_rw & REQ_FLUSH) {
+		if ((bio->bi_rw & REQ_FLUSH) ||
+		    (bio->bi_rw & REQ_DISCARD)) {
 			bio_list_add(&sync, bio);
 			continue;
 		}
@@ -1076,6 +1083,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->private = ms;
 	ti->split_io = dm_rh_get_region_size(ms->rh);
 	ti->num_flush_requests = 1;
+	ti->num_discard_requests = 1;
 
 	ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
 	if (!ms->kmirrord_wq) {
-- 
cgit v1.2.3


From 4a1aeb98297e17f4e0a8cdda919e63bf528b2e5d Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 13 Jan 2011 19:59:48 +0000
Subject: dm: remove dm_mutex after bkl conversion

This patch replaces dm_mutex with _minor_lock in dm_blk_close()
and then removes it.

During the BKL conversion, commit 6e9624b8caec290d28b4c6d9ec75749df6372b87
(block: push down BKL into .open and .release) pushed lock_kernel()
down into dm_blk_open/close calls.
Commit 2a48fc0ab24241755dc93bfd4f01d68efab47f5a
(block: autoconvert trivial BKL users to private mutex) converted it to a
local mutex, but _minor_lock is sufficient.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4ef066a3090f..0de692176ad4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -32,7 +32,6 @@
 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
 #define DM_COOKIE_LENGTH 24
 
-static DEFINE_MUTEX(dm_mutex);
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
@@ -328,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 {
 	struct mapped_device *md;
 
-	mutex_lock(&dm_mutex);
 	spin_lock(&_minor_lock);
 
 	md = bdev->bd_disk->private_data;
@@ -346,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 
 out:
 	spin_unlock(&_minor_lock);
-	mutex_unlock(&dm_mutex);
 
 	return md ? 0 : -ENXIO;
 }
@@ -355,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
 {
 	struct mapped_device *md = disk->private_data;
 
-	mutex_lock(&dm_mutex);
+	spin_lock(&_minor_lock);
+
 	atomic_dec(&md->open_count);
 	dm_put(md);
-	mutex_unlock(&dm_mutex);
+
+	spin_unlock(&_minor_lock);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 69a8cfcda21017364df1c21b720daf304b5598a6 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 13 Jan 2011 19:59:49 +0000
Subject: dm crypt: set key size early

Simplify key size verification (hexadecimal string) and
set key size early in constructor.

(Patch required by later changes.)

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index d5b0e4c0e702..4c4408a2602f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -973,15 +973,15 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
 
 static int crypt_set_key(struct crypt_config *cc, char *key)
 {
-	unsigned key_size = strlen(key) >> 1;
-
-	if (cc->key_size && cc->key_size != key_size)
+	/* The key size may not be changed. */
+	if (cc->key_size != (strlen(key) >> 1))
 		return -EINVAL;
 
-	cc->key_size = key_size; /* initial settings */
+	/* Hyphen (which gives a key_size of zero) means there is no key. */
+	if (!cc->key_size && strcmp(key, "-"))
+		return -EINVAL;
 
-	if ((!key_size && strcmp(key, "-")) ||
-	   (key_size && crypt_decode_key(cc->key, key, key_size) < 0))
+	if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
 		return -EINVAL;
 
 	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
@@ -1194,6 +1194,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ti->error = "Cannot allocate encryption context";
 		return -ENOMEM;
 	}
+	cc->key_size = key_size;
 
 	ti->private = cc;
 	ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
-- 
cgit v1.2.3


From 4a038677df4da84e42fd68b5ab2dfa4d82baa444 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 13 Jan 2011 19:59:49 +0000
Subject: dm log userspace: trap all failed log construction errors

When constructing a mirror log, it is possible for the initial request
to fail for other reasons besides -ESRCH.  These must be handled too.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 1ed0094f064b..1c25ad3d02a2 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -181,8 +181,11 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR,
 				 ctr_str, str_size, NULL, NULL);
 
-	if (r == -ESRCH) {
-		DMERR("Userspace log server not found");
+	if (r < 0) {
+		if (r == -ESRCH)
+			DMERR("Userspace log server not found");
+		else
+			DMERR("Userspace log server failed to create log");
 		goto out;
 	}
 
@@ -214,10 +217,9 @@ out:
 
 static void userspace_dtr(struct dm_dirty_log *log)
 {
-	int r;
 	struct log_c *lc = log->context;
 
-	r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
+	(void) dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR,
 				 NULL, 0,
 				 NULL, NULL);
 
-- 
cgit v1.2.3


From 8d35d3e37eed884ba15229a146df846f399909b4 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 13 Jan 2011 19:59:50 +0000
Subject: dm kcopyd: delay unplugging

Make kcopyd merge more I/O requests by using device unplugging.

Without this patch, each I/O request is dispatched separately to the device.
If the device supports tagged queuing, there are many small requests sent
to the device. To improve performance, this patch will batch as many requests
as possible, allowing the queue to merge consecutive requests, and send them
to the device at once.

In my tests (15k SCSI disk), this patch improves sequential write throughput:

  Sequential write throughput (chunksize of 4k, 32k, 512k)
  unpatched: 15.2, 18.5, 17.5 MB/s
  patched:   14.4, 22.6, 23.0 MB/s

In most common uses (snapshot or two-way mirror), kcopyd is only used for
two devices, one for reading and the other for writing, thus this optimization
is implemented only for two devices. The optimization may be extended to n-way
mirrors with some code complexity increase.

We keep track of two block devices to unplug (one for read and the
other for write) and unplug them when exiting "do_work" thread.  If
there are more devices used (in theory it could happen, in practice it
is rare), we unplug immediately.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-kcopyd.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 5ad9231c8700..dad32f8bce7d 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -37,6 +37,13 @@ struct dm_kcopyd_client {
 	unsigned int nr_pages;
 	unsigned int nr_free_pages;
 
+	/*
+	 * Block devices to unplug.
+	 * Non-NULL pointer means that a block device has some pending requests
+	 * and needs to be unplugged.
+	 */
+	struct block_device *unplug[2];
+
 	struct dm_io_client *io_client;
 
 	wait_queue_head_t destroyq;
@@ -308,6 +315,31 @@ static int run_complete_job(struct kcopyd_job *job)
 	return 0;
 }
 
+/*
+ * Unplug the block device at the specified index.
+ */
+static void unplug(struct dm_kcopyd_client *kc, int rw)
+{
+	if (kc->unplug[rw] != NULL) {
+		blk_unplug(bdev_get_queue(kc->unplug[rw]));
+		kc->unplug[rw] = NULL;
+	}
+}
+
+/*
+ * Prepare block device unplug. If there's another device
+ * to be unplugged at the same array index, we unplug that
+ * device first.
+ */
+static void prepare_unplug(struct dm_kcopyd_client *kc, int rw,
+			   struct block_device *bdev)
+{
+	if (likely(kc->unplug[rw] == bdev))
+		return;
+	unplug(kc, rw);
+	kc->unplug[rw] = bdev;
+}
+
 static void complete_io(unsigned long error, void *context)
 {
 	struct kcopyd_job *job = (struct kcopyd_job *) context;
@@ -345,7 +377,7 @@ static int run_io_job(struct kcopyd_job *job)
 {
 	int r;
 	struct dm_io_request io_req = {
-		.bi_rw = job->rw | REQ_UNPLUG,
+		.bi_rw = job->rw,
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
 		.mem.offset = job->offset,
@@ -354,10 +386,16 @@ static int run_io_job(struct kcopyd_job *job)
 		.client = job->kc->io_client,
 	};
 
-	if (job->rw == READ)
+	if (job->rw == READ) {
 		r = dm_io(&io_req, 1, &job->source, NULL);
-	else
+		prepare_unplug(job->kc, READ, job->source.bdev);
+	} else {
+		if (job->num_dests > 1)
+			io_req.bi_rw |= REQ_UNPLUG;
 		r = dm_io(&io_req, job->num_dests, job->dests, NULL);
+		if (!(io_req.bi_rw & REQ_UNPLUG))
+			prepare_unplug(job->kc, WRITE, job->dests[0].bdev);
+	}
 
 	return r;
 }
@@ -435,10 +473,18 @@ static void do_work(struct work_struct *work)
 	 * Pages jobs when successful will jump onto the io jobs
 	 * list.  io jobs call wake when they complete and it all
 	 * starts again.
+	 *
+	 * Note that io_jobs add block devices to the unplug array,
+	 * this array is cleared with "unplug" calls. It is thus
+	 * forbidden to run complete_jobs after io_jobs and before
+	 * unplug because the block device could be destroyed in
+	 * job completion callback.
 	 */
 	process_jobs(&kc->complete_jobs, kc, run_complete_job);
 	process_jobs(&kc->pages_jobs, kc, run_pages_job);
 	process_jobs(&kc->io_jobs, kc, run_io_job);
+	unplug(kc, READ);
+	unplug(kc, WRITE);
 }
 
 /*
@@ -619,6 +665,8 @@ int dm_kcopyd_client_create(unsigned int nr_pages,
 	INIT_LIST_HEAD(&kc->io_jobs);
 	INIT_LIST_HEAD(&kc->pages_jobs);
 
+	memset(kc->unplug, 0, sizeof(kc->unplug));
+
 	kc->job_pool = mempool_create_slab_pool(MIN_JOBS, _job_cache);
 	if (!kc->job_pool)
 		goto bad_slab;
-- 
cgit v1.2.3


From 909cc4fb48dd9870f6ebe4bd32cfbe37c102df62 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 13 Jan 2011 19:59:50 +0000
Subject: dm log userspace: split flush queue

Split the 'flush_list', which contained a mix of both 'mark' and 'clear'
requests, into two distinct lists ('mark_list' and 'clear_list').

The device mapper log implementations (used by various DM targets) are
allowed to cache 'mark' and 'clear' requests until a 'flush' is
received.  Until now, these cached requests were kept in the same list.
They will now be put into distinct lists to facilitate group processing
of these requests (in the next patch).

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 41 +++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 1c25ad3d02a2..767adf300fa5 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -37,8 +37,15 @@ struct log_c {
 	 */
 	uint64_t in_sync_hint;
 
+	/*
+	 * Mark and clear requests are held until a flush is issued
+	 * so that we can group, and thereby limit, the amount of
+	 * network traffic between kernel and userspace.  The 'flush_lock'
+	 * is used to protect these lists.
+	 */
 	spinlock_t flush_lock;
-	struct list_head flush_list;  /* only for clear and mark requests */
+	struct list_head mark_list;
+	struct list_head clear_list;
 };
 
 static mempool_t *flush_entry_pool;
@@ -169,7 +176,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 
 	strncpy(lc->uuid, argv[0], DM_UUID_LEN);
 	spin_lock_init(&lc->flush_lock);
-	INIT_LIST_HEAD(&lc->flush_list);
+	INIT_LIST_HEAD(&lc->mark_list);
+	INIT_LIST_HEAD(&lc->clear_list);
 
 	str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
 	if (str_size < 0) {
@@ -362,14 +370,16 @@ static int userspace_flush(struct dm_dirty_log *log)
 	int r = 0;
 	unsigned long flags;
 	struct log_c *lc = log->context;
-	LIST_HEAD(flush_list);
+	LIST_HEAD(mark_list);
+	LIST_HEAD(clear_list);
 	struct flush_entry *fe, *tmp_fe;
 
 	spin_lock_irqsave(&lc->flush_lock, flags);
-	list_splice_init(&lc->flush_list, &flush_list);
+	list_splice_init(&lc->mark_list, &mark_list);
+	list_splice_init(&lc->clear_list, &clear_list);
 	spin_unlock_irqrestore(&lc->flush_lock, flags);
 
-	if (list_empty(&flush_list))
+	if (list_empty(&mark_list) && list_empty(&clear_list))
 		return 0;
 
 	/*
@@ -379,7 +389,16 @@ static int userspace_flush(struct dm_dirty_log *log)
 	 * do it one by one.
 	 */
 
-	list_for_each_entry(fe, &flush_list, list) {
+	list_for_each_entry(fe, &mark_list, list) {
+		r = userspace_do_request(lc, lc->uuid, fe->type,
+					 (char *)&fe->region,
+					 sizeof(fe->region),
+					 NULL, NULL);
+		if (r)
+			goto fail;
+	}
+
+	list_for_each_entry(fe, &clear_list, list) {
 		r = userspace_do_request(lc, lc->uuid, fe->type,
 					 (char *)&fe->region,
 					 sizeof(fe->region),
@@ -397,7 +416,11 @@ fail:
 	 * Calling code will receive an error and will know that
 	 * the log facility has failed.
 	 */
-	list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
+	list_for_each_entry_safe(fe, tmp_fe, &mark_list, list) {
+		list_del(&fe->list);
+		mempool_free(fe, flush_entry_pool);
+	}
+	list_for_each_entry_safe(fe, tmp_fe, &clear_list, list) {
 		list_del(&fe->list);
 		mempool_free(fe, flush_entry_pool);
 	}
@@ -427,7 +450,7 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
 	spin_lock_irqsave(&lc->flush_lock, flags);
 	fe->type = DM_ULOG_MARK_REGION;
 	fe->region = region;
-	list_add(&fe->list, &lc->flush_list);
+	list_add(&fe->list, &lc->mark_list);
 	spin_unlock_irqrestore(&lc->flush_lock, flags);
 
 	return;
@@ -464,7 +487,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
 	spin_lock_irqsave(&lc->flush_lock, flags);
 	fe->type = DM_ULOG_CLEAR_REGION;
 	fe->region = region;
-	list_add(&fe->list, &lc->flush_list);
+	list_add(&fe->list, &lc->clear_list);
 	spin_unlock_irqrestore(&lc->flush_lock, flags);
 
 	return;
-- 
cgit v1.2.3


From 085ae0651b2791f3a430ddb76da92925b9952e13 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 13 Jan 2011 19:59:51 +0000
Subject: dm log userspace: group clear and mark requests

Allow the device-mapper log's 'mark' and 'clear' requests to be
grouped and processed in a batch.  This can significantly reduce the
amount of traffic going between the kernel and userspace (where the
processing daemon resides).

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 102 ++++++++++++++++++++++++++++---------
 1 file changed, 79 insertions(+), 23 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 767adf300fa5..31e1687e7bf6 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -18,6 +18,14 @@ struct flush_entry {
 	struct list_head list;
 };
 
+/*
+ * This limit on the number of mark and clear request is, to a degree,
+ * arbitrary.  However, there is some basis for the choice in the limits
+ * imposed on the size of data payload by dm-log-userspace-transfer.c:
+ * dm_consult_userspace().
+ */
+#define MAX_FLUSH_GROUP_COUNT 32
+
 struct log_c {
 	struct dm_target *ti;
 	uint32_t region_size;
@@ -348,6 +356,71 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
 	return (r) ? 0 : (int)in_sync;
 }
 
+static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
+{
+	int r = 0;
+	struct flush_entry *fe;
+
+	list_for_each_entry(fe, flush_list, list) {
+		r = userspace_do_request(lc, lc->uuid, fe->type,
+					 (char *)&fe->region,
+					 sizeof(fe->region),
+					 NULL, NULL);
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
+static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
+{
+	int r = 0;
+	int count;
+	uint32_t type = 0;
+	struct flush_entry *fe, *tmp_fe;
+	LIST_HEAD(tmp_list);
+	uint64_t group[MAX_FLUSH_GROUP_COUNT];
+
+	/*
+	 * Group process the requests
+	 */
+	while (!list_empty(flush_list)) {
+		count = 0;
+
+		list_for_each_entry_safe(fe, tmp_fe, flush_list, list) {
+			group[count] = fe->region;
+			count++;
+
+			list_del(&fe->list);
+			list_add(&fe->list, &tmp_list);
+
+			type = fe->type;
+			if (count >= MAX_FLUSH_GROUP_COUNT)
+				break;
+		}
+
+		r = userspace_do_request(lc, lc->uuid, type,
+					 (char *)(group),
+					 count * sizeof(uint64_t),
+					 NULL, NULL);
+		if (r) {
+			/* Group send failed.  Attempt one-by-one. */
+			list_splice_init(&tmp_list, flush_list);
+			r = flush_one_by_one(lc, flush_list);
+			break;
+		}
+	}
+
+	/*
+	 * Must collect flush_entrys that were successfully processed
+	 * as a group so that they will be free'd by the caller.
+	 */
+	list_splice_init(&tmp_list, flush_list);
+
+	return r;
+}
+
 /*
  * userspace_flush
  *
@@ -382,30 +455,13 @@ static int userspace_flush(struct dm_dirty_log *log)
 	if (list_empty(&mark_list) && list_empty(&clear_list))
 		return 0;
 
-	/*
-	 * FIXME: Count up requests, group request types,
-	 * allocate memory to stick all requests in and
-	 * send to server in one go.  Failing the allocation,
-	 * do it one by one.
-	 */
-
-	list_for_each_entry(fe, &mark_list, list) {
-		r = userspace_do_request(lc, lc->uuid, fe->type,
-					 (char *)&fe->region,
-					 sizeof(fe->region),
-					 NULL, NULL);
-		if (r)
-			goto fail;
-	}
+	r = flush_by_group(lc, &mark_list);
+	if (r)
+		goto fail;
 
-	list_for_each_entry(fe, &clear_list, list) {
-		r = userspace_do_request(lc, lc->uuid, fe->type,
-					 (char *)&fe->region,
-					 sizeof(fe->region),
-					 NULL, NULL);
-		if (r)
-			goto fail;
-	}
+	r = flush_by_group(lc, &clear_list);
+	if (r)
+		goto fail;
 
 	r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
 				 NULL, 0, NULL, NULL);
-- 
cgit v1.2.3


From 86a54a4802df10d23ccd655e2083e812fe990243 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Thu, 13 Jan 2011 19:59:52 +0000
Subject: dm log userspace: add version number to comms

This patch adds a 'version' field to the 'dm_ulog_request'
structure.

The 'version' field is taken from a portion of the unused
'padding' field in the 'dm_ulog_request' structure.  This
was done to avoid changing the size of the structure and
possibly disrupting backwards compatibility.

The version number will help notify user-space daemons
when a change has been made to the kernel/userspace
log API.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log-userspace-base.c     | 6 ++++--
 drivers/md/dm-log-userspace-transfer.c | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 31e1687e7bf6..aa2e0c374ab3 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -12,6 +12,8 @@
 
 #include "dm-log-userspace-transfer.h"
 
+#define DM_LOG_USERSPACE_VSN "1.1.0"
+
 struct flush_entry {
 	int type;
 	region_t region;
@@ -765,7 +767,7 @@ static int __init userspace_dirty_log_init(void)
 		return r;
 	}
 
-	DMINFO("version 1.0.0 loaded");
+	DMINFO("version " DM_LOG_USERSPACE_VSN " loaded");
 	return 0;
 }
 
@@ -775,7 +777,7 @@ static void __exit userspace_dirty_log_exit(void)
 	dm_ulog_tfr_exit();
 	mempool_destroy(flush_entry_pool);
 
-	DMINFO("version 1.0.0 unloaded");
+	DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
 	return;
 }
 
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 075cbcf8a9f5..049eaf12aaab 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -198,6 +198,7 @@ resend:
 
 	memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - sizeof(struct cn_msg));
 	memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+	tfr->version = DM_ULOG_REQUEST_VERSION;
 	tfr->luid = luid;
 	tfr->seq = dm_ulog_seq++;
 
-- 
cgit v1.2.3


From 7dbcd137414f3877737802438926d6dba7906a9a Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 13 Jan 2011 19:59:52 +0000
Subject: dm crypt: simplify compatible table output

Rename cc->cipher_mode to cc->cipher_string and store the whole of the cipher
information so it can easily be printed when processing the DM_DEV_STATUS ioctl.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4c4408a2602f..9a896e1cb2ea 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -108,7 +108,7 @@ struct crypt_config {
 	struct workqueue_struct *crypt_queue;
 
 	char *cipher;
-	char *cipher_mode;
+	char *cipher_string;
 
 	struct crypt_iv_operations *iv_gen_ops;
 	union {
@@ -1030,7 +1030,7 @@ static void crypt_dtr(struct dm_target *ti)
 		dm_put_device(ti, cc->dev);
 
 	kzfree(cc->cipher);
-	kzfree(cc->cipher_mode);
+	kzfree(cc->cipher_string);
 
 	/* Must zero key material before freeing */
 	kzfree(cc);
@@ -1050,6 +1050,10 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 		return -EINVAL;
 	}
 
+	cc->cipher_string = kstrdup(cipher_in, GFP_KERNEL);
+	if (!cc->cipher_string)
+		goto bad_mem;
+
 	/*
 	 * Legacy dm-crypt cipher specification
 	 * cipher-mode-iv:ivopts
@@ -1061,12 +1065,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (!cc->cipher)
 		goto bad_mem;
 
-	if (tmp) {
-		cc->cipher_mode = kstrdup(tmp, GFP_KERNEL);
-		if (!cc->cipher_mode)
-			goto bad_mem;
-	}
-
 	chainmode = strsep(&tmp, "-");
 	ivopts = strsep(&tmp, "-");
 	ivmode = strsep(&ivopts, ":");
@@ -1074,10 +1072,11 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
-	/* Compatibility mode for old dm-crypt mappings */
+	/*
+	 * For compatibility with the original dm-crypt mapping format, if
+	 * only the cipher name is supplied, use cbc-plain.
+	 */
 	if (!chainmode || (!strcmp(chainmode, "plain") && !ivmode)) {
-		kfree(cc->cipher_mode);
-		cc->cipher_mode = kstrdup("cbc-plain", GFP_KERNEL);
 		chainmode = "cbc";
 		ivmode = "plain";
 	}
@@ -1307,10 +1306,7 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
 		break;
 
 	case STATUSTYPE_TABLE:
-		if (cc->cipher_mode)
-			DMEMIT("%s-%s ", cc->cipher, cc->cipher_mode);
-		else
-			DMEMIT("%s ", cc->cipher);
+		DMEMIT("%s ", cc->cipher_string);
 
 		if (cc->key_size > 0) {
 			if ((maxlen - sz) < ((cc->key_size << 1) + 1))
@@ -1422,7 +1418,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 7, 0},
+	.version = {1, 8, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
-- 
cgit v1.2.3


From c029772125594e31eb1a5ad9e0913724ed9891f2 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 13 Jan 2011 19:59:53 +0000
Subject: dm crypt: scale to multiple cpus

Currently dm-crypt does all the encryption work for a single dm-crypt
mapping in a single workqueue. This does not scale well when multiple
CPUs are submitting IO at a high rate. The single CPU running the single
thread cannot keep up with the encryption and encrypted IO performance
tanks.

This patch changes the crypto workqueue to be per CPU. This means
that as long as the IO submitter (or the interrupt target CPUs
for reads) runs on different CPUs the encryption work will be also
parallel.

To avoid a bottleneck on the IO worker I also changed those to be
per-CPU threads.

There is still some shared data, so I suspect some bouncing
cache lines. But I haven't done a detailed study on that yet.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 254 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 196 insertions(+), 58 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9a896e1cb2ea..50ae6ef83738 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -18,6 +18,7 @@
 #include <linux/crypto.h>
 #include <linux/workqueue.h>
 #include <linux/backing-dev.h>
+#include <linux/percpu.h>
 #include <asm/atomic.h>
 #include <linux/scatterlist.h>
 #include <asm/page.h>
@@ -77,7 +78,6 @@ struct crypt_iv_operations {
 };
 
 struct iv_essiv_private {
-	struct crypto_cipher *tfm;
 	struct crypto_hash *hash_tfm;
 	u8 *salt;
 };
@@ -91,6 +91,22 @@ struct iv_benbi_private {
  * and encrypts / decrypts at the same time.
  */
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
+
+/*
+ * Duplicated per-CPU state for cipher.
+ */
+struct crypt_cpu {
+	struct ablkcipher_request *req;
+	struct crypto_ablkcipher *tfm;
+
+	/* ESSIV: struct crypto_cipher *essiv_tfm */
+	void *iv_private;
+};
+
+/*
+ * The fields in here must be read only after initialization,
+ * changing state should be in crypt_cpu.
+ */
 struct crypt_config {
 	struct dm_dev *dev;
 	sector_t start;
@@ -118,6 +134,12 @@ struct crypt_config {
 	sector_t iv_offset;
 	unsigned int iv_size;
 
+	/*
+	 * Duplicated per cpu state. Access through
+	 * per_cpu_ptr() only.
+	 */
+	struct crypt_cpu __percpu *cpu;
+
 	/*
 	 * Layout of each crypto request:
 	 *
@@ -132,9 +154,7 @@ struct crypt_config {
 	 * correctly aligned.
 	 */
 	unsigned int dmreq_start;
-	struct ablkcipher_request *req;
 
-	struct crypto_ablkcipher *tfm;
 	unsigned long flags;
 	unsigned int key_size;
 	u8 key[0];
@@ -149,6 +169,19 @@ static struct kmem_cache *_crypt_io_pool;
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
 
+static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
+{
+	return this_cpu_ptr(cc->cpu);
+}
+
+/*
+ * Use this to access cipher attributes that are the same for each CPU.
+ */
+static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
+{
+	return __this_cpu_ptr(cc->cpu)->tfm;
+}
+
 /*
  * Different IV generation algorithms:
  *
@@ -195,7 +228,8 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 	struct hash_desc desc;
 	struct scatterlist sg;
-	int err;
+	struct crypto_cipher *essiv_tfm;
+	int err, cpu;
 
 	sg_init_one(&sg, cc->key, cc->key_size);
 	desc.tfm = essiv->hash_tfm;
@@ -205,8 +239,16 @@ static int crypt_iv_essiv_init(struct crypt_config *cc)
 	if (err)
 		return err;
 
-	return crypto_cipher_setkey(essiv->tfm, essiv->salt,
+	for_each_possible_cpu(cpu) {
+		essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private,
+
+		err = crypto_cipher_setkey(essiv_tfm, essiv->salt,
 				    crypto_hash_digestsize(essiv->hash_tfm));
+		if (err)
+			return err;
+	}
+
+	return 0;
 }
 
 /* Wipe salt and reset key derived from volume key */
@@ -214,24 +256,76 @@ static int crypt_iv_essiv_wipe(struct crypt_config *cc)
 {
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 	unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
+	struct crypto_cipher *essiv_tfm;
+	int cpu, r, err = 0;
 
 	memset(essiv->salt, 0, salt_size);
 
-	return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
+	for_each_possible_cpu(cpu) {
+		essiv_tfm = per_cpu_ptr(cc->cpu, cpu)->iv_private;
+		r = crypto_cipher_setkey(essiv_tfm, essiv->salt, salt_size);
+		if (r)
+			err = r;
+	}
+
+	return err;
+}
+
+/* Set up per cpu cipher state */
+static struct crypto_cipher *setup_essiv_cpu(struct crypt_config *cc,
+					     struct dm_target *ti,
+					     u8 *salt, unsigned saltsize)
+{
+	struct crypto_cipher *essiv_tfm;
+	int err;
+
+	/* Setup the essiv_tfm with the given salt */
+	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(essiv_tfm)) {
+		ti->error = "Error allocating crypto tfm for ESSIV";
+		return essiv_tfm;
+	}
+
+	if (crypto_cipher_blocksize(essiv_tfm) !=
+	    crypto_ablkcipher_ivsize(any_tfm(cc))) {
+		ti->error = "Block size of ESSIV cipher does "
+			    "not match IV size of block cipher";
+		crypto_free_cipher(essiv_tfm);
+		return ERR_PTR(-EINVAL);
+	}
+
+	err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
+	if (err) {
+		ti->error = "Failed to set key for ESSIV cipher";
+		crypto_free_cipher(essiv_tfm);
+		return ERR_PTR(err);
+	}
+
+	return essiv_tfm;
 }
 
 static void crypt_iv_essiv_dtr(struct crypt_config *cc)
 {
+	int cpu;
+	struct crypt_cpu *cpu_cc;
+	struct crypto_cipher *essiv_tfm;
 	struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
 
-	crypto_free_cipher(essiv->tfm);
-	essiv->tfm = NULL;
-
 	crypto_free_hash(essiv->hash_tfm);
 	essiv->hash_tfm = NULL;
 
 	kzfree(essiv->salt);
 	essiv->salt = NULL;
+
+	for_each_possible_cpu(cpu) {
+		cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+		essiv_tfm = cpu_cc->iv_private;
+
+		if (essiv_tfm)
+			crypto_free_cipher(essiv_tfm);
+
+		cpu_cc->iv_private = NULL;
+	}
 }
 
 static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
@@ -240,7 +334,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 	struct crypto_cipher *essiv_tfm = NULL;
 	struct crypto_hash *hash_tfm = NULL;
 	u8 *salt = NULL;
-	int err;
+	int err, cpu;
 
 	if (!opts) {
 		ti->error = "Digest algorithm missing for ESSIV mode";
@@ -262,30 +356,22 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
 		goto bad;
 	}
 
-	/* Allocate essiv_tfm */
-	essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR(essiv_tfm)) {
-		ti->error = "Error allocating crypto tfm for ESSIV";
-		err = PTR_ERR(essiv_tfm);
-		goto bad;
-	}
-	if (crypto_cipher_blocksize(essiv_tfm) !=
-	    crypto_ablkcipher_ivsize(cc->tfm)) {
-		ti->error = "Block size of ESSIV cipher does "
-			    "not match IV size of block cipher";
-		err = -EINVAL;
-		goto bad;
-	}
-
 	cc->iv_gen_private.essiv.salt = salt;
-	cc->iv_gen_private.essiv.tfm = essiv_tfm;
 	cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
 
+	for_each_possible_cpu(cpu) {
+		essiv_tfm = setup_essiv_cpu(cc, ti, salt,
+					crypto_hash_digestsize(hash_tfm));
+		if (IS_ERR(essiv_tfm)) {
+			crypt_iv_essiv_dtr(cc);
+			return PTR_ERR(essiv_tfm);
+		}
+		per_cpu_ptr(cc->cpu, cpu)->iv_private = essiv_tfm;
+	}
+
 	return 0;
 
 bad:
-	if (essiv_tfm && !IS_ERR(essiv_tfm))
-		crypto_free_cipher(essiv_tfm);
 	if (hash_tfm && !IS_ERR(hash_tfm))
 		crypto_free_hash(hash_tfm);
 	kfree(salt);
@@ -294,16 +380,19 @@ bad:
 
 static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
 {
+	struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
+
 	memset(iv, 0, cc->iv_size);
 	*(u64 *)iv = cpu_to_le64(sector);
-	crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
+	crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
+
 	return 0;
 }
 
 static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
 			      const char *opts)
 {
-	unsigned bs = crypto_ablkcipher_blocksize(cc->tfm);
+	unsigned bs = crypto_ablkcipher_blocksize(any_tfm(cc));
 	int log = ilog2(bs);
 
 	/* we need to calculate how far we must shift the sector count
@@ -412,7 +501,7 @@ static int crypt_convert_block(struct crypt_config *cc,
 
 	dmreq = dmreq_of_req(cc, req);
 	iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
-			 crypto_ablkcipher_alignmask(cc->tfm) + 1);
+			 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
 
 	dmreq->ctx = ctx;
 	sg_init_table(&dmreq->sg_in, 1);
@@ -454,16 +543,19 @@ static int crypt_convert_block(struct crypt_config *cc,
 
 static void kcryptd_async_done(struct crypto_async_request *async_req,
 			       int error);
+
 static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
-	if (!cc->req)
-		cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
-	ablkcipher_request_set_tfm(cc->req, cc->tfm);
-	ablkcipher_request_set_callback(cc->req, CRYPTO_TFM_REQ_MAY_BACKLOG |
-					CRYPTO_TFM_REQ_MAY_SLEEP,
-					kcryptd_async_done,
-					dmreq_of_req(cc, cc->req));
+	struct crypt_cpu *this_cc = this_crypt_config(cc);
+
+	if (!this_cc->req)
+		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+
+	ablkcipher_request_set_tfm(this_cc->req, this_cc->tfm);
+	ablkcipher_request_set_callback(this_cc->req,
+	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
 }
 
 /*
@@ -472,6 +564,7 @@ static void crypt_alloc_req(struct crypt_config *cc,
 static int crypt_convert(struct crypt_config *cc,
 			 struct convert_context *ctx)
 {
+	struct crypt_cpu *this_cc = this_crypt_config(cc);
 	int r;
 
 	atomic_set(&ctx->pending, 1);
@@ -483,7 +576,7 @@ static int crypt_convert(struct crypt_config *cc,
 
 		atomic_inc(&ctx->pending);
 
-		r = crypt_convert_block(cc, ctx, cc->req);
+		r = crypt_convert_block(cc, ctx, this_cc->req);
 
 		switch (r) {
 		/* async */
@@ -492,7 +585,7 @@ static int crypt_convert(struct crypt_config *cc,
 			INIT_COMPLETION(ctx->restart);
 			/* fall through*/
 		case -EINPROGRESS:
-			cc->req = NULL;
+			this_cc->req = NULL;
 			ctx->sector++;
 			continue;
 
@@ -651,6 +744,9 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
  * They must be separated as otherwise the final stages could be
  * starved by new requests which can block in the first stages due
  * to memory allocation.
+ *
+ * The work is done per CPU global for all dm-crypt instances.
+ * They should not depend on each other and do not block.
  */
 static void crypt_endio(struct bio *clone, int error)
 {
@@ -971,6 +1067,20 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
 	}
 }
 
+static int crypt_setkey_allcpus(struct crypt_config *cc)
+{
+	int cpu, err = 0, r;
+
+	for_each_possible_cpu(cpu) {
+		r = crypto_ablkcipher_setkey(per_cpu_ptr(cc->cpu, cpu)->tfm,
+					       cc->key, cc->key_size);
+		if (r)
+			err = r;
+	}
+
+	return err;
+}
+
 static int crypt_set_key(struct crypt_config *cc, char *key)
 {
 	/* The key size may not be changed. */
@@ -986,19 +1096,22 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
 
 	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
-	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+	return crypt_setkey_allcpus(cc);
 }
 
 static int crypt_wipe_key(struct crypt_config *cc)
 {
 	clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 	memset(&cc->key, 0, cc->key_size * sizeof(u8));
-	return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
+
+	return crypt_setkey_allcpus(cc);
 }
 
 static void crypt_dtr(struct dm_target *ti)
 {
 	struct crypt_config *cc = ti->private;
+	struct crypt_cpu *cpu_cc;
+	int cpu;
 
 	ti->private = NULL;
 
@@ -1010,6 +1123,15 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->crypt_queue)
 		destroy_workqueue(cc->crypt_queue);
 
+	if (cc->cpu)
+		for_each_possible_cpu(cpu) {
+			cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+			if (cpu_cc->req)
+				mempool_free(cpu_cc->req, cc->req_pool);
+			if (cpu_cc->tfm)
+				crypto_free_ablkcipher(cpu_cc->tfm);
+		}
+
 	if (cc->bs)
 		bioset_free(cc->bs);
 
@@ -1023,12 +1145,12 @@ static void crypt_dtr(struct dm_target *ti)
 	if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
 		cc->iv_gen_ops->dtr(cc);
 
-	if (cc->tfm && !IS_ERR(cc->tfm))
-		crypto_free_ablkcipher(cc->tfm);
-
 	if (cc->dev)
 		dm_put_device(ti, cc->dev);
 
+	if (cc->cpu)
+		free_percpu(cc->cpu);
+
 	kzfree(cc->cipher);
 	kzfree(cc->cipher_string);
 
@@ -1040,9 +1162,10 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 			    char *cipher_in, char *key)
 {
 	struct crypt_config *cc = ti->private;
+	struct crypto_ablkcipher *tfm;
 	char *tmp, *cipher, *chainmode, *ivmode, *ivopts;
 	char *cipher_api = NULL;
-	int ret = -EINVAL;
+	int cpu, ret = -EINVAL;
 
 	/* Convert to crypto api definition? */
 	if (strchr(cipher_in, '(')) {
@@ -1072,6 +1195,12 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	if (tmp)
 		DMWARN("Ignoring unexpected additional cipher options");
 
+	cc->cpu = alloc_percpu(struct crypt_cpu);
+	if (!cc->cpu) {
+		ti->error = "Cannot allocate per cpu state";
+		goto bad_mem;
+	}
+
 	/*
 	 * For compatibility with the original dm-crypt mapping format, if
 	 * only the cipher name is supplied, use cbc-plain.
@@ -1098,11 +1227,14 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	}
 
 	/* Allocate cipher */
-	cc->tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
-	if (IS_ERR(cc->tfm)) {
-		ret = PTR_ERR(cc->tfm);
-		ti->error = "Error allocating crypto tfm";
-		goto bad;
+	for_each_possible_cpu(cpu) {
+		tfm = crypto_alloc_ablkcipher(cipher_api, 0, 0);
+		if (IS_ERR(tfm)) {
+			ret = PTR_ERR(tfm);
+			ti->error = "Error allocating crypto tfm";
+			goto bad;
+		}
+		per_cpu_ptr(cc->cpu, cpu)->tfm = tfm;
 	}
 
 	/* Initialize and set key */
@@ -1113,7 +1245,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
 	}
 
 	/* Initialize IV */
-	cc->iv_size = crypto_ablkcipher_ivsize(cc->tfm);
+	cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
 	if (cc->iv_size)
 		/* at least a 64 bit sector number should fit in our buffer */
 		cc->iv_size = max(cc->iv_size,
@@ -1208,9 +1340,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	cc->dmreq_start = sizeof(struct ablkcipher_request);
-	cc->dmreq_start += crypto_ablkcipher_reqsize(cc->tfm);
+	cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
 	cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
-	cc->dmreq_start += crypto_ablkcipher_alignmask(cc->tfm) &
+	cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
 			   ~(crypto_tfm_ctx_alignment() - 1);
 
 	cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
@@ -1219,7 +1351,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		ti->error = "Cannot allocate crypt request mempool";
 		goto bad;
 	}
-	cc->req = NULL;
 
 	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
 	if (!cc->page_pool) {
@@ -1252,13 +1383,20 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	cc->start = tmpll;
 
 	ret = -ENOMEM;
-	cc->io_queue = create_singlethread_workqueue("kcryptd_io");
+	cc->io_queue = alloc_workqueue("kcryptd_io",
+				       WQ_NON_REENTRANT|
+				       WQ_MEM_RECLAIM,
+				       1);
 	if (!cc->io_queue) {
 		ti->error = "Couldn't create kcryptd io queue";
 		goto bad;
 	}
 
-	cc->crypt_queue = create_singlethread_workqueue("kcryptd");
+	cc->crypt_queue = alloc_workqueue("kcryptd",
+					  WQ_NON_REENTRANT|
+					  WQ_CPU_INTENSIVE|
+					  WQ_MEM_RECLAIM,
+					  1);
 	if (!cc->crypt_queue) {
 		ti->error = "Couldn't create kcryptd queue";
 		goto bad;
@@ -1418,7 +1556,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version = {1, 8, 0},
+	.version = {1, 9, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
-- 
cgit v1.2.3


From 20c82538e4f5ede51bc2b4795bc6e5cae772796d Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 13 Jan 2011 19:59:53 +0000
Subject: dm crypt: use io thread for reads only if mempool exhausted

If there is enough memory, code can directly submit bio
instead queing this operation in separate thread.

Try to alloc bio clone with GFP_NOWAIT and only if it
fails use separate queue (map function cannot block here).

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 50ae6ef83738..dc4403bcc6a0 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -787,26 +787,30 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 	clone->bi_destructor = dm_crypt_bio_destructor;
 }
 
-static void kcryptd_io_read(struct dm_crypt_io *io)
+static void kcryptd_unplug(struct crypt_config *cc)
+{
+	blk_unplug(bdev_get_queue(cc->dev->bdev));
+}
+
+static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
 	struct crypt_config *cc = io->target->private;
 	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
 
-	crypt_inc_pending(io);
-
 	/*
 	 * The block layer might modify the bvec array, so always
 	 * copy the required bvecs because we need the original
 	 * one in order to decrypt the whole bio data *afterwards*.
 	 */
-	clone = bio_alloc_bioset(GFP_NOIO, bio_segments(base_bio), cc->bs);
-	if (unlikely(!clone)) {
-		io->error = -ENOMEM;
-		crypt_dec_pending(io);
-		return;
+	clone = bio_alloc_bioset(gfp, bio_segments(base_bio), cc->bs);
+	if (!clone) {
+		kcryptd_unplug(cc);
+		return 1;
 	}
 
+	crypt_inc_pending(io);
+
 	clone_init(io, clone);
 	clone->bi_idx = 0;
 	clone->bi_vcnt = bio_segments(base_bio);
@@ -816,6 +820,7 @@ static void kcryptd_io_read(struct dm_crypt_io *io)
 	       sizeof(struct bio_vec) * clone->bi_vcnt);
 
 	generic_make_request(clone);
+	return 0;
 }
 
 static void kcryptd_io_write(struct dm_crypt_io *io)
@@ -828,9 +833,12 @@ static void kcryptd_io(struct work_struct *work)
 {
 	struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work);
 
-	if (bio_data_dir(io->base_bio) == READ)
-		kcryptd_io_read(io);
-	else
+	if (bio_data_dir(io->base_bio) == READ) {
+		crypt_inc_pending(io);
+		if (kcryptd_io_read(io, GFP_NOIO))
+			io->error = -ENOMEM;
+		crypt_dec_pending(io);
+	} else
 		kcryptd_io_write(io);
 }
 
@@ -1424,9 +1432,10 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 
 	io = crypt_io_alloc(ti, bio, dm_target_offset(ti, bio->bi_sector));
 
-	if (bio_data_dir(io->base_bio) == READ)
-		kcryptd_queue_io(io);
-	else
+	if (bio_data_dir(io->base_bio) == READ) {
+		if (kcryptd_io_read(io, GFP_NOWAIT))
+			kcryptd_queue_io(io);
+	} else
 		kcryptd_queue_crypt(io);
 
 	return DM_MAPIO_SUBMITTED;
-- 
cgit v1.2.3


From 2dc5327d3acb3340ab6fa3981401b076b78a51f4 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 13 Jan 2011 19:59:54 +0000
Subject: dm crypt: add post iv call to iv generator

IV (initialisation vector) can in principle depend not only
on sector but also on plaintext data (or other attributes).

Change IV generator interface to work directly with dmreq
structure to allow such dependence in generator.

Also add post() function which is called after the crypto
operation.

This allows tricky modification of decrypted data or IV
internals.

In asynchronous mode the post() can be called after
ctx->sector count was increased so it is needed
to add iv_sector copy directly to dmreq structure.
(N.B. dmreq always include only one sector in scatterlists)

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 48 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 35 insertions(+), 13 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index dc4403bcc6a0..e0ebe685be6a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -64,6 +64,7 @@ struct dm_crypt_request {
 	struct convert_context *ctx;
 	struct scatterlist sg_in;
 	struct scatterlist sg_out;
+	sector_t iv_sector;
 };
 
 struct crypt_config;
@@ -74,7 +75,10 @@ struct crypt_iv_operations {
 	void (*dtr)(struct crypt_config *cc);
 	int (*init)(struct crypt_config *cc);
 	int (*wipe)(struct crypt_config *cc);
-	int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
+	int (*generator)(struct crypt_config *cc, u8 *iv,
+			 struct dm_crypt_request *dmreq);
+	int (*post)(struct crypt_config *cc, u8 *iv,
+		    struct dm_crypt_request *dmreq);
 };
 
 struct iv_essiv_private {
@@ -168,6 +172,7 @@ static struct kmem_cache *_crypt_io_pool;
 
 static void clone_init(struct dm_crypt_io *, struct bio *);
 static void kcryptd_queue_crypt(struct dm_crypt_io *io);
+static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
 
 static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
 {
@@ -205,19 +210,20 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
  * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
  */
 
-static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
+			      struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
+	*(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
 
 	return 0;
 }
 
 static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
-				sector_t sector)
+				struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(sector);
+	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
 
 	return 0;
 }
@@ -378,12 +384,13 @@ bad:
 	return err;
 }
 
-static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
+			      struct dm_crypt_request *dmreq)
 {
 	struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
 
 	memset(iv, 0, cc->iv_size);
-	*(u64 *)iv = cpu_to_le64(sector);
+	*(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
 	crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
 
 	return 0;
@@ -417,19 +424,21 @@ static void crypt_iv_benbi_dtr(struct crypt_config *cc)
 {
 }
 
-static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv,
+			      struct dm_crypt_request *dmreq)
 {
 	__be64 val;
 
 	memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
 
-	val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
+	val = cpu_to_be64(((u64)dmreq->iv_sector << cc->iv_gen_private.benbi.shift) + 1);
 	put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
 
 	return 0;
 }
 
-static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
+static int crypt_iv_null_gen(struct crypt_config *cc, u8 *iv,
+			     struct dm_crypt_request *dmreq)
 {
 	memset(iv, 0, cc->iv_size);
 
@@ -489,6 +498,13 @@ static struct ablkcipher_request *req_of_dmreq(struct crypt_config *cc,
 	return (struct ablkcipher_request *)((char *)dmreq - cc->dmreq_start);
 }
 
+static u8 *iv_of_dmreq(struct crypt_config *cc,
+		       struct dm_crypt_request *dmreq)
+{
+	return (u8 *)ALIGN((unsigned long)(dmreq + 1),
+		crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
+}
+
 static int crypt_convert_block(struct crypt_config *cc,
 			       struct convert_context *ctx,
 			       struct ablkcipher_request *req)
@@ -500,9 +516,9 @@ static int crypt_convert_block(struct crypt_config *cc,
 	int r = 0;
 
 	dmreq = dmreq_of_req(cc, req);
-	iv = (u8 *)ALIGN((unsigned long)(dmreq + 1),
-			 crypto_ablkcipher_alignmask(any_tfm(cc)) + 1);
+	iv = iv_of_dmreq(cc, dmreq);
 
+	dmreq->iv_sector = ctx->sector;
 	dmreq->ctx = ctx;
 	sg_init_table(&dmreq->sg_in, 1);
 	sg_set_page(&dmreq->sg_in, bv_in->bv_page, 1 << SECTOR_SHIFT,
@@ -525,7 +541,7 @@ static int crypt_convert_block(struct crypt_config *cc,
 	}
 
 	if (cc->iv_gen_ops) {
-		r = cc->iv_gen_ops->generator(cc, iv, ctx->sector);
+		r = cc->iv_gen_ops->generator(cc, iv, dmreq);
 		if (r < 0)
 			return r;
 	}
@@ -538,6 +554,9 @@ static int crypt_convert_block(struct crypt_config *cc,
 	else
 		r = crypto_ablkcipher_decrypt(req);
 
+	if (!r && cc->iv_gen_ops && cc->iv_gen_ops->post)
+		r = cc->iv_gen_ops->post(cc, iv, dmreq);
+
 	return r;
 }
 
@@ -1005,6 +1024,9 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 		return;
 	}
 
+	if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
+		error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+
 	mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
 
 	if (!atomic_dec_and_test(&ctx->pending))
-- 
cgit v1.2.3


From d1f9642381847e2b94caa34c3533211cf36ffcf4 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Thu, 13 Jan 2011 19:59:54 +0000
Subject: dm crypt: add multi key capability

This patch adds generic multikey handling to be used
in following patch for Loop-AES mode compatibility.

This patch extends mapping table to optional keycount and
implements generic multi-key capability.

With more keys defined the <key> string is divided into
several <keycount> sections and these are used for tfms.

The tfm is used according to sector offset
(sector 0->tfm[0], sector 1->tfm[1], sector N->tfm[N modulo keycount])
(only power of two values supported for keycount here).

Because of tfms per-cpu allocation, this mode can be take
a lot of memory on large smp systems.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Cc: Max Vozeler <max@hinterhof.net>
---
 drivers/md/dm-crypt.c | 85 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 64 insertions(+), 21 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index e0ebe685be6a..b8b9267c4dbb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -101,10 +101,9 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
  */
 struct crypt_cpu {
 	struct ablkcipher_request *req;
-	struct crypto_ablkcipher *tfm;
-
 	/* ESSIV: struct crypto_cipher *essiv_tfm */
 	void *iv_private;
+	struct crypto_ablkcipher *tfms[0];
 };
 
 /*
@@ -143,6 +142,7 @@ struct crypt_config {
 	 * per_cpu_ptr() only.
 	 */
 	struct crypt_cpu __percpu *cpu;
+	unsigned tfms_count;
 
 	/*
 	 * Layout of each crypto request:
@@ -161,6 +161,7 @@ struct crypt_config {
 
 	unsigned long flags;
 	unsigned int key_size;
+	unsigned int key_parts;
 	u8 key[0];
 };
 
@@ -184,7 +185,7 @@ static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
  */
 static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
 {
-	return __this_cpu_ptr(cc->cpu)->tfm;
+	return __this_cpu_ptr(cc->cpu)->tfms[0];
 }
 
 /*
@@ -567,11 +568,12 @@ static void crypt_alloc_req(struct crypt_config *cc,
 			    struct convert_context *ctx)
 {
 	struct crypt_cpu *this_cc = this_crypt_config(cc);
+	unsigned key_index = ctx->sector & (cc->tfms_count - 1);
 
 	if (!this_cc->req)
 		this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
-	ablkcipher_request_set_tfm(this_cc->req, this_cc->tfm);
+	ablkcipher_request_set_tfm(this_cc->req, this_cc->tfms[key_index]);
 	ablkcipher_request_set_callback(this_cc->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 	    kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
@@ -1097,15 +1099,48 @@ static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
 	}
 }
 
+static void crypt_free_tfms(struct crypt_config *cc, int cpu)
+{
+	struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+	unsigned i;
+
+	for (i = 0; i < cc->tfms_count; i++)
+		if (cpu_cc->tfms[i] && !IS_ERR(cpu_cc->tfms[i])) {
+			crypto_free_ablkcipher(cpu_cc->tfms[i]);
+			cpu_cc->tfms[i] = NULL;
+		}
+}
+
+static int crypt_alloc_tfms(struct crypt_config *cc, int cpu, char *ciphermode)
+{
+	struct crypt_cpu *cpu_cc = per_cpu_ptr(cc->cpu, cpu);
+	unsigned i;
+	int err;
+
+	for (i = 0; i < cc->tfms_count; i++) {
+		cpu_cc->tfms[i] = crypto_alloc_ablkcipher(ciphermode, 0, 0);
+		if (IS_ERR(cpu_cc->tfms[i])) {
+			err = PTR_ERR(cpu_cc->tfms[i]);
+			crypt_free_tfms(cc, cpu);
+			return err;
+		}
+	}
+
+	return 0;
+}
+