From dbaf971c9cdf10843071a60dcafc1aaab3162354 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 26 Nov 2019 10:08:29 -0500
Subject: dm mpath: remove harmful bio-based optimization

Removes the branching for edge-case where no SCSI device handler
exists.  The __map_bio_fast() method was far too limited, by only
selecting a new pathgroup or path IFF there was a path failure, fix this
by eliminating it in favor of __map_bio().  __map_bio()'s extra SCSI
device handler specific MPATHF_PG_INIT_REQUIRED test is not in the fast
path anyway.

This change restores full path selector functionality for bio-based
configurations that don't have a SCSI device handler.  But it should be
noted that the path selectors do have an impact on performance for
certain networks that are extremely fast (and don't require frequent
switching).

Fixes: 8d47e65948dd ("dm mpath: remove unnecessary NVMe branching in favor of scsi_dh checks")
Cc: stable@vger.kernel.org
Reported-by: Drew Hastings <dhastings@crucialwebhost.com>
Suggested-by: Martin Wilck <mwilck@suse.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c | 37 +------------------------------------
 1 file changed, 1 insertion(+), 36 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index dbcc1e41cd57..e0c32793c248 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -599,45 +599,10 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
 	return pgpath;
 }
 
-static struct pgpath *__map_bio_fast(struct multipath *m, struct bio *bio)
-{
-	struct pgpath *pgpath;
-	unsigned long flags;
-
-	/* Do we need to select a new pgpath? */
-	/*
-	 * FIXME: currently only switching path if no path (due to failure, etc)
-	 * - which negates the point of using a path selector
-	 */
-	pgpath = READ_ONCE(m->current_pgpath);
-	if (!pgpath)
-		pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
-
-	if (!pgpath) {
-		if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-			/* Queue for the daemon to resubmit */
-			spin_lock_irqsave(&m->lock, flags);
-			bio_list_add(&m->queued_bios, bio);
-			spin_unlock_irqrestore(&m->lock, flags);
-			queue_work(kmultipathd, &m->process_queued_bios);
-
-			return ERR_PTR(-EAGAIN);
-		}
-		return NULL;
-	}
-
-	return pgpath;
-}
-
 static int __multipath_map_bio(struct multipath *m, struct bio *bio,
 			       struct dm_mpath_io *mpio)
 {
-	struct pgpath *pgpath;
-
-	if (!m->hw_handler_name)
-		pgpath = __map_bio_fast(m, bio);
-	else
-		pgpath = __map_bio(m, bio);
+	struct pgpath *pgpath = __map_bio(m, bio);
 
 	if (IS_ERR(pgpath))
 		return DM_MAPIO_SUBMITTED;
-- 
cgit v1.2.3


From 474e559567fa631dea8fb8407ab1b6090c903755 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Tue, 3 Dec 2019 19:42:58 +0800
Subject: dm btree: increase rebalance threshold in __rebalance2()

We got the following warnings from thin_check during thin-pool setup:

  $ thin_check /dev/vdb
  examining superblock
  examining devices tree
    missing devices: [1, 84]
      too few entries in btree_node: 41, expected at least 42 (block 138, max_entries = 126)
  examining mapping tree

The symptom is that the number of entries in one node of the details_info
tree is less than (max_entries / 3). It can be easily reproduced with the
following procedure:

  $ new a thin pool
  $ presume the max entries of details_info tree is 126
  $ new 127 thin devices (e.g. 1~127) to make the root node being full
    and then split
  $ remove the first 43 (e.g. 1~43) thin devices to make the children
    rebalance repeatedly
  $ stop the thin pool
  $ thin_check

The root cause is that the B-tree removal procedure in __rebalance2()
doesn't guarantee the invariant: the minimal number of entries in
non-root node should be >= (max_entries / 3).

Simply fix the problem by increasing the rebalance threshold to
make sure the number of entries in each child will be greater
than or equal to (max_entries / 3 + 1), so no matter which
child is used for removal, the number will still be valid.

Cc: stable@vger.kernel.org
Signed-off-by: Hou Tao <houtao1@huawei.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/persistent-data/dm-btree-remove.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 21ea537bd55e..eff04fa23dfa 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -203,7 +203,13 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
 	struct btree_node *right = r->n;
 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
-	unsigned threshold = 2 * merge_threshold(left) + 1;
+	/*
+	 * Ensure the number of entries in each child will be greater
+	 * than or equal to (max_entries / 3 + 1), so no matter which
+	 * child is used for removal, the number will still be not
+	 * less than (max_entries / 3).
+	 */
+	unsigned int threshold = 2 * (merge_threshold(left) + 1);
 
 	if (nr_left + nr_right < threshold) {
 		/*
-- 
cgit v1.2.3


From e6a505f3f9fae572fb3ab3bc486e755ac9cef32c Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Wed, 4 Dec 2019 16:06:52 +0200
Subject: dm clone metadata: Track exact changes per transaction

Extend struct dirty_map with a second bitmap which tracks the exact
regions that were hydrated during the current metadata transaction.

Moreover, fix __flush_dmap() to only commit the metadata of the regions
that were hydrated during the current transaction.

This is required by the following commits to fix a data corruption bug.

Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-clone-metadata.c | 90 +++++++++++++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 28 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index 08c552e5e41b..ee870a425ab8 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -67,23 +67,34 @@ struct superblock_disk {
  * To save constantly doing look ups on disk we keep an in core copy of the
  * on-disk bitmap, the region_map.
  *
- * To further reduce metadata I/O overhead we use a second bitmap, the dmap
- * (dirty bitmap), which tracks the dirty words, i.e. longs, of the region_map.
+ * In order to track which regions are hydrated during a metadata transaction,
+ * we use a second set of bitmaps, the dmap (dirty bitmap), which includes two
+ * bitmaps, namely dirty_regions and dirty_words. The dirty_regions bitmap
+ * tracks the regions that got hydrated during the current metadata
+ * transaction. The dirty_words bitmap tracks the dirty words, i.e. longs, of
+ * the dirty_regions bitmap.
+ *
+ * This allows us to precisely track the regions that were hydrated during the
+ * current metadata transaction and update the metadata accordingly, when we
+ * commit the current transaction. This is important because dm-clone should
+ * only commit the metadata of regions that were properly flushed to the
+ * destination device beforehand. Otherwise, in case of a crash, we could end
+ * up with a corrupted dm-clone device.
  *
  * When a region finishes hydrating dm-clone calls
  * dm_clone_set_region_hydrated(), or for discard requests
  * dm_clone_cond_set_range(), which sets the corresponding bits in region_map
  * and dmap.
  *
- * During a metadata commit we scan the dmap for dirty region_map words (longs)
- * and update accordingly the on-disk metadata. Thus, we don't have to flush to
- * disk the whole region_map. We can just flush the dirty region_map words.
+ * During a metadata commit we scan dmap->dirty_words and dmap->dirty_regions
+ * and update the on-disk metadata accordingly. Thus, we don't have to flush to
+ * disk the whole region_map. We can just flush the dirty region_map bits.
  *
- * We use a dirty bitmap, which is smaller than the original region_map, to
- * reduce the amount of memory accesses during a metadata commit. As dm-bitset
- * accesses the on-disk bitmap in 64-bit word granularity, there is no
- * significant benefit in tracking the dirty region_map bits with a smaller
- * granularity.
+ * We use the helper dmap->dirty_words bitmap, which is smaller than the
+ * original region_map, to reduce the amount of memory accesses during a
+ * metadata commit. Moreover, as dm-bitset also accesses the on-disk bitmap in
+ * 64-bit word granularity, the dirty_words bitmap helps us avoid useless disk
+ * accesses.
  *
  * We could update directly the on-disk bitmap, when dm-clone calls either
  * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), buts this
@@ -92,12 +103,13 @@ struct superblock_disk {
  * e.g., in a hooked overwrite bio's completion routine, and further reduce the
  * I/O completion latency.
  *
- * We maintain two dirty bitmaps. During a metadata commit we atomically swap
- * the currently used dmap with the unused one. This allows the metadata update
- * functions to run concurrently with an ongoing commit.
+ * We maintain two dirty bitmap sets. During a metadata commit we atomically
+ * swap the currently used dmap with the unused one. This allows the metadata
+ * update functions to run concurrently with an ongoing commit.
  */
 struct dirty_map {
 	unsigned long *dirty_words;
+	unsigned long *dirty_regions;
 	unsigned int changed;
 };
 
@@ -461,22 +473,40 @@ static size_t bitmap_size(unsigned long nr_bits)
 	return BITS_TO_LONGS(nr_bits) * sizeof(long);
 }
 
-static int dirty_map_init(struct dm_clone_metadata *cmd)
+static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words,
+			    unsigned long nr_regions)
 {
-	cmd->dmap[0].changed = 0;
-	cmd->dmap[0].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
+	dmap->changed = 0;
 
-	if (!cmd->dmap[0].dirty_words) {
-		DMERR("Failed to allocate dirty bitmap");
+	dmap->dirty_words = kvzalloc(bitmap_size(nr_words), GFP_KERNEL);
+	if (!dmap->dirty_words)
+		return -ENOMEM;
+
+	dmap->dirty_regions = kvzalloc(bitmap_size(nr_regions), GFP_KERNEL);
+	if (!dmap->dirty_regions) {
+		kvfree(dmap->dirty_words);
 		return -ENOMEM;
 	}
 
-	cmd->dmap[1].changed = 0;
-	cmd->dmap[1].dirty_words = kvzalloc(bitmap_size(cmd->nr_words), GFP_KERNEL);
+	return 0;
+}
 
-	if (!cmd->dmap[1].dirty_words) {
+static void __dirty_map_exit(struct dirty_map *dmap)
+{
+	kvfree(dmap->dirty_words);
+	kvfree(dmap->dirty_regions);
+}
+
+static int dirty_map_init(struct dm_clone_metadata *cmd)
+{
+	if (__dirty_map_init(&cmd->dmap[0], cmd->nr_words, cmd->nr_regions)) {
 		DMERR("Failed to allocate dirty bitmap");
-		kvfree(cmd->dmap[0].dirty_words);
+		return -ENOMEM;
+	}
+
+	if (__dirty_map_init(&cmd->dmap[1], cmd->nr_words, cmd->nr_regions)) {
+		DMERR("Failed to allocate dirty bitmap");
+		__dirty_map_exit(&cmd->dmap[0]);
 		return -ENOMEM;
 	}
 
@@ -487,8 +517,8 @@ static int dirty_map_init(struct dm_clone_metadata *cmd)
 
 static void dirty_map_exit(struct dm_clone_metadata *cmd)
 {
-	kvfree(cmd->dmap[0].dirty_words);
-	kvfree(cmd->dmap[1].dirty_words);
+	__dirty_map_exit(&cmd->dmap[0]);
+	__dirty_map_exit(&cmd->dmap[1]);
 }
 
 static int __load_bitset_in_core(struct dm_clone_metadata *cmd)
@@ -633,21 +663,23 @@ unsigned long dm_clone_find_next_unhydrated_region(struct dm_clone_metadata *cmd
 	return find_next_zero_bit(cmd->region_map, cmd->nr_regions, start);
 }
 
-static int __update_metadata_word(struct dm_clone_metadata *cmd, unsigned long word)
+static int __update_metadata_word(struct dm_clone_metadata *cmd,
+				  unsigned long *dirty_regions,
+				  unsigned long word)
 {
 	int r;
 	unsigned long index = word * BITS_PER_LONG;
 	unsigned long max_index = min(cmd->nr_regions, (word + 1) * BITS_PER_LONG);
 
 	while (index < max_index) {
-		if (test_bit(index, cmd->region_map)) {
+		if (test_bit(index, dirty_regions)) {
 			r = dm_bitset_set_bit(&cmd->bitset_info, cmd->bitset_root,
 					      index, &cmd->bitset_root);
-
 			if (r) {
 				DMERR("dm_bitset_set_bit failed");
 				return r;
 			}
+			__clear_bit(index, dirty_regions);
 		}
 		index++;
 	}
@@ -721,7 +753,7 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
 		if (word == cmd->nr_words)
 			break;
 
-		r = __update_metadata_word(cmd, word);
+		r = __update_metadata_word(cmd, dmap->dirty_regions, word);
 
 		if (r)
 			return r;
@@ -802,6 +834,7 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re
 	dmap = cmd->current_dmap;
 
 	__set_bit(word, dmap->dirty_words);
+	__set_bit(region_nr, dmap->dirty_regions);
 	__set_bit(region_nr, cmd->region_map);
 	dmap->changed = 1;
 
@@ -830,6 +863,7 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start,
 		if (!test_bit(region_nr, cmd->region_map)) {
 			word = region_nr / BITS_PER_LONG;
 			__set_bit(word, dmap->dirty_words);
+			__set_bit(region_nr, dmap->dirty_regions);
 			__set_bit(region_nr, cmd->region_map);
 			dmap->changed = 1;
 		}
-- 
cgit v1.2.3


From 8fdbfe8d1690e8a38d497d83a30607d0d90cc15a Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Wed, 4 Dec 2019 16:06:53 +0200
Subject: dm clone metadata: Use a two phase commit

Split the metadata commit in two parts:

1. dm_clone_metadata_pre_commit(): Prepare the current transaction for
   committing. After this is called, all subsequent metadata updates,
   done through either dm_clone_set_region_hydrated() or
   dm_clone_cond_set_range(), will be part of the next transaction.

2. dm_clone_metadata_commit(): Actually commit the current transaction
   to disk and start a new transaction.

This is required by the following commit. It allows dm-clone to flush
the destination device after step (1) to ensure that all freshly
hydrated regions, for which we are updating the metadata, are properly
written to non-volatile storage and won't be lost in case of a crash.

Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-clone-metadata.c | 46 +++++++++++++++++++++++++++++++++---------
 drivers/md/dm-clone-metadata.h | 17 ++++++++++++++++
 drivers/md/dm-clone-target.c   |  7 ++++++-
 3 files changed, 60 insertions(+), 10 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index ee870a425ab8..c05b12110456 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -127,6 +127,9 @@ struct dm_clone_metadata {
 	struct dirty_map dmap[2];
 	struct dirty_map *current_dmap;
 
+	/* Protected by lock */
+	struct dirty_map *committing_dmap;
+
 	/*
 	 * In core copy of the on-disk bitmap to save constantly doing look ups
 	 * on disk.
@@ -511,6 +514,7 @@ static int dirty_map_init(struct dm_clone_metadata *cmd)
 	}
 
 	cmd->current_dmap = &cmd->dmap[0];
+	cmd->committing_dmap = NULL;
 
 	return 0;
 }
@@ -775,15 +779,17 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap)
 	return 0;
 }
 
-int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
+int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd)
 {
-	int r = -EPERM;
+	int r = 0;
 	struct dirty_map *dmap, *next_dmap;
 
 	down_write(&cmd->lock);
 
-	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) {
+		r = -EPERM;
 		goto out;
+	}
 
 	/* Get current dirty bitmap */
 	dmap = cmd->current_dmap;
@@ -795,7 +801,7 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
 	 * The last commit failed, so we don't have a clean dirty-bitmap to
 	 * use.
 	 */
-	if (WARN_ON(next_dmap->changed)) {
+	if (WARN_ON(next_dmap->changed || cmd->committing_dmap)) {
 		r = -EINVAL;
 		goto out;
 	}
@@ -805,11 +811,33 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
 	cmd->current_dmap = next_dmap;
 	spin_unlock_irq(&cmd->bitmap_lock);
 
-	/*
-	 * No one is accessing the old dirty bitmap anymore, so we can flush
-	 * it.
-	 */
-	r = __flush_dmap(cmd, dmap);
+	/* Set old dirty bitmap as currently committing */
+	cmd->committing_dmap = dmap;
+out:
+	up_write(&cmd->lock);
+
+	return r;
+}
+
+int dm_clone_metadata_commit(struct dm_clone_metadata *cmd)
+{
+	int r = -EPERM;
+
+	down_write(&cmd->lock);
+
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm))
+		goto out;
+
+	if (WARN_ON(!cmd->committing_dmap)) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	r = __flush_dmap(cmd, cmd->committing_dmap);
+	if (!r) {
+		/* Clear committing dmap */
+		cmd->committing_dmap = NULL;
+	}
 out:
 	up_write(&cmd->lock);
 
diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h
index 3fe50a781c11..14af1ebd853f 100644
--- a/drivers/md/dm-clone-metadata.h
+++ b/drivers/md/dm-clone-metadata.h
@@ -75,7 +75,23 @@ void dm_clone_metadata_close(struct dm_clone_metadata *cmd);
 
 /*
  * Commit dm-clone metadata to disk.
+ *
+ * We use a two phase commit:
+ *
+ * 1. dm_clone_metadata_pre_commit(): Prepare the current transaction for
+ *    committing. After this is called, all subsequent metadata updates, done
+ *    through either dm_clone_set_region_hydrated() or
+ *    dm_clone_cond_set_range(), will be part of the **next** transaction.
+ *
+ * 2. dm_clone_metadata_commit(): Actually commit the current transaction to
+ *    disk and start a new transaction.
+ *
+ * This allows dm-clone to flush the destination device after step (1) to
+ * ensure that all freshly hydrated regions, for which we are updating the
+ * metadata, are properly written to non-volatile storage and won't be lost in
+ * case of a crash.
  */
+int dm_clone_metadata_pre_commit(struct dm_clone_metadata *cmd);
 int dm_clone_metadata_commit(struct dm_clone_metadata *cmd);
 
 /*
@@ -112,6 +128,7 @@ int dm_clone_metadata_abort(struct dm_clone_metadata *cmd);
  * Switches metadata to a read only mode. Once read-only mode has been entered
  * the following functions will return -EPERM:
  *
+ *   dm_clone_metadata_pre_commit()
  *   dm_clone_metadata_commit()
  *   dm_clone_set_region_hydrated()
  *   dm_clone_cond_set_range()
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index b3d89072d21c..613c913c296c 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -1122,8 +1122,13 @@ static int commit_metadata(struct clone *clone)
 		goto out;
 	}
 
-	r = dm_clone_metadata_commit(clone->cmd);
+	r = dm_clone_metadata_pre_commit(clone->cmd);
+	if (unlikely(r)) {
+		__metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
+		goto out;
+	}
 
+	r = dm_clone_metadata_commit(clone->cmd);
 	if (unlikely(r)) {
 		__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
 		goto out;
-- 
cgit v1.2.3


From 8b3fd1f53af3591d5624ab9df718369b14d09ed1 Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Wed, 4 Dec 2019 16:06:54 +0200
Subject: dm clone: Flush destination device before committing metadata

dm-clone maintains an on-disk bitmap which records which regions are
valid in the destination device, i.e., which regions have already been
hydrated, or have been written to directly, via user I/O.

Setting a bit in the on-disk bitmap means the corresponding region is
valid in the destination device and we redirect all I/O regarding it to
the destination device.

Suppose the destination device has a volatile write-back cache and the
following sequence of events occurs:

1. A region gets hydrated, either through the background hydration or
   because it was written to directly, via user I/O.

2. The commit timeout expires and we commit the metadata, marking that
   region as valid in the destination device.

3. The system crashes and the destination device's cache has not been
   flushed, meaning the region's data are lost.

The next time we read that region we read it from the destination
device, since the metadata have been successfully committed, but the
data are lost due to the crash, so we read garbage instead of the old
data.

This has several implications:

1. In case of background hydration or of writes with size smaller than
   the region size (which means we first copy the whole region and then
   issue the smaller write), we corrupt data that the user never
   touched.

2. In case of writes with size equal to the device's logical block size,
   we fail to provide atomic sector writes. When the system recovers the
   user will read garbage from the sector instead of the old data or the
   new data.

3. In case of writes without the FUA flag set, after the system
   recovers, the written sectors will contain garbage instead of a
   random mix of sectors containing either old data or new data, thus we
   fail again to provide atomic sector writes.

4. Even when the user flushes the dm-clone device, because we first
   commit the metadata and then pass down the flush, the same risk for
   corruption exists (if the system crashes after the metadata have been
   committed but before the flush is passed down).

The only case which is unaffected is that of writes with size equal to
the region size and with the FUA flag set. But, because FUA writes
trigger metadata commits, this case can trigger the corruption
indirectly.

To solve this and avoid the potential data corruption we flush the
destination device **before** committing the metadata.

This ensures that any freshly hydrated regions, for which we commit the
metadata, are properly written to non-volatile storage and won't be lost
in case of a crash.

Fixes: 7431b7835f55 ("dm: add clone target")
Cc: stable@vger.kernel.org # v5.4+
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-clone-target.c | 46 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index 613c913c296c..d1e1b5b56b1b 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -86,6 +86,12 @@ struct clone {
 
 	struct dm_clone_metadata *cmd;
 
+	/*
+	 * bio used to flush the destination device, before committing the
+	 * metadata.
+	 */
+	struct bio flush_bio;
+
 	/* Region hydration hash table */
 	struct hash_table_bucket *ht;
 
@@ -1108,10 +1114,13 @@ static bool need_commit_due_to_time(struct clone *clone)
 /*
  * A non-zero return indicates read-only or fail mode.
  */
-static int commit_metadata(struct clone *clone)
+static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
 {
 	int r = 0;
 
+	if (dest_dev_flushed)
+		*dest_dev_flushed = false;
+
 	mutex_lock(&clone->commit_lock);
 
 	if (!dm_clone_changed_this_transaction(clone->cmd))
@@ -1128,6 +1137,19 @@ static int commit_metadata(struct clone *clone)
 		goto out;
 	}
 
+	bio_reset(&clone->flush_bio);
+	bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
+	clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+	r = submit_bio_wait(&clone->flush_bio);
+	if (unlikely(r)) {
+		__metadata_operation_failed(clone, "flush destination device", r);
+		goto out;
+	}
+
+	if (dest_dev_flushed)
+		*dest_dev_flushed = true;
+
 	r = dm_clone_metadata_commit(clone->cmd);
 	if (unlikely(r)) {
 		__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
@@ -1199,6 +1221,7 @@ static void process_deferred_bios(struct clone *clone)
 static void process_deferred_flush_bios(struct clone *clone)
 {
 	struct bio *bio;
+	bool dest_dev_flushed;
 	struct bio_list bios = BIO_EMPTY_LIST;
 	struct bio_list bio_completions = BIO_EMPTY_LIST;
 
@@ -1218,7 +1241,7 @@ static void process_deferred_flush_bios(struct clone *clone)
 	    !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
 		return;
 
-	if (commit_metadata(clone)) {
+	if (commit_metadata(clone, &dest_dev_flushed)) {
 		bio_list_merge(&bios, &bio_completions);
 
 		while ((bio = bio_list_pop(&bios)))
@@ -1232,8 +1255,17 @@ static void process_deferred_flush_bios(struct clone *clone)
 	while ((bio = bio_list_pop(&bio_completions)))
 		bio_endio(bio);
 
-	while ((bio = bio_list_pop(&bios)))
-		generic_make_request(bio);
+	while ((bio = bio_list_pop(&bios))) {
+		if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
+			/* We just flushed the destination device as part of
+			 * the metadata commit, so there is no reason to send
+			 * another flush.
+			 */
+			bio_endio(bio);
+		} else {
+			generic_make_request(bio);
+		}
+	}
 }
 
 static void do_worker(struct work_struct *work)
@@ -1405,7 +1437,7 @@ static void clone_status(struct dm_target *ti, status_type_t type,
 
 		/* Commit to ensure statistics aren't out-of-date */
 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
-			(void) commit_metadata(clone);
+			(void) commit_metadata(clone, NULL);
 
 		r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
 
@@ -1839,6 +1871,7 @@ static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	bio_list_init(&clone->deferred_flush_completions);
 	clone->hydration_offset = 0;
 	atomic_set(&clone->hydrations_in_flight, 0);
+	bio_init(&clone->flush_bio, NULL, 0);
 
 	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
 	if (!clone->wq) {
@@ -1912,6 +1945,7 @@ static void clone_dtr(struct dm_target *ti)
 	struct clone *clone = ti->private;
 
 	mutex_destroy(&clone->commit_lock);
+	bio_uninit(&clone->flush_bio);
 
 	for (i = 0; i < clone->nr_ctr_args; i++)
 		kfree(clone->ctr_args[i]);
@@ -1966,7 +2000,7 @@ static void clone_postsuspend(struct dm_target *ti)
 	wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
 	flush_workqueue(clone->wq);
 
-	(void) commit_metadata(clone);
+	(void) commit_metadata(clone, NULL);
 }
 
 static void clone_resume(struct dm_target *ti)
-- 
cgit v1.2.3


From ecda7c0280e6b3398459dc589b9a41c1adb45529 Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Wed, 4 Dec 2019 16:07:41 +0200
Subject: dm thin metadata: Add support for a pre-commit callback

Add support for one pre-commit callback which is run right before the
metadata are committed.

This allows the thin provisioning target to run a callback before the
metadata are committed and is required by the next commit.

Cc: stable@vger.kernel.org
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c | 29 +++++++++++++++++++++++++++++
 drivers/md/dm-thin-metadata.h |  7 +++++++
 2 files changed, 36 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 4c68a7b93d5e..b88d6d701f5b 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -188,6 +188,15 @@ struct dm_pool_metadata {
 	unsigned long flags;
 	sector_t data_block_size;
 
+	/*
+	 * Pre-commit callback.
+	 *
+	 * This allows the thin provisioning target to run a callback before
+	 * the metadata are committed.
+	 */
+	dm_pool_pre_commit_fn pre_commit_fn;
+	void *pre_commit_context;
+
 	/*
 	 * We reserve a section of the metadata for commit overhead.
 	 * All reported space does *not* include this.
@@ -826,6 +835,14 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
 	if (unlikely(!pmd->in_service))
 		return 0;
 
+	if (pmd->pre_commit_fn) {
+		r = pmd->pre_commit_fn(pmd->pre_commit_context);
+		if (r < 0) {
+			DMERR("pre-commit callback failed");
+			return r;
+		}
+	}
+
 	r = __write_changed_details(pmd);
 	if (r < 0)
 		return r;
@@ -892,6 +909,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 	pmd->in_service = false;
 	pmd->bdev = bdev;
 	pmd->data_block_size = data_block_size;
+	pmd->pre_commit_fn = NULL;
+	pmd->pre_commit_context = NULL;
 
 	r = __create_persistent_data_objects(pmd, format_device);
 	if (r) {
@@ -2044,6 +2063,16 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
 	return r;
 }
 
+void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
+					  dm_pool_pre_commit_fn fn,
+					  void *context)
+{
+	pmd_write_lock_in_core(pmd);
+	pmd->pre_commit_fn = fn;
+	pmd->pre_commit_context = context;
+	pmd_write_unlock(pmd);
+}
+
 int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
 {
 	int r = -EINVAL;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index f6be0d733c20..7ef56bd2a7e3 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -230,6 +230,13 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd);
  */
 void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd);
 
+/* Pre-commit callback */
+typedef int (*dm_pool_pre_commit_fn)(void *context);
+
+void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
+					  dm_pool_pre_commit_fn fn,
+					  void *context);
+
 /*----------------------------------------------------------------*/
 
 #endif
-- 
cgit v1.2.3


From 694cfe7f31db36912725e63a38a5179c8628a496 Mon Sep 17 00:00:00 2001
From: Nikos Tsironis <ntsironis@arrikto.com>
Date: Wed, 4 Dec 2019 16:07:42 +0200
Subject: dm thin: Flush data device before committing metadata

The thin provisioning target maintains per thin device mappings that map
virtual blocks to data blocks in the data device.

When we write to a shared block, in case of internal snapshots, or
provision a new block, in case of external snapshots, we copy the shared
block to a new data block (COW), update the mapping for the relevant
virtual block and then issue the write to the new data block.

Suppose the data device has a volatile write-back cache and the
following sequence of events occurs:

1. We write to a shared block
2. A new data block is allocated
3. We copy the shared block to the new data block using kcopyd (COW)
4. We insert the new mapping for the virtual block in the btree for that
   thin device.
5. The commit timeout expires and we commit the metadata, that now
   includes the new mapping from step (4).
6. The system crashes and the data device's cache has not been flushed,
   meaning that the COWed data are lost.

The next time we read that virtual block of the thin device we read it
from the data block allocated in step (2), since the metadata have been
successfully committed. The data are lost due to the crash, so we read
garbage instead of the old, shared data.

This has the following implications:

1. In case of writes to shared blocks, with size smaller than the pool's
   block size (which means we first copy the whole block and then issue
   the smaller write), we corrupt data that the user never touched.

2. In case of writes to shared blocks, with size equal to the device's
   logical block size, we fail to provide atomic sector writes. When the
   system recovers the user will read garbage from that sector instead
   of the old data or the new data.

3. Even for writes to shared blocks, with size equal to the pool's block
   size (overwrites), after the system recovers, the written sectors
   will contain garbage instead of a random mix of sectors containing
   either old data or new data, thus we fail again to provide atomic
   sector writes.

4. Even when the user flushes the thin device, because we first commit
   the metadata and then pass down the flush, the same risk for
   corruption exists (if the system crashes after the metadata have been
   committed but before the flush is passed down to the data device.)

The only case which is unaffected is that of writes with size equal to
the pool's block size and with the FUA flag set. But, because FUA writes
trigger metadata commits, this case can trigger the corruption
indirectly.

Moreover, apart from internal and external snapshots, the same issue
exists for newly provisioned blocks, when block zeroing is enabled.
After the system recovers the provisioned blocks might contain garbage
instead of zeroes.

To solve this and avoid the potential data corruption we flush the
pool's data device **before** committing its metadata.

This ensures that the data blocks of any newly inserted mappings are
properly written to non-volatile storage and won't be lost in case of a
crash.

Cc: stable@vger.kernel.org
Signed-off-by: Nikos Tsironis <ntsironis@arrikto.com>
Acked-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 5a2c494cb552..57626c27a54b 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -328,6 +328,7 @@ struct pool_c {
 	dm_block_t low_water_blocks;
 	struct pool_features requested_pf; /* Features requested during table load */
 	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
+	struct bio flush_bio;
 };
 
 /*
@@ -2383,8 +2384,16 @@ static void process_deferred_bios(struct pool *pool)
 	while ((bio = bio_list_pop(&bio_completions)))
 		bio_endio(bio);
 
-	while ((bio = bio_list_pop(&bios)))
-		generic_make_request(bio);
+	while ((bio = bio_list_pop(&bios))) {
+		/*
+		 * The data device was flushed as part of metadata commit,
+		 * so complete redundant flushes immediately.
+		 */
+		if (bio->bi_opf & REQ_PREFLUSH)
+			bio_endio(bio);
+		else
+			generic_make_request(bio);
+	}
 }
 
 static void do_worker(struct work_struct *ws)
@@ -3115,6 +3124,7 @@ static void pool_dtr(struct dm_target *ti)
 	__pool_dec(pt->pool);
 	dm_put_device(ti, pt->metadata_dev);
 	dm_put_device(ti, pt->data_dev);
+	bio_uninit(&pt->flush_bio);
 	kfree(pt);
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3180,6 +3190,29 @@ static void metadata_low_callback(void *context)
 	dm_table_event(pool->ti->table);
 }
 
+/*
+ * We need to flush the data device **before** committing the metadata.
+ *
+ * This ensures that the data blocks of any newly inserted mappings are
+ * properly written to non-volatile storage and won't be lost in case of a
+ * crash.
+ *
+ * Failure to do so can result in data corruption in the case of internal or
+ * external snapshots and in the case of newly provisioned blocks, when block
+ * zeroing is enabled.
+ */
+static int metadata_pre_commit_callback(void *context)
+{
+	struct pool_c *pt = context;
+	struct bio *flush_bio = &pt->flush_bio;
+
+	bio_reset(flush_bio);
+	bio_set_dev(flush_bio, pt->data_dev->bdev);
+	flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+
+	return submit_bio_wait(flush_bio);
+}
+
 static sector_t get_dev_size(struct block_device *bdev)
 {
 	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
@@ -3348,6 +3381,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	pt->data_dev = data_dev;
 	pt->low_water_blocks = low_water_blocks;
 	pt->adjusted_pf = pt->requested_pf = pf;
+	bio_init(&pt->flush_bio, NULL, 0);
 	ti->num_flush_bios = 1;
 
 	/*
@@ -3374,6 +3408,10 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (r)
 		goto out_flags_changed;
 
+	dm_pool_register_pre_commit_callback(pt->pool->pmd,
+					     metadata_pre_commit_callback,
+					     pt);
+
 	pt->callbacks.congested_fn = pool_is_congested;
 	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
 
-- 
cgit v1.2.3