From 935fe0983e09f4f7331ebf5ea4ae2124f6e9f9e8 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 10 Oct 2017 17:02:41 -0400
Subject: md: rename some drivers/md/ files to have an "md-" prefix

Motivated by the desire to illiminate the imprecise nature of
DM-specific patches being unnecessarily sent to both the MD maintainer
and mailing-list.  Which is born out of the fact that DM files also
reside in drivers/md/

Now all MD-specific files in drivers/md/ start with either "raid" or
"md-" and the MAINTAINERS file has been updated accordingly.

Shaohua: don't change module name

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 928e24a07133..10c0d87074f0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,7 +63,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "raid0.h"
-#include "bitmap.h"
+#include "md-bitmap.h"
 #include "raid5-log.h"
 
 #define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)
-- 
cgit v1.2.3


From 235b6003fb28f0dd8e7ed8fbdb088bb548291766 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 17 Oct 2017 16:18:36 +1100
Subject: raid5: Set R5_Expanded on parity devices as well as data.

When reshaping a fully degraded raid5/raid6 to a larger
nubmer of devices, the new device(s) are not in-sync
and so that can make the newly grown stripe appear to be
"failed".
To avoid this, we set the R5_Expanded flag to say "Even though
this device is not fully in-sync, this block is safe so
don't treat the device as failed for this stripe".
This flag is set for data devices, not not for parity devices.

Consequently, if you have a RAID6 with two devices that are partly
recovered and a spare, and start a reshape to include the spare,
then when the reshape gets past the point where the recovery was
up to, it will think the stripes are failed and will get into
an infinite loop, failing to make progress.

So when contructing parity on an EXPAND_READY stripe,
set R5_Expanded.

Reported-by: Curt <lightspd@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 10c0d87074f0..a21dbd22a2fb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1818,8 +1818,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 		struct r5dev *dev = &sh->dev[i];
 
 		if (dev->written || i == pd_idx || i == qd_idx) {
-			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
+			if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
 				set_bit(R5_UPTODATE, &dev->flags);
+				if (test_bit(STRIPE_EXPAND_READY, &sh->state))
+					set_bit(R5_Expanded, &dev->flags);
+			}
 			if (fua)
 				set_bit(R5_WantFUA, &dev->flags);
 			if (sync)
-- 
cgit v1.2.3


From 230b55fa8d64007339319539f8f8e68114d08529 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 17 Oct 2017 14:24:09 +1100
Subject: md: forbid a RAID5 from having both a bitmap and a journal.

Having both a bitmap and a journal is pointless.
Attempting to do so can corrupt the bitmap if the journal
replay happens before the bitmap is initialized.
Rather than try to avoid this corruption, simply
refuse to allow arrays with both a bitmap and a journal.
So:
 - if raid5_run sees both are present, fail.
 - if adding a bitmap finds a journal is present, fail
 - if adding a journal finds a bitmap is present, fail.

Cc: stable@vger.kernel.org (4.10+)
Signed-off-by: NeilBrown <neilb@suse.com>
Tested-by: Joshua Kinard <kumba@gentoo.org>
Acked-by: Joshua Kinard <kumba@gentoo.org>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a21dbd22a2fb..a8732955f130 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7159,6 +7159,13 @@ static int raid5_run(struct mddev *mddev)
 			min_offset_diff = diff;
 	}
 
+	if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
+	    (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
+		pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
+			  mdname(mddev));
+		return -EINVAL;
+	}
+
 	if (mddev->reshape_position != MaxSector) {
 		/* Check that we can continue the reshape.
 		 * Difficulties arise if the stripe we would write to
-- 
cgit v1.2.3


From b3143b9a38d5039bcd1f2d1c94039651bfba8043 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 17 Oct 2017 13:46:43 +1100
Subject: md: move suspend_hi/lo handling into core md code

responding to ->suspend_lo and ->suspend_hi is similar
to responding to ->suspended.  It is best to wait in
the common core code without incrementing ->active_io.
This allows mddev_suspend()/mddev_resume() to work while
requests are waiting for suspend_lo/hi to change.
This is will be important after a subsequent patch
which uses mddev_suspend() to synchronize updating for
suspend_lo/hi.

So move the code for testing suspend_lo/hi out of raid1.c
and raid5.c, and place it in md.c

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a8732955f130..354a969f50a6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5685,28 +5685,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 				goto retry;
 			}
 
-			if (rw == WRITE &&
-			    logical_sector >= mddev->suspend_lo &&
-			    logical_sector < mddev->suspend_hi) {
-				raid5_release_stripe(sh);
-				/* As the suspend_* range is controlled by
-				 * userspace, we want an interruptible
-				 * wait.
-				 */
-				prepare_to_wait(&conf->wait_for_overlap,
-						&w, TASK_INTERRUPTIBLE);
-				if (logical_sector >= mddev->suspend_lo &&
-				    logical_sector < mddev->suspend_hi) {
-					sigset_t full, old;
-					sigfillset(&full);
-					sigprocmask(SIG_BLOCK, &full, &old);
-					schedule();
-					sigprocmask(SIG_SETMASK, &old, NULL);
-					do_prepare = true;
-				}
-				goto retry;
-			}
-
 			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
 			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
 				/* Stripe is busy expanding or
-- 
cgit v1.2.3


From b03e0ccb5ab9df3efbe51c87843a1ffbecbafa1f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 19 Oct 2017 12:49:15 +1100
Subject: md: remove special meaning of ->quiesce(.., 2)

The '2' argument means "wake up anything that is waiting".
This is an inelegant part of the design and was added
to help support management of suspend_lo/suspend_hi setting.
Now that suspend_lo/hi is managed in mddev_suspend/resume,
that need is gone.
These is still a couple of places where we call 'quiesce'
with an argument of '2', but they can safely be changed to
call ->quiesce(.., 1); ->quiesce(.., 0) which
achieve the same result at the small cost of pausing IO
briefly.

This removes a small "optimization" from suspend_{hi,lo}_store,
but it isn't clear that optimization served a useful purpose.
The code now is a lot clearer.

Suggested-by: Shaohua Li <shli@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 354a969f50a6..17ffa1e44c84 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8008,16 +8008,12 @@ static void raid5_finish_reshape(struct mddev *mddev)
 	}
 }
 
-static void raid5_quiesce(struct mddev *mddev, int state)
+static void raid5_quiesce(struct mddev *mddev, int quiesce)
 {
 	struct r5conf *conf = mddev->private;
 
-	switch(state) {
-	case 2: /* resume for a suspend */
-		wake_up(&conf->wait_for_overlap);
-		break;
-
-	case 1: /* stop all writes */
+	if (quiesce) {
+		/* stop all writes */
 		lock_all_device_hash_locks_irq(conf);
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
@@ -8033,17 +8029,15 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		unlock_all_device_hash_locks_irq(conf);
 		/* allow reshape to continue */
 		wake_up(&conf->wait_for_overlap);
-		break;
-
-	case 0: /* re-enable writes */
+	} else {
+		/* re-enable writes */
 		lock_all_device_hash_locks_irq(conf);
 		conf->quiesce = 0;
 		wake_up(&conf->wait_for_quiescent);
 		wake_up(&conf->wait_for_overlap);
 		unlock_all_device_hash_locks_irq(conf);
-		break;
 	}
-	r5l_quiesce(conf->log, state);
+	r5l_quiesce(conf->log, quiesce);
 }
 
 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
-- 
cgit v1.2.3


From ae89fd3de4793c0dc2ec7e9f26b58a357d74a6c7 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Wed, 18 Oct 2017 19:01:11 -0400
Subject: md: use TASK_IDLE instead of blocking signals

Hi - I submit this patch for the next merge window:

Some times ago, I made a patch f9c79bc05a2a that blocks signals around the
schedule() calls in MD. The MD subsystem needs to do an uninterruptible
sleep that is not accounted in load average - so we block signals and use
interruptible sleep.

The kernel has a special TASK_IDLE state for this purpose, so we can use
it instead of blocking signals. This patch doesn't fix any bug, it just
makes the code simpler.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Acked-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 17ffa1e44c84..2a4b34941d86 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -55,7 +55,6 @@
 #include <linux/ratelimit.h>
 #include <linux/nodemask.h>
 #include <linux/flex_array.h>
-#include <linux/sched/signal.h>
 
 #include <trace/events/block.h>
 #include <linux/list_sort.h>
-- 
cgit v1.2.3


From db0505d320660b6ad92418847e7eca6b61b246ac Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Tue, 17 Oct 2017 16:18:36 +1100
Subject: md: be cautious about using ->curr_resync_completed for
 ->recovery_offset

The ->recovery_offset shows how much of a non-InSync device is actually
in sync - how much has been recoveryed.

When performing a recovery, ->curr_resync and ->curr_resync_completed
follow the device address being recovered and so can be used to update
->recovery_offset.

When performing a reshape, ->curr_resync* might follow the device
addresses (raid5) or might follow array addresses (raid10), so cannot
in general be used to set ->recovery_offset.  When reshaping backwards,
->curre_resync* measures from the *end* of the array-or-device, so is
particularly unhelpful.

So change the common code in md.c to only use ->curr_resync_complete
for the simple recovery case, and add code to raid5.c to update
->recovery_offset during a forwards reshape.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2a4b34941d86..1649e82faae2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5738,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	 */
 	struct r5conf *conf = mddev->private;
 	struct stripe_head *sh;
+	struct md_rdev *rdev;
 	sector_t first_sector, last_sector;
 	int raid_disks = conf->previous_raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
@@ -5860,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 			return 0;
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
+		if (!mddev->reshape_backwards)
+			/* Can update recovery_offset */
+			rdev_for_each(rdev, mddev)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Journal, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < sector_nr)
+					rdev->recovery_offset = sector_nr;
+
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
@@ -5958,6 +5968,14 @@ finish:
 			goto ret;
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
+		if (!mddev->reshape_backwards)
+			/* Can update recovery_offset */
+			rdev_for_each(rdev, mddev)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Journal, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < sector_nr)
+					rdev->recovery_offset = sector_nr;
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
@@ -7945,6 +7963,7 @@ static void end_reshape(struct r5conf *conf)
 {
 
 	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
+		struct md_rdev *rdev;
 
 		spin_lock_irq(&conf->device_lock);
 		conf->previous_raid_disks = conf->raid_disks;
@@ -7952,6 +7971,11 @@ static void end_reshape(struct r5conf *conf)
 		smp_wmb();
 		conf->reshape_progress = MaxSector;
 		conf->mddev->reshape_position = MaxSector;
+		rdev_for_each(rdev, conf->mddev)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(Journal, &rdev->flags) &&
+			    !test_bit(In_sync, &rdev->flags))
+				rdev->recovery_offset = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
 
-- 
cgit v1.2.3