From c40f341f1e7fd4eddcfc5881d94cfa8669071ee6 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Wed, 19 Aug 2015 08:14:42 +1000
Subject: md-cluster: Use a small window for resync

Suspending the entire device for resync could take too long. Resync
in small chunks.

cluster's resync window (32M) is maintained in r1conf as
cluster_sync_low and cluster_sync_high and processed in
raid1's sync_request(). If the current resync is outside the cluster
resync window:

1. Set the cluster_sync_low to curr_resync_completed.
2. Check if the sync will fit in the new window, if not issue a
   wait_barrier() and set cluster_sync_low to sector_nr.
3. Set cluster_sync_high to cluster_sync_low + resync_window.
4. Send a message to all nodes so they may add it in their suspension
   list.

bitmap_cond_end_sync is modified to allow to force a sync inorder
to get the curr_resync_completed uptodate with the sector passed.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 049df6c4a8cc..1dd13bb52940 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
 #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
+#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
+#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
 #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
 
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
@@ -2488,6 +2490,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 
 		bitmap_close_sync(mddev->bitmap);
 		close_sync(conf);
+
+		if (mddev_is_clustered(mddev)) {
+			conf->cluster_sync_low = 0;
+			conf->cluster_sync_high = 0;
+			/* Send zeros to mark end of resync */
+			md_cluster_ops->resync_info_update(mddev, 0, 0);
+		}
 		return 0;
 	}
 
@@ -2508,7 +2517,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		return sync_blocks;
 	}
 
-	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+	/* we are incrementing sector_nr below. To be safe, we check against
+	 * sector_nr + two times RESYNC_SECTORS
+	 */
+
+	bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+		mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
 	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
 
 	raise_barrier(conf, sector_nr);
@@ -2699,6 +2713,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
  bio_full:
 	r1_bio->sectors = nr_sectors;
 
+	if (mddev_is_clustered(mddev) &&
+			conf->cluster_sync_high < sector_nr + nr_sectors) {
+		conf->cluster_sync_low = mddev->curr_resync_completed;
+		conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
+		/* Send resync message */
+		md_cluster_ops->resync_info_update(mddev,
+				conf->cluster_sync_low,
+				conf->cluster_sync_high);
+	}
+
 	/* For a user-requested sync, we read all readable devices and do a
 	 * compare
 	 */
-- 
cgit v1.2.3


From 70bcecdb1534a7dcd82503b705c27a048d568c9d Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Fri, 21 Aug 2015 10:33:39 -0500
Subject: md-cluster: Improve md_reload_sb to be less error prone

md_reload_sb is too simplistic and it explicitly needs to determine
the changes made by the writing node. However, there are multiple areas
where a simple reload could fail.

Instead, read the superblock of one of the "good" rdevs and update
the necessary information:

- read the superblock into a newly allocated page, by temporarily
  swapping out rdev->sb_page and calling ->load_super.
- if that fails return
- if it succeeds, call check_sb_changes
  1. iterates over list of active devices and checks the matching
   dev_roles[] value.
   	If that is 'faulty', the device must be  marked as faulty
	 - call md_error to mark the device as faulty. Make sure
	   not to set CHANGE_DEVS and wakeup mddev->thread or else
	   it would initiate a resync process, which is the responsibility
	   of the "primary" node.
	 - clear the Blocked bit
	 - Call remove_and_add_spares() to hot remove the device.
	If the device is 'spare':
	 - call remove_and_add_spares() to get the number of spares
	   added in this operation.
	 - Reduce mddev->degraded to mark the array as not degraded.
  2. reset recovery_cp
- read the rest of the rdevs to update recovery_offset. If recovery_offset
  is equal to MaxSector, call spare_active() to set it In_sync

This required that recovery_offset be initialized to MaxSector, as
opposed to zero so as to communicate the end of sync for a rdev.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/raid1.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1dd13bb52940..b54fefc85b66 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1592,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
+	/*
+	 * find the disk ... but prefer rdev->saved_raid_disk
+	 * if possible.
+	 */
+	if (rdev->saved_raid_disk >= 0 &&
+	    rdev->saved_raid_disk >= first &&
+	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
+		first = last = rdev->saved_raid_disk;
+
 	for (mirror = first; mirror <= last; mirror++) {
 		p = conf->mirrors+mirror;
 		if (!p->rdev) {
-- 
cgit v1.2.3


From c186b128cda5a246da25f474e4689cb2bfacfcac Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Wed, 30 Sep 2015 13:20:35 -0500
Subject: md-cluster: Perform resync/recovery under a DLM lock

Resync or recovery must be performed by only one node at a time.
A DLM lock resource, resync_lockres provides the mutual exclusion
so that only one node performs the recovery/resync at a time.

If a node is unable to get the resync_lockres, because recovery is
being performed by another node, it set MD_RECOVER_NEEDED so as
to schedule recovery in the future.

Remove the debug message in resync_info_update()
used during development.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/raid1.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b54fefc85b66..a2d813c9eabd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2503,8 +2503,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		if (mddev_is_clustered(mddev)) {
 			conf->cluster_sync_low = 0;
 			conf->cluster_sync_high = 0;
-			/* Send zeros to mark end of resync */
-			md_cluster_ops->resync_info_update(mddev, 0, 0);
 		}
 		return 0;
 	}
-- 
cgit v1.2.3


From 28c1b9fdf4562b52fe104384b16238c39c8a8d40 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Thu, 22 Oct 2015 16:01:25 +1100
Subject: md-cluster: Call update_raid_disks() if another node --grow's
 raid_disks

To incorporate --grow feature executed on one node, other nodes need to
acknowledge the change in number of disks. Call update_raid_disks()
to update internal data structures.

This leads to call check_reshape() -> md_allow_write() -> md_update_sb(),
this results in a deadlock. This is done so it can safely allocate memory
(which might trigger writeback which might write to raid1). This is
not required for md with a bitmap.

In the clustered case, we don't perform md_update_sb() in md_allow_write(),
but in do_md_run(). Also we disable safemode for clustered mode.

mddev->recovery_cp need not be set in check_sb_changes() because this
is required only when a node reads another node's bitmap. mddev->recovery_cp
(which is read from sb->resync_offset), is set only if mddev is in_sync.
Since we disabled safemode, in_sync is set to zero.
In a clustered environment, the MD may not be in sync because another
node could be writing to it. So make sure that in_sync is not set in
case of clustered node in __md_stop_writes().

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid1.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ce2d797f8787..c1ad0b075807 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3044,9 +3044,11 @@ static int raid1_reshape(struct mddev *mddev)
 		return -EINVAL;
 	}
 
-	err = md_allow_write(mddev);
-	if (err)
-		return err;
+	if (!mddev_is_clustered(mddev)) {
+		err = md_allow_write(mddev);
+		if (err)
+			return err;
+	}
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
-- 
cgit v1.2.3