From c40f341f1e7fd4eddcfc5881d94cfa8669071ee6 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 19 Aug 2015 08:14:42 +1000 Subject: md-cluster: Use a small window for resync Suspending the entire device for resync could take too long. Resync in small chunks. cluster's resync window (32M) is maintained in r1conf as cluster_sync_low and cluster_sync_high and processed in raid1's sync_request(). If the current resync is outside the cluster resync window: 1. Set the cluster_sync_low to curr_resync_completed. 2. Check if the sync will fit in the new window, if not issue a wait_barrier() and set cluster_sync_low to sector_nr. 3. Set cluster_sync_high to cluster_sync_low + resync_window. 4. Send a message to all nodes so they may add it in their suspension list. bitmap_cond_end_sync is modified to allow to force a sync inorder to get the curr_resync_completed uptodate with the sector passed. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/raid1.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 049df6c4a8cc..1dd13bb52940 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) @@ -2488,6 +2490,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bitmap_close_sync(mddev->bitmap); close_sync(conf); + + if (mddev_is_clustered(mddev)) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + /* Send zeros to mark end of resync */ + md_cluster_ops->resync_info_update(mddev, 0, 0); + } return 0; } @@ -2508,7 +2517,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp return sync_blocks; } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + /* we are incrementing sector_nr below. To be safe, we check against + * sector_nr + two times RESYNC_SECTORS + */ + + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); raise_barrier(conf, sector_nr); @@ -2699,6 +2713,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bio_full: r1_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + /* For a user-requested sync, we read all readable devices and do a * compare */ -- cgit v1.2.3 From 70bcecdb1534a7dcd82503b705c27a048d568c9d Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 21 Aug 2015 10:33:39 -0500 Subject: md-cluster: Improve md_reload_sb to be less error prone md_reload_sb is too simplistic and it explicitly needs to determine the changes made by the writing node. However, there are multiple areas where a simple reload could fail. Instead, read the superblock of one of the "good" rdevs and update the necessary information: - read the superblock into a newly allocated page, by temporarily swapping out rdev->sb_page and calling ->load_super. - if that fails return - if it succeeds, call check_sb_changes 1. iterates over list of active devices and checks the matching dev_roles[] value. If that is 'faulty', the device must be marked as faulty - call md_error to mark the device as faulty. Make sure not to set CHANGE_DEVS and wakeup mddev->thread or else it would initiate a resync process, which is the responsibility of the "primary" node. - clear the Blocked bit - Call remove_and_add_spares() to hot remove the device. If the device is 'spare': - call remove_and_add_spares() to get the number of spares added in this operation. - Reduce mddev->degraded to mark the array as not degraded. 2. reset recovery_cp - read the rest of the rdevs to update recovery_offset. If recovery_offset is equal to MaxSector, call spare_active() to set it In_sync This required that recovery_offset be initialized to MaxSector, as opposed to zero so as to communicate the end of sync for a rdev. Signed-off-by: Goldwyn Rodrigues --- drivers/md/raid1.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1dd13bb52940..b54fefc85b66 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1592,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. + */ + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + first = last = rdev->saved_raid_disk; + for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors+mirror; if (!p->rdev) { -- cgit v1.2.3 From c186b128cda5a246da25f474e4689cb2bfacfcac Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 30 Sep 2015 13:20:35 -0500 Subject: md-cluster: Perform resync/recovery under a DLM lock Resync or recovery must be performed by only one node at a time. A DLM lock resource, resync_lockres provides the mutual exclusion so that only one node performs the recovery/resync at a time. If a node is unable to get the resync_lockres, because recovery is being performed by another node, it set MD_RECOVER_NEEDED so as to schedule recovery in the future. Remove the debug message in resync_info_update() used during development. Signed-off-by: Goldwyn Rodrigues --- drivers/md/raid1.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b54fefc85b66..a2d813c9eabd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2503,8 +2503,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp if (mddev_is_clustered(mddev)) { conf->cluster_sync_low = 0; conf->cluster_sync_high = 0; - /* Send zeros to mark end of resync */ - md_cluster_ops->resync_info_update(mddev, 0, 0); } return 0; } -- cgit v1.2.3 From 28c1b9fdf4562b52fe104384b16238c39c8a8d40 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 22 Oct 2015 16:01:25 +1100 Subject: md-cluster: Call update_raid_disks() if another node --grow's raid_disks To incorporate --grow feature executed on one node, other nodes need to acknowledge the change in number of disks. Call update_raid_disks() to update internal data structures. This leads to call check_reshape() -> md_allow_write() -> md_update_sb(), this results in a deadlock. This is done so it can safely allocate memory (which might trigger writeback which might write to raid1). This is not required for md with a bitmap. In the clustered case, we don't perform md_update_sb() in md_allow_write(), but in do_md_run(). Also we disable safemode for clustered mode. mddev->recovery_cp need not be set in check_sb_changes() because this is required only when a node reads another node's bitmap. mddev->recovery_cp (which is read from sb->resync_offset), is set only if mddev is in_sync. Since we disabled safemode, in_sync is set to zero. In a clustered environment, the MD may not be in sync because another node could be writing to it. So make sure that in_sync is not set in case of clustered node in __md_stop_writes(). Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/raid1.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ce2d797f8787..c1ad0b075807 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3044,9 +3044,11 @@ static int raid1_reshape(struct mddev *mddev) return -EINVAL; } - err = md_allow_write(mddev); - if (err) - return err; + if (!mddev_is_clustered(mddev)) { + err = md_allow_write(mddev); + if (err) + return err; + } raid_disks = mddev->raid_disks + mddev->delta_disks; -- cgit v1.2.3