From 935fe0983e09f4f7331ebf5ea4ae2124f6e9f9e8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 10 Oct 2017 17:02:41 -0400 Subject: md: rename some drivers/md/ files to have an "md-" prefix Motivated by the desire to illiminate the imprecise nature of DM-specific patches being unnecessarily sent to both the MD maintainer and mailing-list. Which is born out of the fact that DM files also reside in drivers/md/ Now all MD-specific files in drivers/md/ start with either "raid" or "md-" and the MAINTAINERS file has been updated accordingly. Shaohua: don't change module name Signed-off-by: Mike Snitzer Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 928e24a07133..10c0d87074f0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -63,7 +63,7 @@ #include "md.h" #include "raid5.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) -- cgit v1.2.3 From 235b6003fb28f0dd8e7ed8fbdb088bb548291766 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 16:18:36 +1100 Subject: raid5: Set R5_Expanded on parity devices as well as data. When reshaping a fully degraded raid5/raid6 to a larger nubmer of devices, the new device(s) are not in-sync and so that can make the newly grown stripe appear to be "failed". To avoid this, we set the R5_Expanded flag to say "Even though this device is not fully in-sync, this block is safe so don't treat the device as failed for this stripe". This flag is set for data devices, not not for parity devices. Consequently, if you have a RAID6 with two devices that are partly recovered and a spare, and start a reshape to include the spare, then when the reshape gets past the point where the recovery was up to, it will think the stripes are failed and will get into an infinite loop, failing to make progress. So when contructing parity on an EXPAND_READY stripe, set R5_Expanded. Reported-by: Curt Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 10c0d87074f0..a21dbd22a2fb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1818,8 +1818,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { set_bit(R5_UPTODATE, &dev->flags); + if (test_bit(STRIPE_EXPAND_READY, &sh->state)) + set_bit(R5_Expanded, &dev->flags); + } if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) -- cgit v1.2.3 From 230b55fa8d64007339319539f8f8e68114d08529 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 14:24:09 +1100 Subject: md: forbid a RAID5 from having both a bitmap and a journal. Having both a bitmap and a journal is pointless. Attempting to do so can corrupt the bitmap if the journal replay happens before the bitmap is initialized. Rather than try to avoid this corruption, simply refuse to allow arrays with both a bitmap and a journal. So: - if raid5_run sees both are present, fail. - if adding a bitmap finds a journal is present, fail - if adding a journal finds a bitmap is present, fail. Cc: stable@vger.kernel.org (4.10+) Signed-off-by: NeilBrown Tested-by: Joshua Kinard Acked-by: Joshua Kinard Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a21dbd22a2fb..a8732955f130 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7159,6 +7159,13 @@ static int raid5_run(struct mddev *mddev) min_offset_diff = diff; } + if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && + (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { + pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. * Difficulties arise if the stripe we would write to -- cgit v1.2.3 From b3143b9a38d5039bcd1f2d1c94039651bfba8043 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: md: move suspend_hi/lo handling into core md code responding to ->suspend_lo and ->suspend_hi is similar to responding to ->suspended. It is best to wait in the common core code without incrementing ->active_io. This allows mddev_suspend()/mddev_resume() to work while requests are waiting for suspend_lo/hi to change. This is will be important after a subsequent patch which uses mddev_suspend() to synchronize updating for suspend_lo/hi. So move the code for testing suspend_lo/hi out of raid1.c and raid5.c, and place it in md.c Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8732955f130..354a969f50a6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5685,28 +5685,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) goto retry; } - if (rw == WRITE && - logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - raid5_release_stripe(sh); - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. - */ - prepare_to_wait(&conf->wait_for_overlap, - &w, TASK_INTERRUPTIBLE); - if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - sigset_t full, old; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); - schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); - do_prepare = true; - } - goto retry; - } - if (test_bit(STRIPE_EXPANDING, &sh->state) || !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { /* Stripe is busy expanding or -- cgit v1.2.3 From b03e0ccb5ab9df3efbe51c87843a1ffbecbafa1f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 19 Oct 2017 12:49:15 +1100 Subject: md: remove special meaning of ->quiesce(.., 2) The '2' argument means "wake up anything that is waiting". This is an inelegant part of the design and was added to help support management of suspend_lo/suspend_hi setting. Now that suspend_lo/hi is managed in mddev_suspend/resume, that need is gone. These is still a couple of places where we call 'quiesce' with an argument of '2', but they can safely be changed to call ->quiesce(.., 1); ->quiesce(.., 0) which achieve the same result at the small cost of pausing IO briefly. This removes a small "optimization" from suspend_{hi,lo}_store, but it isn't clear that optimization served a useful purpose. The code now is a lot clearer. Suggested-by: Shaohua Li Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 354a969f50a6..17ffa1e44c84 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8008,16 +8008,12 @@ static void raid5_finish_reshape(struct mddev *mddev) } } -static void raid5_quiesce(struct mddev *mddev, int state) +static void raid5_quiesce(struct mddev *mddev, int quiesce) { struct r5conf *conf = mddev->private; - switch(state) { - case 2: /* resume for a suspend */ - wake_up(&conf->wait_for_overlap); - break; - - case 1: /* stop all writes */ + if (quiesce) { + /* stop all writes */ lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain @@ -8033,17 +8029,15 @@ static void raid5_quiesce(struct mddev *mddev, int state) unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); - break; - - case 0: /* re-enable writes */ + } else { + /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); - break; } - r5l_quiesce(conf->log, state); + r5l_quiesce(conf->log, quiesce); } static void *raid45_takeover_raid0(struct mddev *mddev, int level) -- cgit v1.2.3 From ae89fd3de4793c0dc2ec7e9f26b58a357d74a6c7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 18 Oct 2017 19:01:11 -0400 Subject: md: use TASK_IDLE instead of blocking signals Hi - I submit this patch for the next merge window: Some times ago, I made a patch f9c79bc05a2a that blocks signals around the schedule() calls in MD. The MD subsystem needs to do an uninterruptible sleep that is not accounted in load average - so we block signals and use interruptible sleep. The kernel has a special TASK_IDLE state for this purpose, so we can use it instead of blocking signals. This patch doesn't fix any bug, it just makes the code simpler. Signed-off-by: Mikulas Patocka Acked-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 17ffa1e44c84..2a4b34941d86 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -55,7 +55,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From db0505d320660b6ad92418847e7eca6b61b246ac Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 16:18:36 +1100 Subject: md: be cautious about using ->curr_resync_completed for ->recovery_offset The ->recovery_offset shows how much of a non-InSync device is actually in sync - how much has been recoveryed. When performing a recovery, ->curr_resync and ->curr_resync_completed follow the device address being recovered and so can be used to update ->recovery_offset. When performing a reshape, ->curr_resync* might follow the device addresses (raid5) or might follow array addresses (raid10), so cannot in general be used to set ->recovery_offset. When reshaping backwards, ->curre_resync* measures from the *end* of the array-or-device, so is particularly unhelpful. So change the common code in md.c to only use ->curr_resync_complete for the simple recovery case, and add code to raid5.c to update ->recovery_offset during a forwards reshape. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2a4b34941d86..1649e82faae2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5738,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk */ struct r5conf *conf = mddev->private; struct stripe_head *sh; + struct md_rdev *rdev; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -5860,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; + conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -5958,6 +5968,14 @@ finish: goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -7945,6 +7963,7 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; @@ -7952,6 +7971,11 @@ static void end_reshape(struct r5conf *conf) smp_wmb(); conf->reshape_progress = MaxSector; conf->mddev->reshape_position = MaxSector; + rdev_for_each(rdev, conf->mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) + rdev->recovery_offset = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); -- cgit v1.2.3