From 1af2048a3e87b4e982c53ad8cfb0c75d1a9c0a73 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:48 +0100 Subject: dm raid: fix deadlock caused by premature md_stop_writes() md_stop_writes() is called in raid_presuspend() causing deadlocks on bios submitted afterwards -- which happens on loaded raid sets with conversion requests. Fix by moving md_stop_writes() to raid_postsuspend(). NOTE: when the recovery's frozen (MD_RECOVERY_FROZEN), writes haven't been started (or are already stopped) so don't stop them again. Also remove superfluous readonly setting. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 6319d846e0ad..398314b6c31a 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3613,24 +3613,19 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); } -static void raid_presuspend(struct dm_target *ti) -{ - struct raid_set *rs = ti->private; - - md_stop_writes(&rs->md); -} - static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Writes have to be stopped before suspending to avoid deadlocks. */ + if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) + md_stop_writes(&rs->md); + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); mddev_unlock(&rs->md); } - - rs->md.ro = 1; } static void attempt_restore_of_faulty_devices(struct raid_set *rs) @@ -3903,7 +3898,6 @@ static struct target_type raid_target = { .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, - .presuspend = raid_presuspend, .postsuspend = raid_postsuspend, .preresume = raid_preresume, .resume = raid_resume, -- cgit v1.2.3 From 052b2b1e0689b30af2608d908916a16e9dbd0919 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:49 +0100 Subject: dm raid: consume sizes after md_finish_reshape() completes changing them The md raid personalities call md_finish_reshape() at the end of a reshape conversion which adjusts rdev->sectors. Correct/check rdev->sectors before initiating a reshape and raise the recovery pointer accordingly. Otherwise, the DM raid coordinated reshape will fail. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 398314b6c31a..c3ea4337bf51 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2640,12 +2640,19 @@ static int rs_adjust_data_offsets(struct raid_set *rs) * Make sure we got a minimum amount of free sectors per device */ if (rs->data_offset && - to_sector(i_size_read(rdev->bdev->bd_inode)) - rdev->sectors < MIN_FREE_RESHAPE_SPACE) { + to_sector(i_size_read(rdev->bdev->bd_inode)) - rs->md.dev_sectors < MIN_FREE_RESHAPE_SPACE) { rs->ti->error = data_offset ? "No space for forward reshape" : "No space for backward reshape"; return -ENOSPC; } out: + /* + * Raise recovery_cp in case data_offset != 0 to + * avoid false recovery positives in the constructor. 
+ */ + if (rs->md.recovery_cp < rs->md.dev_sectors) + rs->md.recovery_cp += rs->dev[0].rdev.data_offset; + /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { if (!test_bit(Journal, &rdev->flags)) { @@ -2777,6 +2784,23 @@ static int rs_prepare_reshape(struct raid_set *rs) return 0; } +/* Get reshape sectors from data_offsets or raid set */ +static sector_t _get_reshape_sectors(struct raid_set *rs) +{ + struct md_rdev *rdev; + sector_t reshape_sectors = 0; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) { + reshape_sectors = (rdev->data_offset > rdev->new_data_offset) ? + rdev->data_offset - rdev->new_data_offset : + rdev->new_data_offset - rdev->data_offset; + break; + } + + return max(reshape_sectors, (sector_t) rs->data_offset); +} + /* * * - change raid layout @@ -2788,6 +2812,7 @@ static int rs_setup_reshape(struct raid_set *rs) { int r = 0; unsigned int cur_raid_devs, d; + sector_t reshape_sectors = _get_reshape_sectors(rs); struct mddev *mddev = &rs->md; struct md_rdev *rdev; @@ -2804,13 +2829,13 @@ static int rs_setup_reshape(struct raid_set *rs) /* * Adjust array size: * - * - in case of adding disks, array size has + * - in case of adding disk(s), array size has * to grow after the disk adding reshape, * which'll hapen in the event handler; * reshape will happen forward, so space has to * be available at the beginning of each disk * - * - in case of removing disks, array size + * - in case of removing disk(s), array size * has to shrink before starting the reshape, * which'll happen here; * reshape will happen backward, so space has to @@ -2841,7 +2866,7 @@ static int rs_setup_reshape(struct raid_set *rs) rdev->recovery_offset = rs_is_raid1(rs) ? 0 : MaxSector; } - mddev->reshape_backwards = 0; /* adding disks -> forward reshape */ + mddev->reshape_backwards = 0; /* adding disk(s) -> forward reshape */ /* Remove disk(s) */ } else if (rs->delta_disks < 0) { @@ -2874,6 +2899,15 @@ static int rs_setup_reshape(struct raid_set *rs) mddev->reshape_backwards = rs->dev[0].rdev.data_offset ? 0 : 1; } + /* + * Adjust device size for forward reshape + * because md_finish_reshape() reduces it. + */ + if (!mddev->reshape_backwards) + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags)) + rdev->sectors += reshape_sectors; + return r; } -- cgit v1.2.3 From 7501537ee3a5e6bd01c0084af141e4fa84e652c0 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:50 +0100 Subject: dm raid: correct resizing state relative to reshape space in ctr Pay attention to existing reshape space to define if a raid set needs resizing. Otherwise we can hit "Can't resize a reshaping raid set" when a reshape is being requested. 
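The decision the constructor now makes can be modelled in isolation. A minimal sketch (the helper name is invented for illustration; the real logic sits inline in raid_ctr() in the hunk below): reserved out-of-place reshape space has to be discounted from the superblock-derived rdev size before it is compared with the newly calculated size, otherwise a set that merely carries reshape space is mistaken for a resize request.

	static bool rs_needs_resize(sector_t calculated_dev_sectors,
				    sector_t rdev_sectors,
				    sector_t reshape_sectors)
	{
		/* Sizes match exactly: clearly no resize. */
		if (calculated_dev_sectors == rdev_sectors)
			return false;

		/* Discount reserved reshape space before deciding. */
		return calculated_dev_sectors !=
		       (reshape_sectors ? rdev_sectors - reshape_sectors
					: rdev_sectors);
	}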
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c3ea4337bf51..c4b0cb181fbc 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2969,10 +2969,10 @@ static void configure_discard_support(struct raid_set *rs) static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int r; - bool resize; + bool resize = false; struct raid_type *rt; unsigned int num_raid_params, num_raid_devs; - sector_t calculated_dev_sectors, rdev_sectors; + sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors; struct raid_set *rs = NULL; const char *arg; struct rs_layout rs_layout; @@ -3055,7 +3055,10 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - resize = calculated_dev_sectors != rdev_sectors; + + reshape_sectors = _get_reshape_sectors(rs); + if (calculated_dev_sectors != rdev_sectors) + resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors); INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -3178,7 +3181,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) mddev_lock_nointr(&rs->md); r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ - if (r) { ti->error = "Failed to run raid array"; mddev_unlock(&rs->md); -- cgit v1.2.3 From 61e06e2c3ebd986050958513bfa40dceed756f8f Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:51 +0100 Subject: dm raid: fix raid set size revalidation The raid set size is being revalidated unconditionally before a reshaping conversion is started. MD requires the size to only be reduced in case of a stripe removing (i.e. shrinking) reshape but not when growing because the raid array has to stay small until after the growing reshape finishes. Fix by avoiding the size revalidation in preresume unless a shrinking reshape is requested. 
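The rule is compact enough to state as code. A sketch under the assumption of a hypothetical helper name (the actual change splits rs_set_capacity() and guards its call site in raid_resume(), as the hunks below show):

	/*
	 * Shrink the exposed capacity up front only for a disk-removing
	 * (backward) reshape.  A growing reshape must keep the old size
	 * until MD finishes; do_table_event() grows the device afterwards.
	 */
	static void rs_resume_adjust_capacity(struct raid_set *rs)
	{
		if (rs->md.delta_disks < 0)	/* removing disk(s) */
			rs_set_capacity(rs);
		/* delta_disks >= 0: leave the size alone for now */
	}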
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c4b0cb181fbc..ff75324133fb 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -675,15 +675,11 @@ static struct raid_type *get_raid_type_by_ll(const int level, const int layout) return NULL; } -/* - * Conditionally change bdev capacity of @rs - * in case of a disk add/remove reshape - */ -static void rs_set_capacity(struct raid_set *rs) +/* Adjust rdev sectors */ +static void rs_set_rdev_sectors(struct raid_set *rs) { struct mddev *mddev = &rs->md; struct md_rdev *rdev; - struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); /* * raid10 sets rdev->sector to the device size, which @@ -692,8 +688,16 @@ static void rs_set_capacity(struct raid_set *rs) rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags)) rdev->sectors = mddev->dev_sectors; +} - set_capacity(gendisk, mddev->array_sectors); +/* + * Change bdev capacity of @rs in case of a disk add/remove reshape + */ +static void rs_set_capacity(struct raid_set *rs) +{ + struct gendisk *gendisk = dm_disk(dm_table_get_md(rs->ti->table)); + + set_capacity(gendisk, rs->md.array_sectors); revalidate_disk(gendisk); } @@ -1674,8 +1678,11 @@ static void do_table_event(struct work_struct *ws) struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); smp_rmb(); /* Make sure we access most actual mddev properties */ - if (!rs_is_reshaping(rs)) + if (!rs_is_reshaping(rs)) { + if (rs_is_raid10(rs)) + rs_set_rdev_sectors(rs); rs_set_capacity(rs); + } dm_table_event(rs->ti->table); } @@ -3873,11 +3880,10 @@ static int raid_preresume(struct dm_target *ti) mddev->resync_min = mddev->recovery_cp; } - rs_set_capacity(rs); - /* Check for any reshape request unless new raid set */ if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { /* Initiate a reshape. */ + rs_set_rdev_sectors(rs); mddev_lock_nointr(mddev); r = rs_start_reshape(rs); mddev_unlock(mddev); @@ -3906,6 +3912,10 @@ static void raid_resume(struct dm_target *ti) mddev->ro = 0; mddev->in_sync = 0; + /* Only reduce raid set size before running a disk removing reshape. */ + if (mddev->delta_disks < 0) + rs_set_capacity(rs); + /* * Keep the RAID set frozen if reshape/rebuild flags are set. * The RAID set is unfrozen once the next table load/resume, -- cgit v1.2.3 From 188a212df1f3a2d7ea9bb0fc0ab4173042c23470 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:59 +0100 Subject: dm raid: add component device size checks to avoid runtime failure Check all component data device sizes versus calculated size. Reject if device(s) are too small. Otherwise, MD will fail the operation by accessing beyond the end of the data device. An example use-case is that growing bitmap won't fit any more and the MD runtime will report an error when DM raid should catch this earlier. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index ff75324133fb..2bb0ac7c3fba 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1580,6 +1580,24 @@ static sector_t __rdev_sectors(struct raid_set *rs) return 0; } +/* Check that calculated dev_sectors fits all component devices. 
*/ +static int _check_data_dev_sectors(struct raid_set *rs) +{ + sector_t ds = ~0; + struct md_rdev *rdev; + + rdev_for_each(rdev, &rs->md) + if (!test_bit(Journal, &rdev->flags) && rdev->bdev) { + ds = min(ds, to_sector(i_size_read(rdev->bdev->bd_inode))); + if (ds < rs->md.dev_sectors) { + rs->ti->error = "Component device(s) too small"; + return -EINVAL; + } + } + + return 0; +} + /* Calculate the sectors per device and per array used for @rs */ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) { @@ -1629,7 +1647,7 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; - return 0; + return _check_data_dev_sectors(rs); bad: rs->ti->error = "Target length not divisible by number of data devices"; return -EINVAL; -- cgit v1.2.3 From d39f0010e40964d959c5157be02839da8a178015 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:52 +0100 Subject: dm raid: fix raid_resume() to keep raid set frozen as needed During a reshape request: if userspace reloads a "raid" table multiple times, resulting in multiple superblock reads, the raid set needs to stay frozen until all config changes (chunk size, layout data_offset, delta_disks) have been stored in the superblocks and respective flags cleared. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 2bb0ac7c3fba..bf3c9e3c736d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3899,7 +3899,7 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any reshape request unless new raid set */ - if (test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { + if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { /* Initiate a reshape. */ rs_set_rdev_sectors(rs); mddev_lock_nointr(mddev); @@ -3941,8 +3941,14 @@ static void raid_resume(struct dm_target *ti) * This ensures that the constructor for the inactive table * retrieves an up-to-date reshape_position. */ - if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (!test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags) && + !(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) { + if (rs_is_reshapable(rs)) { + if (!rs_is_reshaping(rs) || _get_reshape_sectors(rs)) + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + } else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + } if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { mddev_lock_nointr(mddev); -- cgit v1.2.3 From 67143510a7e3634a23f06a48445d1148b2fdbc4d Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:53 +0100 Subject: dm raid: display a consistent copy of the MD status via raid_status() The MD sync thread updates recovery flags providing state of any running, idle, frozen, recovering, reshaping, ... activity it performs and updates respective flags asynchronously versus dm processing raid_status(). To close that race window, take a single copy of the flags and pass it into its callees. 
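The race is the classic torn-read pattern and the fix is a single snapshot. A reduced, illustrative form of decipher_sync_action() after this change (trimmed to two states; the full version follows in the diff):

	static const char *sync_state(struct raid_set *rs)
	{
		unsigned long recovery;

		smp_rmb();			/* observe the sync thread's updates */
		recovery = rs->md.recovery;	/* one snapshot for all tests */

		if (test_bit(MD_RECOVERY_FROZEN, &recovery))
			return "frozen";
		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
			return "reshape";
		return "idle";
	}

Every callee then receives the snapshot as an argument instead of re-reading mddev->recovery, so one raid_status() call reports one consistent state.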
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index bf3c9e3c736d..3df7c5bd5a9b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3300,25 +3300,25 @@ static int raid_map(struct dm_target *ti, struct bio *bio) } /* Return string describing the current sync action of @mddev */ -static const char *decipher_sync_action(struct mddev *mddev) +static const char *decipher_sync_action(struct mddev *mddev, unsigned long recovery) { - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return "frozen"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return "reshape"; - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) return "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_CHECK, &recovery)) return "check"; return "repair"; } - if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) return "recover"; } @@ -3350,7 +3350,7 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, } /* Helper to return resync/reshape progress for @rs and @array_in_sync */ -static sector_t rs_get_progress(struct raid_set *rs, +static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, sector_t resync_max_sectors, bool *array_in_sync) { sector_t r, curr_resync_completed; @@ -3367,7 +3367,7 @@ static sector_t rs_get_progress(struct raid_set *rs, r = mddev->reshape_position; /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) || + if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || r != MaxSector) { if (r == MaxSector) { *array_in_sync = true; @@ -3382,20 +3382,20 @@ static sector_t rs_get_progress(struct raid_set *rs, } /* Sync is relative to the component device size */ - } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RUNNING, &recovery)) r = curr_resync_completed; else r = mddev->recovery_cp; if ((r == MaxSector) || - (test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + (test_bit(MD_RECOVERY_DONE, &recovery) && (mddev->curr_resync_completed == resync_max_sectors))) { /* * Sync complete. */ *array_in_sync = true; r = resync_max_sectors; - } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has * undergone an initial sync and the health characters @@ -3438,6 +3438,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, struct r5conf *conf = mddev->private; int i, max_nr_stripes = conf ? 
conf->max_nr_stripes : 0; bool array_in_sync; + unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int sz = 0; unsigned int rebuild_disks; @@ -3457,13 +3458,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Access most recent mddev properties for status output */ smp_rmb(); + recovery = rs->md.recovery; /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, resync_max_sectors, &array_in_sync); + progress = rs_get_progress(rs, recovery, resync_max_sectors, &array_in_sync); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; - sync_action = decipher_sync_action(&rs->md); + sync_action = decipher_sync_action(&rs->md, recovery); /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) -- cgit v1.2.3 From 242ea5ad11a03f2fbdfc2fe422d8e1b0601a8073 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:54 +0100 Subject: dm raid: avoid passing array_in_sync variable to raid_status() callees The raid_status() function passes the bool array_in_sync variable around providing synchronization state of the MD array. Replace it with a runtime flag. This will avoid a pattern of having to pass discrete variables to various functions. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 3df7c5bd5a9b..5730b32034aa 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -209,6 +209,7 @@ struct raid_dev { #define RT_FLAG_UPDATE_SBS 3 #define RT_FLAG_RESHAPE_RS 4 #define RT_FLAG_RS_SUSPENDED 5 +#define RT_FLAG_RS_IN_SYNC 6 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3335,7 +3336,7 @@ static const char *decipher_sync_action(struct mddev *mddev, unsigned long recov * 'A' = Alive and in-sync raid set component _or_ alive raid4/5/6 'write_through' journal device * '-' = Non-existing device (i.e. uspace passed '- -' into the ctr) */ -static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, bool array_in_sync) +static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) { if (!rdev->bdev) return "-"; @@ -3343,25 +3344,27 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev, return "D"; else if (test_bit(Journal, &rdev->flags)) return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? 
"A" : "a"; - else if (!array_in_sync || !test_bit(In_sync, &rdev->flags)) + else if (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && + !test_bit(In_sync, &rdev->flags)) return "a"; else return "A"; } -/* Helper to return resync/reshape progress for @rs and @array_in_sync */ +/* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, - sector_t resync_max_sectors, bool *array_in_sync) + sector_t resync_max_sectors) { sector_t r, curr_resync_completed; struct mddev *mddev = &rs->md; + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; - *array_in_sync = false; if (rs_is_raid0(rs)) { r = resync_max_sectors; - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { r = mddev->reshape_position; @@ -3370,7 +3373,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || r != MaxSector) { if (r == MaxSector) { - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); r = resync_max_sectors; } else { /* Got to reverse on backward reshape */ @@ -3393,7 +3396,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, /* * Sync complete. */ - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); r = resync_max_sectors; } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* @@ -3401,7 +3404,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, * undergone an initial sync and the health characters * should not be 'a' anymore. */ - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { struct md_rdev *rdev; @@ -3414,7 +3417,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) - *array_in_sync = true; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); #if 0 r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ #endif @@ -3437,7 +3440,6 @@ static void raid_status(struct dm_target *ti, status_type_t type, struct mddev *mddev = &rs->md; struct r5conf *conf = mddev->private; int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; - bool array_in_sync; unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int sz = 0; @@ -3462,14 +3464,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, recovery, resync_max_sectors, &array_in_sync); + progress = rs_get_progress(rs, recovery, resync_max_sectors); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; sync_action = decipher_sync_action(&rs->md, recovery); /* HM FIXME: do we want another state char for raid0? 
It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) - DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev, array_in_sync)); + DMEMIT(__raid_dev_status(rs, &rs->dev[i].rdev)); /* * In-sync/Reshape ratio: @@ -3520,7 +3522,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * v1.10.0+: */ DMEMIT(" %s", test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? - __raid_dev_status(rs, &rs->journal_dev.rdev, 0) : "-"); + __raid_dev_status(rs, &rs->journal_dev.rdev) : "-"); break; case STATUSTYPE_TABLE: -- cgit v1.2.3 From 4102d9de6d375fc27ec70382c4068f4f9f62ce4f Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:55 +0100 Subject: dm raid: fix rs_get_progress() synchronization state/ratio Fix various sync state issues causing racy/bogus sync ratio, sync_action ad health chars in dm_status() info output. Sync ratio could be N/N (i.e. 100%) shortly after raid set creation, i.e. creating a new RaidLV or upconverting a linear LV to raid1 thus: "0 2097152 raid raid1 2 Aa 2097162/2097152 recover 0 0 -" instead of: "0 2097152 raid raid1 2 Aa 0/2097152 idle 0 0 -" Sync action could be non-idle, when the MD thread was done with io. Health chars could be 'A' when they should be 'a' for a short time before a resynchonization started. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 95 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 31 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5730b32034aa..7e7075fb9c28 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -210,6 +210,7 @@ struct raid_dev { #define RT_FLAG_RESHAPE_RS 4 #define RT_FLAG_RS_SUSPENDED 5 #define RT_FLAG_RS_IN_SYNC 6 +#define RT_FLAG_RS_RESYNCING 7 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3306,8 +3307,10 @@ static const char *decipher_sync_action(struct mddev *mddev, unsigned long recov if (test_bit(MD_RECOVERY_FROZEN, &recovery)) return "frozen"; - if (test_bit(MD_RECOVERY_RUNNING, &recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + /* The MD sync thread can be done with io but still be running */ + if (!test_bit(MD_RECOVERY_DONE, &recovery) && + (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) { if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) return "reshape"; @@ -3344,8 +3347,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) return "D"; else if (test_bit(Journal, &rdev->flags)) return (rs->journal_dev.mode == R5C_JOURNAL_MODE_WRITE_THROUGH) ? 
"A" : "a"; - else if (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && - !test_bit(In_sync, &rdev->flags)) + else if (test_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags) || + (!test_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags) && + !test_bit(In_sync, &rdev->flags))) return "a"; else return "A"; @@ -3355,49 +3359,70 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, sector_t resync_max_sectors) { - sector_t r, curr_resync_completed; + sector_t r; struct mddev *mddev = &rs->md; clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - - curr_resync_completed = mddev->curr_resync_completed ?: mddev->recovery_cp; + clear_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); if (rs_is_raid0(rs)) { r = resync_max_sectors; set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - r = mddev->reshape_position; - /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &recovery) || - r != MaxSector) { - if (r == MaxSector) { - set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - r = resync_max_sectors; - } else { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) { + r = mddev->reshape_position; + if (r != MaxSector) { /* Got to reverse on backward reshape */ if (mddev->reshape_backwards) r = mddev->array_sectors - r; - /* Devide by # of data stripes */ - sector_div(r, mddev_data_stripes(rs)); + /* Divide by # of data stripes unless raid1 */ + if (!rs_is_raid1(rs)) + sector_div(r, mddev_data_stripes(rs)); } - /* Sync is relative to the component device size */ - } else if (test_bit(MD_RECOVERY_RUNNING, &recovery)) - r = curr_resync_completed; + /* + * Sync/recover is relative to the component device size. + * + * MD_RECOVERY_NEEDED for https://bugzilla.redhat.com/show_bug.cgi?id=1508070 + */ + } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || + test_bit(MD_RECOVERY_RUNNING, &recovery)) + r = mddev->curr_resync_completed; + else r = mddev->recovery_cp; - if ((r == MaxSector) || - (test_bit(MD_RECOVERY_DONE, &recovery) && - (mddev->curr_resync_completed == resync_max_sectors))) { + if (r >= resync_max_sectors && + (!test_bit(MD_RECOVERY_REQUESTED, &recovery) || + (!test_bit(MD_RECOVERY_FROZEN, &recovery) && + !test_bit(MD_RECOVERY_NEEDED, &recovery) && + !test_bit(MD_RECOVERY_RUNNING, &recovery)))) { /* * Sync complete. */ - set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - r = resync_max_sectors; + /* In case we have finished recovering, the array is in sync. */ + if (test_bit(MD_RECOVERY_RECOVER, &recovery)) + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + + } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) { + /* + * In case we are recovering, the array is not in sync + * and health chars should show the recovering legs. + */ + ; + + } else if (test_bit(MD_RECOVERY_SYNC, &recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &recovery)) { + /* + * If "resync" is occurring, the raid set + * is or may be out of sync hence the health + * characters shall be 'a'. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has @@ -3405,26 +3430,34 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, * should not be 'a' anymore. */ set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + } else { struct md_rdev *rdev; + /* + * We are idle and recovery is needed, prevent 'A' chars race + * caused by components still set to in-sync by constrcuctor. 
+ */ + if (test_bit(MD_RECOVERY_NEEDED, &recovery)) + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + /* * The raid set may be doing an initial sync, or it may * be rebuilding individual components. If all the * devices are In_sync, then it is the raid set that is * being initialized. */ + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) - set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); -#if 0 - r = 0; /* HM FIXME: TESTME: https://bugzilla.redhat.com/show_bug.cgi?id=1210637 ? */ -#endif + !test_bit(In_sync, &rdev->flags)) { + clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); + break; + } } } - return r; + return min(r, resync_max_sectors); } /* Helper to return @dev name or "-" if !@dev */ -- cgit v1.2.3 From 78a75d10ef869f4fae70f9b86afce28eb1922529 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Sat, 2 Dec 2017 01:03:56 +0100 Subject: dm raid: small cleanup and remove unsed "struct raid_set" member Move raid_resume()'s setting of 'rw' and 'in_sync' to just prior to mddev_resume(). Also, remove unused 'bitmap_loaded' member from "struct raid_set". No functional changes. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 7e7075fb9c28..1069e617e727 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -227,7 +227,6 @@ struct rs_layout { struct raid_set { struct dm_target *ti; - uint32_t bitmap_loaded; uint32_t stripe_cache_entries; unsigned long ctr_flags; unsigned long runtime_flags; @@ -3964,9 +3963,6 @@ static void raid_resume(struct dm_target *ti) attempt_restore_of_faulty_devices(rs); } - mddev->ro = 0; - mddev->in_sync = 0; - /* Only reduce raid set size before running a disk removing reshape. */ if (mddev->delta_disks < 0) rs_set_capacity(rs); @@ -3989,6 +3985,8 @@ static void raid_resume(struct dm_target *ti) if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { mddev_lock_nointr(mddev); + mddev->ro = 0; + mddev->in_sync = 0; mddev_resume(mddev); mddev_unlock(mddev); } -- cgit v1.2.3 From b84cf26924cfe405993fc45fa2911cde38f3c3ac Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 4 Dec 2017 10:26:21 -0500 Subject: dm raid: bump target version to reflect numerous fixes Also update Documentation accordingly. Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1069e617e727..764baa9665bb 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3994,7 +3994,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 13, 0}, + .version = {1, 13, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v1.2.3 From 53bf5384f9b9e37c628f171366959a38c89779ca Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:17 +0100 Subject: dm raid: validate current raid sets redundancy Verifying the current raid sets redundancy based on retrieved superblock content has to use the superblock's raid level (e.g. raid0), not the constructor requested one (e.g. raid10). Using the requested raid level of raid10 lead to a "divide error" on raid0 which defines data copies divided by to be zero. Also check for bogus data copies. 
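In code terms the hardening is a short guard; sketched standalone here with a hypothetical wrapper name (the real check sits inside validate_raid_redundancy()'s raid10 case, shown in the hunk below):

	/*
	 * raid10 layouts encode the data-copy count.  With the level
	 * taken from the constructor request instead of the superblock,
	 * a raid0 superblock yielded zero copies and the redundancy
	 * arithmetic hit a divide error; reject anything below 2.
	 */
	static int rs_check_raid10_copies(struct raid_set *rs)
	{
		int copies = raid10_md_layout_to_copies(rs->md.new_layout);

		if (copies < 2) {
			DMERR("Bogus raid10 data copies < 2!");
			return -EINVAL;
		}
		return 0;
	}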
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 764baa9665bb..b82b7095a671 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1007,7 +1007,7 @@ static int validate_raid_redundancy(struct raid_set *rs) !rs->dev[i].rdev.sb_page) rebuild_cnt++; - switch (rs->raid_type->level) { + switch (rs->md.level) { case 0: break; case 1: @@ -1022,6 +1022,11 @@ static int validate_raid_redundancy(struct raid_set *rs) break; case 10: copies = raid10_md_layout_to_copies(rs->md.new_layout); + if (copies < 2) { + DMERR("Bogus raid10 data copies < 2!"); + return -EINVAL; + } + if (rebuild_cnt < copies) break; -- cgit v1.2.3 From 11e4723206683ad59f8e9dc7771e7b44a37f7b62 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:18 +0100 Subject: dm raid: stop keeping raid set frozen altogether In order to avoid redoing synchronization/recovery/reshape partially, the raid set got frozen until after all passed in table line flags had been cleared. The related table reload sequence had to be precisely followed, or reshaping may lead to data corruption caused by the active mapping carrying on with a reshape when the inactive mapping already had retrieved a stale reshape position. Harden by retrieving the actual resync/recovery/reshape position during resume whilst the active table is suspended thus avoiding to keep the raid set frozen altogether. This prevents superfluous redoing of an already resynchronized or recovered segment and, most importantly, potential for redoing of an already reshaped segment causing data corruption. Fixes: d39f0010e ("dm raid: fix raid_resume() to keep raid set frozen as needed") Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 108 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 38 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b82b7095a671..109b001407a8 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -29,6 +29,9 @@ */ #define MIN_RAID456_JOURNAL_SPACE (4*2048) +/* Global list of all raid sets */ +LIST_HEAD(raid_sets); + static bool devices_handle_discard_safely = false; /* @@ -105,8 +108,6 @@ struct raid_dev { #define CTR_FLAG_JOURNAL_DEV (1 << __CTR_FLAG_JOURNAL_DEV) #define CTR_FLAG_JOURNAL_MODE (1 << __CTR_FLAG_JOURNAL_MODE) -#define RESUME_STAY_FROZEN_FLAGS (CTR_FLAG_DELTA_DISKS | CTR_FLAG_DATA_OFFSET) - /* * Definitions of various constructor flags to * be used in checks of valid / invalid flags @@ -226,6 +227,7 @@ struct rs_layout { struct raid_set { struct dm_target *ti; + struct list_head list; uint32_t stripe_cache_entries; unsigned long ctr_flags; @@ -271,6 +273,19 @@ static void rs_config_restore(struct raid_set *rs, struct rs_layout *l) mddev->new_chunk_sectors = l->new_chunk_sectors; } +/* Find any raid_set in active slot for @rs on global list */ +static struct raid_set *rs_find_active(struct raid_set *rs) +{ + struct raid_set *r; + struct mapped_device *md = dm_table_get_md(rs->ti->table); + + list_for_each_entry(r, &raid_sets, list) + if (r != rs && dm_table_get_md(r->ti->table) == md) + return r; + + return NULL; +} + /* raid10 algorithms (i.e. 
formats) */ #define ALGORITHM_RAID10_DEFAULT 0 #define ALGORITHM_RAID10_NEAR 1 @@ -749,6 +764,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r mddev_init(&rs->md); + INIT_LIST_HEAD(&rs->list); rs->raid_disks = raid_devs; rs->delta_disks = 0; @@ -766,6 +782,9 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r for (i = 0; i < raid_devs; i++) md_rdev_init(&rs->dev[i].rdev); + /* Add @rs to global list. */ + list_add(&rs->list, &raid_sets); + /* * Remaining items to be initialized by further RAID params: * rs->md.persistent @@ -778,6 +797,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r return rs; } +/* Free all @rs allocations and remove it from global list. */ static void raid_set_free(struct raid_set *rs) { int i; @@ -795,6 +815,8 @@ static void raid_set_free(struct raid_set *rs) dm_put_device(rs->ti, rs->dev[i].data_dev); } + list_del(&rs->list); + kfree(rs); } @@ -2371,7 +2393,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) DMERR("new device%s provided without 'rebuild'", new_devs > 1 ? "s" : ""); return -EINVAL; - } else if (rs_is_recovering(rs)) { + } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", (unsigned long long) mddev->recovery_cp); return -EINVAL; @@ -3173,19 +3195,22 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - /* - * We can only prepare for a reshape here, because the - * raid set needs to run to provide the repective reshape - * check functions via its MD personality instance. - * - * So do the reshape check after md_run() succeeded. - */ - r = rs_prepare_reshape(rs); - if (r) - return r; + /* Out-of-place space has to be available to allow for a reshape unless raid1! */ + if (reshape_sectors || rs_is_raid1(rs)) { + /* + * We can only prepare for a reshape here, because the + * raid set needs to run to provide the repective reshape + * check functions via its MD personality instance. + * + * So do the reshape check after md_run() succeeded. + */ + r = rs_prepare_reshape(rs); + if (r) + return r; - /* Reshaping ain't recovery, so disable recovery */ - rs_setup_recovery(rs, MaxSector); + /* Reshaping ain't recovery, so disable recovery */ + rs_setup_recovery(rs, MaxSector); + } rs_set_cur(rs); } else { /* May not set recovery when a device rebuild is requested */ @@ -3395,7 +3420,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || test_bit(MD_RECOVERY_RUNNING, &recovery)) r = mddev->curr_resync_completed; - else r = mddev->recovery_cp; @@ -3904,10 +3928,33 @@ static int raid_preresume(struct dm_target *ti) struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; - /* This is a resume after a suspend of the set -> it's already started */ + /* This is a resume after a suspend of the set -> it's already started. */ if (test_and_set_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags)) return 0; + if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { + struct raid_set *rs_active = rs_find_active(rs); + + if (rs_active) { + /* + * In case no rebuilds have been requested + * and an active table slot exists, copy + * current resynchonization completed and + * reshape position pointers across from + * suspended raid set in the active slot. 
+ * + * This resumes the new mapping at current + * offsets to continue recover/reshape without + * necessarily redoing a raid set partially or + * causing data corruption in case of a reshape. + */ + if (rs_active->md.curr_resync_completed != MaxSector) + mddev->curr_resync_completed = rs_active->md.curr_resync_completed; + if (rs_active->md.reshape_position != MaxSector) + mddev->reshape_position = rs_active->md.reshape_position; + } + } + /* * The superblocks need to be updated on disk if the * array is new or new devices got added (thus zeroed @@ -3968,28 +4015,13 @@ static void raid_resume(struct dm_target *ti) attempt_restore_of_faulty_devices(rs); } - /* Only reduce raid set size before running a disk removing reshape. */ - if (mddev->delta_disks < 0) - rs_set_capacity(rs); - - /* - * Keep the RAID set frozen if reshape/rebuild flags are set. - * The RAID set is unfrozen once the next table load/resume, - * which clears the reshape/rebuild flags, occurs. - * This ensures that the constructor for the inactive table - * retrieves an up-to-date reshape_position. - */ - if (!test_and_clear_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags) && - !(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) { - if (rs_is_reshapable(rs)) { - if (!rs_is_reshaping(rs) || _get_reshape_sectors(rs)) - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - } - if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + /* Only reduce raid set size before running a disk removing reshape. */ + if (mddev->delta_disks < 0) + rs_set_capacity(rs); + mddev_lock_nointr(mddev); + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); mddev->ro = 0; mddev->in_sync = 0; mddev_resume(mddev); @@ -3999,7 +4031,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 13, 1}, + .version = {1, 13, 2}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v1.2.3 From dc15b943d4651bc13b9737bb27283ad9d3b8eeba Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:19 +0100 Subject: dm raid: ensure 'a' chars during reshape During reshape, 'A' chars were reported in status rather than 'a'. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 109b001407a8..af4f40de2c0b 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3451,6 +3451,15 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, */ set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + } else if (test_bit(MD_RECOVERY_RESHAPE, &recovery) && + !test_bit(MD_RECOVERY_REQUESTED, &recovery)) { + /* + * If "reshape" is occurring, the raid set + * is or may be out of sync hence the health + * characters shall be 'a'. + */ + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + } else if (test_bit(MD_RECOVERY_REQUESTED, &recovery)) { /* * If "check" or "repair" is occurring, the raid set has -- cgit v1.2.3 From 7c29744eccecc2c74c9b4d1ea0a60b4d95229399 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:20 +0100 Subject: dm raid: simplify rs_get_progress() No need to calculate the reshaping progress because mddev->curr_resync_completed holds it. 
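The simplification rests on one observation: MD maintains mddev->curr_resync_completed for sync, recovery and reshape alike, so the hand-rolled reshape arithmetic (reversing for a backward reshape, dividing by the number of data stripes) duplicated work MD already does. The progress selection collapses to (condensed from the hunk below):

	if (test_bit(MD_RECOVERY_NEEDED, &recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
	    test_bit(MD_RECOVERY_RUNNING, &recovery))
		r = mddev->curr_resync_completed;	/* live position from MD */
	else
		r = mddev->recovery_cp;			/* idle: last checkpoint */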
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index af4f40de2c0b..21e007c89c2e 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3399,26 +3399,9 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - /* Reshape is relative to the array size */ - if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) { - r = mddev->reshape_position; - if (r != MaxSector) { - /* Got to reverse on backward reshape */ - if (mddev->reshape_backwards) - r = mddev->array_sectors - r; - - /* Divide by # of data stripes unless raid1 */ - if (!rs_is_raid1(rs)) - sector_div(r, mddev_data_stripes(rs)); - } - - /* - * Sync/recover is relative to the component device size. - * - * MD_RECOVERY_NEEDED for https://bugzilla.redhat.com/show_bug.cgi?id=1508070 - */ - } else if (test_bit(MD_RECOVERY_NEEDED, &recovery) || - test_bit(MD_RECOVERY_RUNNING, &recovery)) + if (test_bit(MD_RECOVERY_NEEDED, &recovery) || + test_bit(MD_RECOVERY_RESHAPE, &recovery) || + test_bit(MD_RECOVERY_RUNNING, &recovery)) r = mddev->curr_resync_completed; else r = mddev->recovery_cp; -- cgit v1.2.3 From 552aa679f265743163fb440c61370a9c51f66c81 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 13 Dec 2017 17:13:21 +0100 Subject: dm raid: use rs_is_raid*() Cleanup, no functional change. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 21e007c89c2e..7d7dc1723180 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -588,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout) } /* Return md raid10 algorithm for @name */ -static int raid10_name_to_format(const char *name) +static const int raid10_name_to_format(const char *name) { if (!strcasecmp(name, "near")) return ALGORITHM_RAID10_NEAR; @@ -1913,7 +1913,7 @@ static bool rs_reshape_requested(struct raid_set *rs) if (rs_takeover_requested(rs)) return false; - if (!mddev->level) + if (rs_is_raid0(rs)) return false; change = mddev->new_layout != mddev->layout || @@ -1921,7 +1921,7 @@ static bool rs_reshape_requested(struct raid_set *rs) rs->delta_disks; /* Historical case to support raid1 reshape without delta disks */ - if (mddev->level == 1) { + if (rs_is_raid1(rs)) { if (rs->delta_disks) return !!rs->delta_disks; @@ -1929,7 +1929,7 @@ static bool rs_reshape_requested(struct raid_set *rs) mddev->raid_disks != rs->raid_disks; } - if (mddev->level == 10) + if (rs_is_raid10(rs)) return change && !__is_raid10_far(mddev->new_layout) && rs->delta_disks >= 0; @@ -2742,14 +2742,14 @@ static int rs_setup_takeover(struct raid_set *rs) sector_t new_data_offset = rs->dev[0].rdev.data_offset ? 
0 : rs->data_offset; if (rt_is_raid10(rs->raid_type)) { - if (mddev->level == 0) { + if (rs_is_raid0(rs)) { /* Userpace reordered disks -> adjust raid_disk indexes */ __reorder_raid_disk_indexes(rs); /* raid0 -> raid10_far layout */ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_FAR, rs->raid10_copies); - } else if (mddev->level == 1) + } else if (rs_is_raid1(rs)) /* raid1 -> raid10_near layout */ mddev->layout = raid10_format_to_md_layout(rs, ALGORITHM_RAID10_NEAR, rs->raid_disks); @@ -2977,7 +2977,7 @@ static void configure_discard_support(struct raid_set *rs) /* * XXX: RAID level 4,5,6 require zeroing for safety. */ - raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6); + raid456 = rs_is_raid456(rs); for (i = 0; i < rs->raid_disks; i++) { struct request_queue *q; @@ -3002,7 +3002,7 @@ static void configure_discard_support(struct raid_set *rs) * RAID1 and RAID10 personalities require bio splitting, * RAID0/4/5/6 don't and process large discard bios properly. */ - ti->split_discard_bios = !!(rs->md.level == 1 || rs->md.level == 10); + ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs)); ti->num_discard_bios = 1; } -- cgit v1.2.3 From c06b3e583750fa8f1d214ca50c86d936f6a329c6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 21 Nov 2017 08:44:35 -0500 Subject: dm: fix comment above dm_accept_partial_bio Clarify that dm_accept_partial_bio isn't allowed for REQ_OP_ZONE_RESET bios. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index de17b7193299..fb9e6d808170 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -997,7 +997,7 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, /* * A target may call dm_accept_partial_bio only from the map routine. It is - * allowed for all bio types except REQ_PREFLUSH. + * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET. * * dm_accept_partial_bio informs the dm that the target only wants to process * additional n_sectors sectors of the bio and the rest of the data should be -- cgit v1.2.3 From 80cd17578310dbaf880ae0db9240ad2218c5811a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 30 Aug 2017 08:10:18 +1000 Subject: dm crypt: remove BIOSET_NEED_RESCUER flag The BIOSET_NEED_RESCUER flag is only needed when a make_request_fn might do two allocations from the one bioset, and the second one could block until the first bio completes. dm-crypt does allocate from this bioset inside the dm make_request_fn, but does so using GFP_NOWAIT so that the allocation will not block. So BIOSET_NEED_RESCUER is not needed. 
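The non-blocking allocation referred to takes two lines. A sketch modelled on dm-crypt's read path of this era (names borrowed from the driver; treat this as illustrative, not a quote):

	/*
	 * Clone from the per-target bioset without sleeping; on failure
	 * the caller defers to the kcryptd workqueue instead of waiting
	 * on the mempool.  Because the make_request path can never block
	 * on this bioset, no rescuer thread is required.
	 */
	struct bio *clone = bio_clone_fast(io->base_bio, GFP_NOWAIT, cc->bs);
	if (!clone)
		return 1;	/* retry later from workqueue context */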
Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9fc12f556534..9c53367d2f3e 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2697,8 +2697,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->bs = bioset_create(MIN_IOS, 0, (BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)); + cc->bs = bioset_create(MIN_IOS, 0, BIOSET_NEED_BVECS); if (!cc->bs) { ti->error = "Cannot allocate crypt bioset"; goto bad; -- cgit v1.2.3 From c110a4b6e603ece6134fe436e84957f7a4cd099e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 30 Aug 2017 08:10:18 +1000 Subject: dm io: remove BIOSET_NEED_RESCUER flag from bios bioset The BIOSET_NEED_RESCUER flag is only needed when a make_request_fn might do two allocations from the one bioset, and the second one could block until the first bio completes. dm_io() is called from make_request_fn() context. The closest it comes to multiple allocations is in chunk_io() in dm-snap-persistent. But there the code uses a separate thread to avoid problems. So BIOSET_NEED_RESCUER is not needed. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index b4357ed4d541..a8d914d5abbe 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -58,8 +58,7 @@ struct dm_io_client *dm_io_client_create(void) if (!client->pool) goto bad; - client->bios = bioset_create(min_ios, 0, (BIOSET_NEED_BVECS | - BIOSET_NEED_RESCUER)); + client->bios = bioset_create(min_ios, 0, BIOSET_NEED_BVECS); if (!client->bios) goto bad; -- cgit v1.2.3 From 18a25da84354c6bb655320de6072c00eda6eb602 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 6 Sep 2017 09:43:28 +1000 Subject: dm: ensure bio submission follows a depth-first tree walk A dm device can, in general, represent a tree of targets, each of which handles a sub-range of the range of blocks handled by the parent. The bio sequencing managed by generic_make_request() requires that bios are generated and handled in a depth-first manner. Each call to a make_request_fn() may submit bios to a single member device, and may submit bios for a reduced region of the same device as the make_request_fn. In particular, any bios submitted to member devices must be expected to be processed in order, so a later one must never wait for an earlier one. This ordering is usually achieved by using bio_split() to reduce a bio to a size that can be completely handled by one target, and resubmitting the remainder to the originating device. bio_queue_split() shows the canonical approach. dm doesn't follow this approach, largely because it has needed to split bios since long before bio_split() was available. It currently can submit bios to separate targets within the one dm_make_request() call. Dependencies between these targets, as can happen with dm-snap, can cause deadlocks if either bios gets stuck behind the other in the queues managed by generic_make_request(). This requires the 'rescue' functionality provided by dm_offload_{start,end}. Some of this requirement can be removed by changing the order of bio submission to follow the canonical approach. That is, if dm finds that it needs to split a bio, the remainder should be sent to generic_make_request() rather than being handled immediately. 
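The canonical approach cited above reduces to the following shape inside a make_request_fn (a sketch patterned after bio_queue_split(); max_sectors stands in for whatever bound the target can process in one go):

	if (bio_sectors(bio) > max_sectors) {
		struct bio *split = bio_split(bio, max_sectors,
					      GFP_NOIO, q->bio_split);

		bio_chain(split, bio);		/* tie the completions together */
		generic_make_request(bio);	/* remainder: queued for later */
		bio = split;			/* process the front piece now */
	}

Because the remainder re-enters generic_make_request() rather than being handled recursively, it cannot get stuck behind bios submitted while processing the front piece.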
This delays the handling until the first part is completely processed, so the deadlock problems do not occur. __split_and_process_bio() can be called both from dm_make_request() and from dm_wq_work(). When called from dm_wq_work() the current approach is perfectly satisfactory as each bio will be processed immediately. When called from dm_make_request(), current->bio_list will be non-NULL, and in this case it is best to create a separate "clone" bio for the remainder. When we use bio_clone_bioset() to split off the front part of a bio and chain the two together and submit the remainder to generic_make_request(), it is important that the newly allocated bio is used as the head to be processed immediately, and the original bio gets "bio_advance()"d and sent to generic_make_request() as the remainder. Otherwise, if the newly allocated bio is used as the remainder, and if it then needs to be split again, then the next bio_clone_bioset() call will be made while holding a reference a bio (result of the first clone) from the same bioset. This can potentially exhaust the bioset mempool and result in a memory allocation deadlock. Note that there is no race caused by reassigning cio.io->bio after already calling __map_bio(). This bio will only be dereferenced again after dec_pending() has found io->io_count to be zero, and this cannot happen before the dec_pending() call at the end of __split_and_process_bio(). To provide the clone bio when splitting, we use q->bio_split. This was previously being freed by bio-based dm to avoid having excess rescuer threads. As bio_split bio sets no longer create rescuer threads, there is little cost and much gain from restoring the q->bio_split bio set. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index fb9e6d808170..07dec8ece083 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1498,8 +1498,29 @@ static void __split_and_process_bio(struct mapped_device *md, } else { ci.bio = bio; ci.sector_count = bio_sectors(bio); - while (ci.sector_count && !error) + while (ci.sector_count && !error) { error = __split_and_process_non_flush(&ci); + if (current->bio_list && ci.sector_count && !error) { + /* + * Remainder must be passed to generic_make_request() + * so that it gets handled *after* bios already submitted + * have been completely processed. + * We take a clone of the original to store in + * ci.io->bio to be used by end_io_acct() and + * for dec_pending to use for completion handling. + * As this path is not used for REQ_OP_ZONE_REPORT, + * the usage of io->bio in dm_remap_zone_report() + * won't be affected by this reassignment. + */ + struct bio *b = bio_clone_bioset(bio, GFP_NOIO, + md->queue->bio_split); + ci.io->bio = b; + bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9); + bio_chain(b, bio); + generic_make_request(bio); + break; + } + } } /* drop the extra reference count */ @@ -1510,8 +1531,8 @@ static void __split_and_process_bio(struct mapped_device *md, *---------------------------------------------------------------*/ /* - * The request function that just remaps the bio built up by - * dm_merge_bvec. + * The request function that remaps the bio to one target and + * splits off any remainder. 
*/ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) { @@ -2034,12 +2055,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) case DM_TYPE_DAX_BIO_BASED: dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); - /* - * DM handles splitting bios as needed. Free the bio_split bioset - * since it won't be used (saves 1 process per bio-based DM device). - */ - bioset_free(md->queue->bio_split); - md->queue->bio_split = NULL; if (type == DM_TYPE_DAX_BIO_BASED) queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue); -- cgit v1.2.3 From f31c21e4365c02ccf7226c33ea978cd5dbfc351e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 22 Nov 2017 14:25:18 +1100 Subject: dm: remove unused 'num_write_bios' target interface No DM target provides num_write_bios and none has since dm-cache's brief use in 2013. Having the possibility of num_write_bios > 1 complicates bio allocation. So remove the interface and assume there is only one bio needed. If a target ever needs more, it must provide a suitable bioset and allocate itself based on its particular needs. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 07dec8ece083..2480c6abe8f1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1319,32 +1319,22 @@ static int __send_empty_flush(struct clone_info *ci) } static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, unsigned *len) + sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; - unsigned target_bio_nr; - unsigned num_target_bios = 1; - int r = 0; + int r; - /* - * Does the target want to receive duplicate copies of the bio? - */ - if (bio_data_dir(bio) == WRITE && ti->num_write_bios) - num_target_bios = ti->num_write_bios(ti, bio); - - for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { - tio = alloc_tio(ci, ti, target_bio_nr); - tio->len_ptr = len; - r = clone_bio(tio, bio, sector, *len); - if (r < 0) { - free_tio(tio); - break; - } - __map_bio(tio); + tio = alloc_tio(ci, ti, 0); + tio->len_ptr = len; + r = clone_bio(tio, bio, sector, *len); + if (r < 0) { + free_tio(tio); + return r; } + __map_bio(tio); - return r; + return 0; } typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); -- cgit v1.2.3 From 318716ddea0829d3be566efc69d31029c40d51e2 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 22 Nov 2017 14:56:12 -0500 Subject: dm: safely allocate multiple bioset bios DM targets can request multiple bios be sent to them by DM core (see: num_{flush,discard,write_same,write_zeroes}_bios). But until now these bios were allocated in an unsafe manner than could potentially exhaust the DM device's bioset -- in the face of multiple threads each trying to do multiple allocations from the same DM device's bioset. Fix __send_duplicate_bios() by using the new alloc_multiple_bios(). The allocation strategy used by alloc_multiple_bios() models that used by dm-crypt.c:crypt_alloc_buffer(). Neil Brown initially proposed this fix but the implementation has been revised enough that it inappropriate to attribute the entirety of it to him. 
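The strategy, which the listing below truncates mid-hunk, is a two-pass allocation. A simplified, self-contained sketch (the function name is invented; the lock and bioset fields mirror dm core of this era):

	static void alloc_bios_sketch(struct mapped_device *md,
				      struct bio_list *blist, unsigned num_bios)
	{
		int try;

		for (try = 0; try < 2; try++) {
			unsigned bio_nr;
			struct bio *bio;

			if (try)
				mutex_lock(&md->table_devices_lock);
			for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
				/* Pass 0 must not sleep; pass 1 may. */
				bio = bio_alloc_bioset(try ? GFP_NOIO : GFP_NOWAIT,
						       0, md->bs);
				if (!bio)
					break;
				bio_list_add(blist, bio);
			}
			if (try)
				mutex_unlock(&md->table_devices_lock);
			if (bio_nr == num_bios)
				return;		/* full set allocated */

			/* Partial set: release everything and retry blocking. */
			while ((bio = bio_list_pop(blist)))
				bio_put(bio);
		}
	}

Holding the mutex across the blocking pass means at most one thread can sleep on the bioset at a time, so concurrent multi-bio allocations can no longer exhaust the mempool.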
Suggested-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2480c6abe8f1..79b8f072e76a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1264,16 +1264,17 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, return 0; } -static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti, - unsigned target_bio_nr) +static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti, + unsigned target_bio_nr, gfp_t gf