From cc27b0c78c79680d128dbac79de0d40556d041bb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 5 Jun 2017 16:49:39 +1000 Subject: md: fix deadlock between mddev_suspend() and md_write_start() If mddev_suspend() races with md_write_start() we can deadlock with mddev_suspend() waiting for the request that is currently in md_write_start() to complete the ->make_request() call, and md_write_start() waiting for the metadata to be updated to mark the array as 'dirty'. As metadata updates done by md_check_recovery() only happen then the mddev_lock() can be claimed, and as mddev_suspend() is often called with the lock held, these threads wait indefinitely for each other. We fix this by having md_write_start() abort if mddev_suspend() is happening, and ->make_request() aborts if md_write_start() aborted. md_make_request() can detect this abort, decrease the ->active_io count, and wait for mddev_suspend(). Reported-by: Nix Fix: 68866e425be2(MD: no sync IO while suspended) Cc: stable@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 87edc342ccb3..d7847014821a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -277,7 +277,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) bio_endio(bio); return BLK_QC_T_NONE; } - smp_rmb(); /* Ensure implications of 'active' are visible */ +check_suspended: rcu_read_lock(); if (mddev->suspended) { DEFINE_WAIT(__wait); @@ -302,7 +302,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; - mddev->pers->make_request(mddev, bio); + if (!mddev->pers->make_request(mddev, bio)) { + atomic_dec(&mddev->active_io); + wake_up(&mddev->sb_wait); + goto check_suspended; + } cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); @@ -327,6 +331,7 @@ void mddev_suspend(struct mddev *mddev) if (mddev->suspended++) return; synchronize_rcu(); + wake_up(&mddev->sb_wait); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); @@ -7950,12 +7955,14 @@ EXPORT_SYMBOL(md_done_sync); * If we need to update some array metadata (e.g. 'active' flag * in superblock) before writing, schedule a superblock update * and wait for it to complete. + * A return value of 'false' means that the write wasn't recorded + * and cannot proceed as the array is being suspend. */ -void md_write_start(struct mddev *mddev, struct bio *bi) +bool md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) - return; + return true; BUG_ON(mddev->ro == 1); if (mddev->ro == 2) { @@ -7987,7 +7994,12 @@ void md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + percpu_ref_put(&mddev->writes_pending); + return false; + } + return true; } EXPORT_SYMBOL(md_write_start); -- cgit v1.2.3 From 8df72024393cdba2543e55d51297f2b2c4ede46f Mon Sep 17 00:00:00 2001 From: Lidong Zhong Date: Mon, 12 Jun 2017 10:45:55 +0800 Subject: md: change the initialization value for a spare device spot to MD_DISK_ROLE_SPARE The value for spare spot of sb->dev_roles is changed from MD_DISK_ROLE_FAULTY to MD_DISK_ROLE_SPARE to keep align with the value when the superblock is firstly created in userspace. Signed-off-by: Lidong Zhong Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index d7847014821a..528c1452ce54 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1857,7 +1857,7 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; idev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); -- cgit v1.2.3 From 5a85071c2cbcc7d8d8f764b33bf64c76e47d268d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 21 Jun 2017 09:12:21 +1000 Subject: md: use a separate bio_set for synchronous IO. md devices allocate a bio_set and use it for two distinct purposes. mddev->bio_set is used to clone bios as part of sending upper level requests down to lower level devices, and it is also use for synchronous IO such as superblock and bitmap updates, and for correcting read errors. This multiple usage can lead to deadlocks. It is likely that cloned bios might be queued for write and to be waiting for a metadata update before the write can be permitted. If the cloning exhausted mddev->bio_set, the metadata update may not be able to proceed. This scenario has been seen during heavy testing, with lots of IO and lots of memory pressure. Address this by adding a new bio_set specifically for synchronous IO. All synchronous IO goes directly to the underlying device and is not queued at the md level, so request using entries from the new mddev->sync_set will complete in a timely fashion. Requests that use mddev->bio_set will sometimes need to wait for synchronous IO, but will no longer risk deadlocking that iO. Also: small simplification in mddev_put(): there is no need to wait until the spinlock is released before calling bioset_free(). Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 528c1452ce54..65ad837aeb54 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -203,6 +203,14 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, } EXPORT_SYMBOL_GPL(bio_alloc_mddev); +static struct bio *md_bio_alloc_sync(struct mddev *mddev) +{ + if (!mddev->sync_set) + return bio_alloc(GFP_NOIO, 1); + + return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set); +} + /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -467,8 +475,6 @@ static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(struct mddev *mddev) { - struct bio_set *bs = NULL; - if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; if (!mddev->raid_disks && list_empty(&mddev->disks) && @@ -476,8 +482,12 @@ static void mddev_put(struct mddev *mddev) /* Array is not configured at all, and not held active, * so destroy it */ list_del_init(&mddev->all_mddevs); - bs = mddev->bio_set; + if (mddev->bio_set) + bioset_free(mddev->bio_set); + if (mddev->sync_set) + bioset_free(mddev->sync_set); mddev->bio_set = NULL; + mddev->sync_set = NULL; if (mddev->gendisk) { /* We did a probe so need to clean up. Call * queue_work inside the spinlock so that @@ -490,8 +500,6 @@ static void mddev_put(struct mddev *mddev) kfree(mddev); } spin_unlock(&all_mddevs_lock); - if (bs) - bioset_free(bs); } static void md_safemode_timeout(unsigned long data); @@ -756,7 +764,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, if (test_bit(Faulty, &rdev->flags)) return; - bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); + bio = md_bio_alloc_sync(mddev); atomic_inc(&rdev->nr_pending); @@ -788,7 +796,7 @@ int md_super_wait(struct mddev *mddev) int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int op, int op_flags, bool metadata_op) { - struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); + struct bio *bio = md_bio_alloc_sync(rdev->mddev); int ret; bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? @@ -5437,6 +5445,11 @@ int md_run(struct mddev *mddev) if (!mddev->bio_set) return -ENOMEM; } + if (mddev->sync_set == NULL) { + mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0); + if (!mddev->sync_set) + return -ENOMEM; + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); -- cgit v1.2.3 From 7f053a6a745557b3f3ad63e9d28ba85c3c0b1563 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jun 2017 09:19:49 -0700 Subject: MD: fix a null dereference rdev->mddev could be null in start time. Reported-by: Ming Lei Fix: 5a85071c2cbc(md: use a separate bio_set for synchronous IO.) Cc: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 65ad837aeb54..092b48f8095e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -205,7 +205,7 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev); static struct bio *md_bio_alloc_sync(struct mddev *mddev) { - if (!mddev->sync_set) + if (!mddev || !mddev->sync_set) return bio_alloc(GFP_NOIO, 1); return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set); -- cgit v1.2.3 From 7184ef8bab0cb865c3cea9dd1a675771145df0af Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 3 Jul 2017 14:34:23 -0700 Subject: MD: fix sleep in atomic bioset_free() will take a mutex, so can't get called with spinlock hold. Fix: 5a85071c2cbc(md: use a separate bio_set for synchronous IO.) Cc: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 092b48f8095e..66f6b928a80b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -475,6 +475,8 @@ static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(struct mddev *mddev) { + struct bio_set *bs = NULL, *sync_bs = NULL; + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; if (!mddev->raid_disks && list_empty(&mddev->disks) && @@ -482,10 +484,8 @@ static void mddev_put(struct mddev *mddev) /* Array is not configured at all, and not held active, * so destroy it */ list_del_init(&mddev->all_mddevs); - if (mddev->bio_set) - bioset_free(mddev->bio_set); - if (mddev->sync_set) - bioset_free(mddev->sync_set); + bs = mddev->bio_set; + sync_bs = mddev->sync_set; mddev->bio_set = NULL; mddev->sync_set = NULL; if (mddev->gendisk) { @@ -500,6 +500,10 @@ static void mddev_put(struct mddev *mddev) kfree(mddev); } spin_unlock(&all_mddevs_lock); + if (bs) + bioset_free(bs); + if (sync_bs) + bioset_free(sync_bs); } static void md_safemode_timeout(unsigned long data); -- cgit v1.2.3