diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-24 14:42:19 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-24 14:42:19 -0800 |
commit | a682e0035494c449e53a57d039f86f75b9e2fe67 (patch) | |
tree | 382d6c2d4729e6ed8f697fd528209a2b4701b618 /drivers/md | |
parent | 1802979ab1ee8ec5a72987ad518f5a91bf41cd89 (diff) | |
parent | 1ec492232ed659acde8cc00b9ecc7529778e03e1 (diff) |
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull md updates from Shaohua Li:
"Mainly fixes bugs and improves performance:
- Improve scalability for raid1 from Coly
- Improve raid5-cache read performance, disk efficiency and IO
pattern from Song and me
- Fix a race condition of disk hotplug for linear from Coly
- A few cleanup patches from Ming and Byungchul
- Fix a memory leak from Neil
- Fix WRITE SAME IO failure from me
- Add doc for raid5-cache from me"
* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (23 commits)
md/raid1: fix write behind issues introduced by bio_clone_bioset_partial
md/raid1: handle flush request correctly
md/linear: shutup lockdep warnning
md/raid1: fix a use-after-free bug
RAID1: avoid unnecessary spin locks in I/O barrier code
RAID1: a new I/O barrier implementation to remove resync window
md/raid5: Don't reinvent the wheel but use existing llist API
md: fast clone bio in bio_clone_mddev()
md: remove unnecessary check on mddev
md/raid1: use bio_clone_bioset_partial() in case of write behind
md: fail if mddev->bio_set can't be created
block: introduce bio_clone_bioset_partial()
md: disable WRITE SAME if it fails in underlayer disks
md/raid5-cache: exclude reclaiming stripes in reclaim check
md/raid5-cache: stripe reclaim only counts valid stripes
MD: add doc for raid5-cache
Documentation: move MD related doc into a separate dir
md: ensure md devices are freed before module is unloaded.
md/r5cache: improve journal device efficiency
md/r5cache: enable chunk_aligned_read with write back cache
...
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/faulty.c | 2 | ||||
-rw-r--r-- | drivers/md/linear.c | 41 | ||||
-rw-r--r-- | drivers/md/linear.h | 1 | ||||
-rw-r--r-- | drivers/md/md.c | 22 | ||||
-rw-r--r-- | drivers/md/md.h | 9 | ||||
-rw-r--r-- | drivers/md/multipath.c | 1 | ||||
-rw-r--r-- | drivers/md/raid0.c | 1 | ||||
-rw-r--r-- | drivers/md/raid1.c | 596 | ||||
-rw-r--r-- | drivers/md/raid1.h | 58 | ||||
-rw-r--r-- | drivers/md/raid10.c | 11 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 225 | ||||
-rw-r--r-- | drivers/md/raid5.c | 129 | ||||
-rw-r--r-- | drivers/md/raid5.h | 7 |
13 files changed, 768 insertions, 335 deletions
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 685aa2d77e25..b0536cfd8e17 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) } } if (failit) { - struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); + struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); b->bi_bdev = conf->rdev->bdev; b->bi_private = bio; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index f1c7bbac31a5..3e38e0207a3e 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) return conf->disks + lo; } +/* + * In linear_congested() conf->raid_disks is used as a copy of + * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks + * and conf->disks[] are created in linear_conf(), they are always + * consitent with each other, but mddev->raid_disks does not. + */ static int linear_congested(struct mddev *mddev, int bits) { struct linear_conf *conf; int i, ret = 0; - conf = mddev->private; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); - for (i = 0; i < mddev->raid_disks && !ret ; i++) { + for (i = 0; i < conf->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(q->backing_dev_info, bits); } + rcu_read_unlock(); return ret; } @@ -144,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->disks[i-1].end_sector + conf->disks[i].rdev->sectors; + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + return conf; out: @@ -196,15 +217,24 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. + */ mddev_suspend(mddev); - oldconf = mddev->private; + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held(&mddev->reconfig_mutex)); mddev->raid_disks++; - mddev->private = newconf; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree(oldconf); + kfree_rcu(oldconf, rcu); return 0; } @@ -262,6 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(split->bi_bdev), split, disk_devt(mddev->gendisk), bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); } } while (split != bio); diff --git a/drivers/md/linear.h b/drivers/md/linear.h index b685ddd7d7f7..8d392e6098b3 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -10,6 +10,7 @@ struct linear_conf { struct rcu_head rcu; sector_t array_sectors; + int raid_disks; /* a copy of mddev->raid_disks */ struct dev_info disks[0]; }; #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index ba485dcf1064..985374f20e2e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -190,16 +190,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, } EXPORT_SYMBOL_GPL(bio_alloc_mddev); -struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev) -{ - if (!mddev || !mddev->bio_set) - return bio_clone(bio, gfp_mask); - - return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); -} -EXPORT_SYMBOL_GPL(bio_clone_mddev); - /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -5228,8 +5218,11 @@ int md_run(struct mddev *mddev) sysfs_notify_dirent_safe(rdev->sysfs_state); } - if (mddev->bio_set == NULL) + if (mddev->bio_set == NULL) { mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!mddev->bio_set) + return -ENOMEM; + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -8980,7 +8973,14 @@ static __exit void md_exit(void) for_each_mddev(mddev, tmp) { export_array(mddev); + mddev->ctime = 0; mddev->hold_active = 0; + /* + * for_each_mddev() will call mddev_put() at the end of each + * iteration. As the mddev is now fully clear, this will + * schedule the mddev for destruction by a workqueue, and the + * destroy_workqueue() below will wait for that to complete. + */ } destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); diff --git a/drivers/md/md.h b/drivers/md/md.h index 2a514036a83d..b8859cbf84b6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -673,8 +673,6 @@ extern void md_rdev_clear(struct md_rdev *rdev); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); -extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); @@ -710,4 +708,11 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, { mddev->flags &= ~unsupported_flags; } + +static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) +{ + if (bio_op(bio) == REQ_OP_WRITE_SAME && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + mddev->queue->limits.max_write_same_sectors = 0; +} #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index d457afa672d5..79a12b59250b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -138,6 +138,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; + mddev_check_writesame(mddev, &mp_bh->bio); generic_make_request(&mp_bh->bio); return; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d6585239bff2..93347ca7c7a6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -503,6 +503,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(split->bi_bdev), split, disk_devt(mddev->gendisk), bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); } } while (split != bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 830ff2b20346..7453d94eeed7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -71,9 +71,8 @@ */ static int max_queued_requests = 1024; -static void allow_barrier(struct r1conf *conf, sector_t start_next_window, - sector_t bi_sector); -static void lower_barrier(struct r1conf *conf); +static void allow_barrier(struct r1conf *conf, sector_t sector_nr); +static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #define raid1_log(md, fmt, args...) \ do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) @@ -100,7 +99,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) -#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { @@ -205,6 +203,7 @@ static void free_r1bio(struct r1bio *r1_bio) static void put_buf(struct r1bio *r1_bio) { struct r1conf *conf = r1_bio->mddev->private; + sector_t sect = r1_bio->sector; int i; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -215,7 +214,7 @@ static void put_buf(struct r1bio *r1_bio) mempool_free(r1_bio, conf->r1buf_pool); - lower_barrier(conf); + lower_barrier(conf, sect); } static void reschedule_retry(struct r1bio *r1_bio) @@ -223,10 +222,12 @@ static void reschedule_retry(struct r1bio *r1_bio) unsigned long flags; struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; + int idx; + idx = sector_to_idx(r1_bio->sector); spin_lock_irqsave(&conf->device_lock, flags); list_add(&r1_bio->retry_list, &conf->retry_list); - conf->nr_queued ++; + atomic_inc(&conf->nr_queued[idx]); spin_unlock_irqrestore(&conf->device_lock, flags); wake_up(&conf->wait_barrier); @@ -243,7 +244,6 @@ static void call_bio_endio(struct r1bio *r1_bio) struct bio *bio = r1_bio->master_bio; int done; struct r1conf *conf = r1_bio->mddev->private; - sector_t start_next_window = r1_bio->start_next_window; sector_t bi_sector = bio->bi_iter.bi_sector; if (bio->bi_phys_segments) { @@ -269,7 +269,7 @@ static void call_bio_endio(struct r1bio *r1_bio) * Wake up any possible resync thread that waits for the device * to go idle. */ - allow_barrier(conf, start_next_window, bi_sector); + allow_barrier(conf, bi_sector); } } @@ -517,6 +517,25 @@ static void raid1_end_write_request(struct bio *bio) bio_put(to_put); } +static sector_t align_to_barrier_unit_end(sector_t start_sector, + sector_t sectors) +{ + sector_t len; + + WARN_ON(sectors == 0); + /* + * len is the number of sectors from start_sector to end of the + * barrier unit which start_sector belongs to. + */ + len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - + start_sector; + + if (len > sectors) + len = sectors; + + return len; +} + /* * This routine returns the disk from which the requested read should * be done. There is a per-array 'next expected sequential IO' sector @@ -813,168 +832,228 @@ static void flush_pending_writes(struct r1conf *conf) */ static void raise_barrier(struct r1conf *conf, sector_t sector_nr) { + int idx = sector_to_idx(sector_nr); + spin_lock_irq(&conf->resync_lock); /* Wait until no block IO is waiting */ - wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, + wait_event_lock_irq(conf->wait_barrier, + !atomic_read(&conf->nr_waiting[idx]), conf->resync_lock); /* block any new IO from starting */ - conf->barrier++; - conf->next_resync = sector_nr; + atomic_inc(&conf->barrier[idx]); + /* + * In raise_barrier() we firstly increase conf->barrier[idx] then + * check conf->nr_pending[idx]. In _wait_barrier() we firstly + * increase conf->nr_pending[idx] then check conf->barrier[idx]. + * A memory barrier here to make sure conf->nr_pending[idx] won't + * be fetched before conf->barrier[idx] is increased. Otherwise + * there will be a race between raise_barrier() and _wait_barrier(). + */ + smp_mb__after_atomic(); /* For these conditions we must wait: * A: while the array is in frozen state - * B: while barrier >= RESYNC_DEPTH, meaning resync reach - * the max count which allowed. - * C: next_resync + RESYNC_SECTORS > start_next_window, meaning - * next resync will reach to the window which normal bios are - * handling. - * D: while there are any active requests in the current window. + * B: while conf->nr_pending[idx] is not 0, meaning regular I/O + * existing in corresponding I/O barrier bucket. + * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches + * max resync count which allowed on current I/O barrier bucket. */ wait_event_lock_irq(conf->wait_barrier, !conf->array_frozen && - conf->barrier < RESYNC_DEPTH && - conf->current_window_requests == 0 && - (conf->start_next_window >= - conf->next_resync + RESYNC_SECTORS), + !atomic_read(&conf->nr_pending[idx]) && + atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH, conf->resync_lock); - conf->nr_pending++; + atomic_inc(&conf->nr_pending[idx]); spin_unlock_irq(&conf->resync_lock); } -static void lower_barrier(struct r1conf *conf) +static void lower_barrier(struct r1conf *conf, sector_t sector_nr) { - unsigned long flags; - BUG_ON(conf->barrier <= 0); - spin_lock_irqsave(&conf->resync_lock, flags); - conf->barrier--; - conf->nr_pending--; - spin_unlock_irqrestore(&conf->resync_lock, flags); + int idx = sector_to_idx(sector_nr); + + BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); + + atomic_dec(&conf->barrier[idx]); + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } -static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) +static void _wait_barrier(struct r1conf *conf, int idx) { - bool wait = false; + /* + * We need to increase conf->nr_pending[idx] very early here, + * then raise_barrier() can be blocked when it waits for + * conf->nr_pending[idx] to be 0. Then we can avoid holding + * conf->resync_lock when there is no barrier raised in same + * barrier unit bucket. Also if the array is frozen, I/O + * should be blocked until array is unfrozen. + */ + atomic_inc(&conf->nr_pending[idx]); + /* + * In _wait_barrier() we firstly increase conf->nr_pending[idx], then + * check conf->barrier[idx]. In raise_barrier() we firstly increase + * conf->barrier[idx], then check conf->nr_pending[idx]. A memory + * barrier is necessary here to make sure conf->barrier[idx] won't be + * fetched before conf->nr_pending[idx] is increased. Otherwise there + * will be a race between _wait_barrier() and raise_barrier(). + */ + smp_mb__after_atomic(); - if (conf->array_frozen || !bio) - wait = true; - else if (conf->barrier && bio_data_dir(bio) == WRITE) { - if ((conf->mddev->curr_resync_completed - >= bio_end_sector(bio)) || - (conf->start_next_window + NEXT_NORMALIO_DISTANCE - <= bio->bi_iter.bi_sector)) - wait = false; - else - wait = true; - } + /* + * Don't worry about checking two atomic_t variables at same time + * here. If during we check conf->barrier[idx], the array is + * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is + * 0, it is safe to return and make the I/O continue. Because the + * array is frozen, all I/O returned here will eventually complete + * or be queued, no race will happen. See code comment in + * frozen_array(). + */ + if (!READ_ONCE(conf->array_frozen) && + !atomic_read(&conf->barrier[idx])) + return; - return wait; + /* + * After holding conf->resync_lock, conf->nr_pending[idx] + * should be decreased before waiting for barrier to drop. + * Otherwise, we may encounter a race condition because + * raise_barrer() might be waiting for conf->nr_pending[idx] + * to be 0 at same time. + */ + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for the barrier in same barrier unit bucket to drop. */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen && + !atomic_read(&conf->barrier[idx]), + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); + spin_unlock_irq(&conf->resync_lock); } -static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) +static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) { - sector_t sector = 0; + int idx = sector_to_idx(sector_nr); - spin_lock_irq(&conf->resync_lock); - if (need_to_wait_for_sync(conf, bio)) { - conf->nr_waiting++; - /* Wait for the barrier to drop. - * However if there are already pending - * requests (preventing the barrier from - * rising completely), and the - * per-process bio queue isn't empty, - * then don't wait, as we need to empty - * that queue to allow conf->start_next_window - * to increase. - */ - raid1_log(conf->mddev, "wait barrier"); - wait_event_lock_irq(conf->wait_barrier, - !conf->array_frozen && - (!conf->barrier || - ((conf->start_next_window < - conf->next_resync + RESYNC_SECTORS) && - current->bio_list && - !bio_list_empty(current->bio_list))), - conf->resync_lock); - conf->nr_waiting--; - } - - if (bio && bio_data_dir(bio) == WRITE) { - if (bio->bi_iter.bi_sector >= conf->next_resync) { - if (conf->start_next_window == MaxSector) - conf->start_next_window = - conf->next_resync + - NEXT_NORMALIO_DISTANCE; - - if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) - <= bio->bi_iter.bi_sector) - conf->next_window_requests++; - else - conf->current_window_requests++; - sector = conf->start_next_window; - } - } + /* + * Very similar to _wait_barrier(). The difference is, for read + * I/O we don't need wait for sync I/O, but if the whole array + * is frozen, the read I/O still has to wait until the array is + * unfrozen. Since there is no ordering requirement with + * conf->barrier[idx] here, memory barrier is unnecessary as well. + */ + atomic_inc(&conf->nr_pending[idx]); - conf->nr_pending++; + if (!READ_ONCE(conf->array_frozen)) + return; + + spin_lock_irq(&conf->resync_lock); + atomic_inc(&conf->nr_waiting[idx]); + atomic_dec(&conf->nr_pending[idx]); + /* + * In case freeze_array() is waiting for + * get_unqueued_pending() == extra + */ + wake_up(&conf->wait_barrier); + /* Wait for array to be unfrozen */ + wait_event_lock_irq(conf->wait_barrier, + !conf->array_frozen, + conf->resync_lock); + atomic_inc(&conf->nr_pending[idx]); + atomic_dec(&conf->nr_waiting[idx]); spin_unlock_irq(&conf->resync_lock); - return sector; } -static void allow_barrier(struct r1conf *conf, sector_t start_next_window, - sector_t bi_sector) +static void wait_barrier(struct r1conf *conf, sector_t sector_nr) { - unsigned long flags; + int idx = sector_to_idx(sector_nr); - spin_lock_irqsave(&conf->resync_lock, flags); - conf->nr_pending--; - if (start_next_window) { - if (start_next_window == conf->start_next_window) { - if (conf->start_next_window + NEXT_NORMALIO_DISTANCE - <= bi_sector) - conf->next_window_requests--; - else - conf->current_window_requests--; - } else - conf->current_window_requests--; - - if (!conf->current_window_requests) { - if (conf->next_window_requests) { - conf->current_window_requests = - conf->next_window_requests; - conf->next_window_requests = 0; - conf->start_next_window += - NEXT_NORMALIO_DISTANCE; - } else - conf->start_next_window = MaxSector; - } - } - spin_unlock_irqrestore(&conf->resync_lock, flags); + _wait_barrier(conf, idx); +} + +static void wait_all_barriers(struct r1conf *conf) +{ + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + _wait_barrier(conf, idx); +} + +static void _allow_barrier(struct r1conf *conf, int idx) +{ + atomic_dec(&conf->nr_pending[idx]); wake_up(&conf->wait_barrier); } +static void allow_barrier(struct r1conf *conf, sector_t sector_nr) +{ + int idx = sector_to_idx(sector_nr); + + _allow_barrier(conf, idx); +} + +static void allow_all_barriers(struct r1conf *conf) +{ + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + _allow_barrier(conf, idx); +} + +/* conf->resync_lock should be held */ +static int get_unqueued_pending(struct r1conf *conf) +{ + int idx, ret; + + for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++) + ret += atomic_read(&conf->nr_pending[idx]) - + atomic_read(&conf->nr_queued[idx]); + + return ret; +} + static void freeze_array(struct r1conf *conf, int extra) { - /* stop syncio and normal IO and wait for everything to + /* Stop sync I/O and normal I/O and wait for everything to * go quite. - * We wait until nr_pending match nr_queued+extra - * This is called in the context of one normal IO request - * that has failed. Thus any sync request that might be pending - * will be blocked by nr_pending, and we need to wait for - * pending IO requests to complete or be queued for re-try. - * Thus the number queued (nr_queued) plus this request (extra) - * must match the number of pending IOs (nr_pending) before - * we continue. + * This is called in two situations: + * 1) management command handlers (reshape, remove disk, quiesce). + * 2) one normal I/O request failed. + + * After array_frozen is set to 1, new sync IO will be blocked at + * raise_barrier(), and new normal I/O will blocked at _wait_barrier() + * or wait_read_barrier(). The flying I/Os will either complete or be + * queued. When everything goes quite, there are only queued I/Os left. + + * Every flying I/O contributes to a conf->nr_pending[idx], idx is the + * barrier bucket index which this I/O request hits. When all sync and + * normal I/O are queued, sum of all conf->nr_pending[] will match sum + * of all conf->nr_queued[]. But normal I/O failure is an exception, + * in handle_read_error(), we may call freeze_array() before trying to + * fix the read error. In this case, the error read I/O is not queued, + * so get_unqueued_pending() == 1. + * + * Therefore before this function returns, we need to wait until + * get_unqueued_pendings(conf) gets equal to extra. For + * normal I/O context, extra is 1, in rested situations extra is 0. */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 1; raid1_log(conf->mddev, "wait freeze"); - wait_event_lock_irq_cmd(conf->wait_barrier, - conf->nr_pending == conf->nr_queued+extra, - conf->resync_lock, - flush_pending_writes(conf)); + wait_event_lock_irq_cmd( + conf->wait_barrier, + get_unqueued_pending(conf) == extra, + conf->resync_lock, + flush_pending_writes(conf)); spin_unlock_irq(&conf->resync_lock); } static void unfreeze_array(struct r1conf *conf) @@ -982,8 +1061,8 @@ static void unfreeze_array(struct r1conf *conf) /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 0; - wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); + wake_up(&conf->wait_barrier); } /* duplicate the data pages for behind I/O @@ -1070,11 +1149,28 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) kfree(plug); } -static void raid1_read_request(struct mddev *mddev, struct bio *bio, - struct r1bio *r1_bio) +static inline struct r1bio * +alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled) +{ + struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; + + r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); + + r1_bio->master_bio = bio; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; + r1_bio->state = 0; + r1_bio->mddev = mddev; + r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + + return r1_bio; +} + +static void raid1_read_request(struct mddev *mddev, struct bio *bio) { struct r1conf *conf = mddev->private; struct raid1_info *mirror; + struct r1bio *r1_bio; struct bio *read_bio; struct bitmap *bitmap = mddev->bitmap; const int op = bio_op(bio); @@ -1083,8 +1179,29 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, int max_sectors; int rdisk; - wait_barrier(conf, bio); + /* + * Still need barrier for READ in case that whole + * array is frozen. + */ + wait_read_barrier(conf, bio->bi_iter.bi_sector); + + r1_bio = alloc_r1bio(mddev, bio, 0); + /* + * We might need to issue multiple reads to different + * devices if there are bad blocks around, so we keep + * track of the number of reads in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); + + /* + * make_request() can abort the operation when read-ahead is being + * used and no empty request is available. + */ read_again: rdisk = read_balance(conf, r1_bio, &max_sectors); @@ -1106,9 +1223,8 @@ read_again: atomic_read(&bitmap->behind_writes) == 0); } r1_bio->read_disk = rdisk; - r1_bio->start_next_window = 0; - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); @@ -1151,22 +1267,16 @@ read_again: */ reschedule_retry(r1_bio); - r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); - - r1_bio->master_bio = bio; - r1_bio->sectors = bio_sectors(bio) - sectors_handled; - r1_bio->state = 0; - r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; + r1_bio = alloc_r1bio(mddev, bio, sectors_handled); goto read_again; } else generic_make_request(read_bio); } -static void raid1_write_request(struct mddev *mddev, struct bio *bio, - struct r1bio *r1_bio) +static void raid1_write_request(struct mddev *mddev, struct bio *bio) { struct r1conf *conf = mddev->private; + struct r1bio *r1_bio; int i, disks; struct bitmap *bitmap = mddev->bitmap; unsigned long flags; @@ -1176,7 +1286,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int sectors_handled; int max_sectors; - sector_t start_next_window; /* * Register the new request and wait if the reconstruction @@ -1212,7 +1321,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } finish_wait(&conf->wait_barrier, &w); } - start_next_window = wait_barrier(conf, bio); + wait_barrier(conf, bio->bi_iter.bi_sector); + + r1_bio = alloc_r1bio(mddev, bio, 0); + + /* We might need to issue multiple writes to different + * devices if there are bad blocks around, so we keep + * track of the number of writes in bio->bi_phys_segments. + * If this is 0, there is only one r1_bio and no locking + * will be needed when requests complete. If it is + * non-zero, then it is the number of not-completed requests. + */ + bio->bi_phys_segments = 0; + bio_clear_flag(bio, BIO_SEG_VALID); if (conf->pending_count >= max_queued_requests) { md_wakeup_thread(mddev->thread); @@ -1233,7 +1354,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, disks = conf->raid_disks * 2; retry_write: - r1_bio->start_next_window = start_next_window; blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1300,25 +1420,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ int j; - sector_t old = start_next_window; for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); + allow_barrier(conf, bio->bi_iter.bi_sector); raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); - start_next_window = wait_barrier(conf, bio); - /* - * We must make sure the multi r1bios of bio have - * the same value of bi_phys_segments - */ - if (bio->bi_phys_segments && old && - old != start_next_window) - /* Wait for the former r1bio(s) to complete */ - wait_event(conf->wait_barrier, - bio->bi_phys_segments == 1); + wait_barrier(conf, bio->bi_iter.bi_sector); goto retry_write; } @@ -1341,13 +1451,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, first_clone = 1; for (i = 0; i < disks; i++) { - struct bio *mbio; + struct bio *mbio = NULL; + sector_t offset; if (!r1_bio->bios[i]) continue; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, - max_sectors); + offset = r1_bio->sector - bio->bi_iter.bi_sector; if (first_clone) { /* do behind I/O ? @@ -1357,8 +1466,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) + !waitqueue_active(&bitmap->behind_wait)) { + mbio = bio_clone_bioset_partial(bio, GFP_NOIO, + mddev->bio_set, + offset << 9, + max_sectors << 9); alloc_behind_pages(mbio, r1_bio); + } bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, @@ -1366,6 +1480,19 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, &r1_bio->state)); first_clone = 0; } + + if (!mbio) { + if (r1_bio->behind_bvecs) + mbio = bio_clone_bioset_partial(bio, GFP_NOIO, + mddev->bio_set, + offset << 9, + max_sectors << 9); + else { + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + bio_trim(mbio, offset, max_sectors); + } + } + if (r1_bio->behind_bvecs) { struct bio_vec *bvec; int j; @@ -1385,8 +1512,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_opf = bio_op(bio) | - (bio->bi_opf & (REQ_SYNC | REQ_PREFLUSH | REQ_FUA)); + mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && conf->raid_disks - mddev->degraded > 1) @@ -1427,12 +1553,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, /* We need another r1_bio. It has already been counted * in bio->bi_phys_segments |