From bc78c57388e7f447f58e30d60b1505ddaaaf3a7d Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Thu, 11 Oct 2012 13:08:02 +1100 Subject: md/linear: rcu_dereference outside read-lock section According to the comment in linear_stop function rcu_dereference in linear_start and linear_stop functions occurs under reconfig_mutex. The patch represents this agreement in code and prevents lockdep complaint. Found by Linux Driver Verification project (linuxtesting.org) Signed-off-by: Denis Efremov Signed-off-by: NeilBrown --- drivers/md/linear.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index fa211d80fc0a..92c64d162a92 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -244,7 +244,9 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; - oldconf = rcu_dereference(mddev->private); + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); mddev->raid_disks++; rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); @@ -256,7 +258,10 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) static int linear_stop (struct mddev *mddev) { - struct linear_conf *conf = mddev->private; + struct linear_conf *conf = + rcu_dereference_protected(mddev->private, + lockdep_is_held( + &mddev->reconfig_mutex)); /* * We do not require rcu protection here since -- cgit v1.2.3 From f1cad2b68ed12c0f82d3f56e150691f62b6f5edf Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:08:44 +1100 Subject: md: linear supports TRIM This makes md linear support TRIM. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/linear.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 92c64d162a92..21014836bdbf 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -138,6 +138,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) struct linear_conf *conf; struct md_rdev *rdev; int i, cnt; + bool discard_supported = false; conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), GFP_KERNEL); @@ -171,6 +172,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->array_sectors += rdev->sectors; cnt++; + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } if (cnt != raid_disks) { printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n", @@ -178,6 +181,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) goto out; } + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + /* * Here we calculate the device offsets. */ @@ -331,6 +339,14 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) bio->bi_sector = bio->bi_sector - start_sector + tmp_dev->rdev->data_offset; rcu_read_unlock(); + + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio, 0); + return; + } + generic_make_request(bio); } -- cgit v1.2.3 From c83057a1f4f987327c49448b046d9625c612ed8e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:25:44 +1100 Subject: md: raid 0 supports TRIM This makes md raid 0 support TRIM. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid0.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index a9e4fa95dfaa..24b359717a7e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -88,6 +88,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); + bool discard_supported = false; if (!conf) return -ENOMEM; @@ -195,6 +196,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; + + if (blk_queue_discard(bdev_get_queue(rdev1->bdev))) + discard_supported = true; } if (cnt != mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " @@ -272,6 +276,11 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) blk_queue_io_opt(mddev->queue, (mddev->chunk_sectors << 9) * mddev->raid_disks); + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -423,6 +432,7 @@ static int raid0_run(struct mddev *mddev) return -EINVAL; blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { @@ -510,7 +520,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) sector_t sector = bio->bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers @@ -536,6 +546,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) bio->bi_sector = sector_offset + zone->dev_start + tmp_dev->data_offset; + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { + /* Just ignore it */ + bio_endio(bio, 0); + return; + } + generic_make_request(bio); return; -- cgit v1.2.3 From 2ff8cc2c6d4e323de71a42affeb3041fa17d5b10 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:28:54 +1100 Subject: md: raid 1 supports TRIM This makes md raid 1 support TRIM. If one disk supports discard and another not, or one has discard_zero_data and another not, there could be inconsistent between data from such disks. But this should not matter, discarded data is useless. This will add extra copy in rebuild though. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid1.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 611b5f797618..05c557e8f862 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -781,7 +781,12 @@ static void flush_pending_writes(struct r1conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -994,6 +999,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1295,7 +1302,7 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync; + mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); @@ -1549,6 +1556,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; } @@ -2783,6 +2792,7 @@ static int run(struct mddev *mddev) int i; struct md_rdev *rdev; int ret; + bool discard_supported = false; if (mddev->level != 1) { printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", @@ -2812,6 +2822,8 @@ static int run(struct mddev *mddev) continue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } mddev->degraded = 0; @@ -2846,6 +2858,13 @@ static int run(struct mddev *mddev) mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); + + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } ret = md_integrity_register(mddev); -- cgit v1.2.3 From 532a2a3fba8df076d65fdf17518eeb327b37a313 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:30:52 +1100 Subject: md: raid 10 supports TRIM This makes md raid 10 support TRIM. If one disk supports discard and another not, or one has discard_zero_data and another not, there could be inconsistent between data from such disks. But this should not matter, discarded data is useless. This will add extra copy in rebuild though. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid10.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1c2eb38f3c51..f92e0ed59be0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -911,7 +911,12 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -1061,6 +1066,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); unsigned long flags; struct md_rdev *blocked_rdev; int sectors_handled; @@ -1081,7 +1088,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) || conf->prev.near_copies < conf->prev.raid_disks))) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers @@ -1410,7 +1417,7 @@ retry_write: conf->mirrors[d].rdev)); mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1439,7 +1446,7 @@ retry_write: conf->mirrors[d].replacement)); mbio->bi_bdev = conf->mirrors[d].replacement->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1723,6 +1730,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + print_conf(conf); return err; } @@ -3480,6 +3490,7 @@ static int run(struct mddev *mddev) sector_t size; sector_t min_offset_diff = 0; int first = 1; + bool discard_supported = false; if (mddev->private == NULL) { conf = setup_conf(mddev); @@ -3496,6 +3507,8 @@ static int run(struct mddev *mddev) chunk_size = mddev->chunk_sectors << 9; if (mddev->queue) { + blk_queue_max_discard_sectors(mddev->queue, + mddev->chunk_sectors); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); @@ -3541,8 +3554,16 @@ static int run(struct mddev *mddev) rdev->data_offset << 9); disk->head_position = 0; + + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", -- cgit v1.2.3 From 57c67df48866d57b50d72eb198ffcc0cf7a6232d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:32:13 +1100 Subject: md/raid10: submit IO from originating thread instead of md thread. queuing writes to the md thread means that all requests go through the one processor which may not be able to keep up with very high request rates. So use the plugging infrastructure to submit all requests on unplug. If a 'schedule' is needed, we fall back on the old approach of handing the requests to the thread for it to handle. This is nearly identical to a recent patch which provided similar functionality to RAID1. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f92e0ed59be0..05dc96a950d5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1055,6 +1055,44 @@ static sector_t choose_data_offset(struct r10bio *r10_bio, return rdev->new_data_offset; } +struct raid10_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r10conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r10conf *conf = mddev->private; @@ -1070,6 +1108,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) & (REQ_DISCARD | REQ_SECURE)); unsigned long flags; struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid10_plug_cb *plug = NULL; int sectors_handled; int max_sectors; int sectors; @@ -1421,11 +1461,22 @@ retry_write: mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) -- cgit v1.2.3 From 4ed8731d8e6bd2a88a30697fbf4f7e6e979a6c46 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:34:00 +1100 Subject: MD: change the parameter of md thread Change the thread parameter, so the thread can carry extra info. Next patch will use it. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 9 +++++---- drivers/md/md.h | 7 ++++--- drivers/md/multipath.c | 3 ++- drivers/md/raid1.c | 3 ++- drivers/md/raid10.c | 3 ++- drivers/md/raid5.c | 3 ++- 6 files changed, 17 insertions(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 7a2b0793f66e..8e842b326ebd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6641,7 +6641,7 @@ static int md_thread(void * arg) clear_bit(THREAD_WAKEUP, &thread->flags); if (!kthread_should_stop()) - thread->run(thread->mddev); + thread->run(thread); } return 0; @@ -6656,8 +6656,8 @@ void md_wakeup_thread(struct md_thread *thread) } } -struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev, - const char *name) +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) { struct md_thread *thread; @@ -7206,8 +7206,9 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -void md_do_sync(struct mddev *mddev) +void md_do_sync(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct mddev *mddev2; unsigned int currspeed = 0, window; diff --git a/drivers/md/md.h b/drivers/md/md.h index f385b038589d..93ada214b50e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -540,12 +540,13 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) struct md_thread { - void (*run) (struct mddev *mddev); + void (*run) (struct md_thread *thread); struct mddev *mddev; wait_queue_head_t wqueue; unsigned long flags; struct task_struct *tsk; unsigned long timeout; + void *private; }; #define THREAD_WAKEUP 0 @@ -584,7 +585,7 @@ static inline void safe_put_page(struct page *p) extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); extern struct md_thread *md_register_thread( - void (*run)(struct mddev *mddev), + void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); extern void md_unregister_thread(struct md_thread **threadp); @@ -603,7 +604,7 @@ extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, extern void md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op); -extern void md_do_sync(struct mddev *mddev); +extern void md_do_sync(struct md_thread *thread); extern void md_new_event(struct mddev *mddev); extern int md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 61a1833ebaf3..1642eae75a33 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -335,8 +335,9 @@ abort: * 3. Performs writes following reads for array syncronising. */ -static void multipathd (struct mddev *mddev) +static void multipathd(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 05c557e8f862..55ccf4730536 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2294,8 +2294,9 @@ read_more: } } -static void raid1d(struct mddev *mddev) +static void raid1d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r1bio *r1_bio; unsigned long flags; struct r1conf *conf = mddev->private; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 05dc96a950d5..54860604d097 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2732,8 +2732,9 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } } -static void raid10d(struct mddev *mddev) +static void raid10d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r10bio *r10_bio; unsigned long flags; struct r10conf *conf = mddev->private; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index adda94df5eb2..81c02d63440b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4625,8 +4625,9 @@ static int handle_active_stripes(struct r5conf *conf) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. */ -static void raid5d(struct mddev *mddev) +static void raid5d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r5conf *conf = mddev->private; int handled; struct blk_plug plug; -- cgit v1.2.3 From 1ca69c4bc4b1ef889861db39f325901eadbf9497 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:37:33 +1100 Subject: md: avoid taking the mutex on some ioctls. Some ioctls don't need to take the mutex and doing so can cause a delay as it is held during super-block update. So move those ioctls out of the mutex and rely on rcu locking to ensure we don't access stale data. Signed-off-by: NeilBrown --- drivers/md/md.c | 85 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 23 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e842b326ebd..feab588adb50 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -674,7 +674,18 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) return NULL; } -static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) +static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} + +static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; @@ -685,6 +696,17 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) return NULL; } +static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + static struct md_personality *find_pers(int level, char *clevel) { struct md_personality *pers; @@ -5509,8 +5531,9 @@ static int get_array_info(struct mddev * mddev, void __user * arg) int nr,working,insync,failed,spare; struct md_rdev *rdev; - nr=working=insync=failed=spare=0; - rdev_for_each(rdev, mddev) { + nr = working = insync = failed = spare = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -5522,6 +5545,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) spare++; } } + rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; @@ -5605,7 +5629,8 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - rdev = find_rdev_nr(mddev, info.number); + rcu_read_lock(); + rdev = find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -5624,6 +5649,7 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) info.raid_disk = -1; info.state = (1<pers == NULL) return -ENODEV; - rdev = find_rdev(mddev, dev); + rcu_read_lock(); + rdev = find_rdev_rcu(mddev, dev); if (!rdev) - return -ENODEV; - - md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - return -EBUSY; - return 0; + err = -ENODEV; + else { + md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + err = -EBUSY; + } + rcu_read_unlock(); + return err; } /* @@ -6315,6 +6345,27 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } + /* Some actions do not requires the mutex */ + switch (cmd) { + case GET_ARRAY_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_array_info(mddev, argp); + goto abort; + + case GET_DISK_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_disk_info(mddev, argp); + goto abort; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto abort; + } + err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -6387,18 +6438,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, */ switch (cmd) { - case GET_ARRAY_INFO: - err = get_array_info(mddev, argp); - goto done_unlock; - case GET_BITMAP_FILE: err = get_bitmap_file(mddev, argp); goto done_unlock; - case GET_DISK_INFO: - err = get_disk_info(mddev, argp); - goto done_unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); goto done_unlock; @@ -6480,10 +6523,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = hot_add_disk(mddev, new_decode_dev(arg)); goto done_unlock; - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto done_unlock; - case RUN_ARRAY: err = do_md_run(mddev); goto done_unlock; -- cgit v1.2.3 From 2863b9eb44787adecba4f977d71d7fd876805b1c Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:38:58 +1100 Subject: MD RAID10: Prep for DM RAID10 device replacement capability MD RAID10: Fix a couple potential kernel panics if RAID10 is used by dm-raid When device-mapper uses the RAID10 personality through dm-raid.c, there is no 'gendisk' structure in mddev and some sysfs information is also not populated. This patch avoids touching those non-existent structures. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/md.c | 10 ++++++++-- drivers/md/raid10.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index feab588adb50..85e6786653ea 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2044,8 +2044,14 @@ EXPORT_SYMBOL(md_integrity_register); /* Disable data integrity if non-capable/non-matching disk is being added */ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { - struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); + struct blk_integrity *bi_rdev; + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return; + + bi_rdev = bdev_get_integrity(rdev->bdev); + bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ return; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 54860604d097..fb5bd607e15c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1694,7 +1694,7 @@ static int raid10_spare_active(struct mddev *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); -- cgit v1.2.3 From eb6491236f283eb6ebc5421fcdb14b86701a7e36 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:40:09 +1100 Subject: DM RAID: Move 'rebuild' checking code to its own function DM RAID: Move chunk of code to it's own function The code that checks whether device replacements/rebuilds are possible given a specific RAID type is moved to it's own function. It will further expand when the code to check RAID10 is added. A separate function makes it easier to read. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 75 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 50 insertions(+), 25 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 982e3e390c45..10635e965fec 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -337,6 +337,52 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) return 0; } +/* + * validate_rebuild_devices + * @rs + * + * Determine if the devices specified for rebuild can result in a valid + * usable array that is capable of rebuilding the given devices. + * + * Returns: 0 on success, -EINVAL on failure. + */ +static int validate_rebuild_devices(struct raid_set *rs) +{ + unsigned i, rebuild_cnt = 0; + + if (!(rs->print_flags & DMPF_REBUILD)) + return 0; + + for (i = 0; i < rs->md.raid_disks; i++) + if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) + rebuild_cnt++; + + switch (rs->raid_type->level) { + case 1: + if (rebuild_cnt >= rs->md.raid_disks) + goto too_many; + break; + case 4: + case 5: + case 6: + if (rebuild_cnt > rs->raid_type->parity_devs) + goto too_many; + break; + case 10: + default: + DMERR("The rebuild parameter is not supported for %s", + rs->raid_type->name); + rs->ti->error = "Rebuild not supported for this RAID type"; + return -EINVAL; + } + + return 0; + +too_many: + rs->ti->error = "Too many rebuild devices specified"; + return -EINVAL; +} + /* * Possible arguments are... * [optional_args] @@ -365,7 +411,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, { char *raid10_format = "near"; unsigned raid10_copies = 2; - unsigned i, rebuild_cnt = 0; + unsigned i; unsigned long value, region_size = 0; sector_t sectors_per_dev = rs->ti->len; sector_t max_io_len; @@ -461,30 +507,6 @@ static int parse_raid_params(struct raid_set *rs, char **argv, /* Parameters that take a numeric value are checked here */ if (!strcasecmp(key, "rebuild")) { - rebuild_cnt++; - - switch (rs->raid_type->level) { - case 1: - if (rebuild_cnt >= rs->md.raid_disks) { - rs->ti->error = "Too many rebuild devices specified"; - return -EINVAL; - } - break; - case 4: - case 5: - case 6: - if (rebuild_cnt > rs->raid_type->parity_devs) { - rs->ti->error = "Too many rebuild devices specified for given RAID type"; - return -EINVAL; - } - break; - case 10: - default: - DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); - rs->ti->error = "Rebuild not supported for this RAID type"; - return -EINVAL; - } - if (value > rs->md.raid_disks) { rs->ti->error = "Invalid rebuild index given"; return -EINVAL; @@ -608,6 +630,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv, } rs->md.dev_sectors = sectors_per_dev; + if (validate_rebuild_devices(rs)) + return -EINVAL; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; -- cgit v1.2.3 From 4ec1e369af83f7ecdfbd48a905e44fc9910115ba Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:40:24 +1100 Subject: DM RAID: Add rebuild capability for RAID10 DM RAID: Add code to validate replacement slots for RAID10 arrays RAID10 can handle 'copies - 1' failures for each mirror group. This code ensures the user has provided a valid array - one whose devices specified for rebuild do not exceed the amount of redundancy available. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 10635e965fec..4e79ebaab3c1 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -349,6 +349,7 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) static int validate_rebuild_devices(struct raid_set *rs) { unsigned i, rebuild_cnt = 0; + unsigned rebuilds_per_group, copies, d; if (!(rs->print_flags & DMPF_REBUILD)) return 0; @@ -369,6 +370,37 @@ static int validate_rebuild_devices(struct raid_set *rs) goto too_many; break; case 10: + copies = raid10_md_layout_to_copies(rs->md.layout); + if (rebuild_cnt < copies) + break; + + /* + * It is possible to have a higher rebuild count for RAID10, + * as long as the failed devices occur in different mirror + * groups (i.e. different stripes). + * + * Right now, we only allow for "near" copies. When other + * formats are added, we will have to check those too. + * + * When checking "near" format, make sure no adjacent devices + * have failed beyond what can be handled. In addition to the + * simple case where the number of devices is a multiple of the + * number of copies, we must also handle cases where the number + * of devices is not a multiple of the number of copies. + * E.g. dev1 dev2 dev3 dev4 dev5 + * A A B B C + * C D D E E + */ + rebuilds_per_group = 0; + for (i = 0; i < rs->md.raid_disks * copies; i++) { + d = i % rs->md.raid_disks; + if (!test_bit(In_sync, &rs->dev[d].rdev.flags) && + (++rebuilds_per_group >= copies)) + goto too_many; + if (!((i + 1) % copies)) + rebuilds_per_group = 0; + } + break; default: DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); @@ -1385,7 +1417,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 3, 0}, + .version = {1, 3, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v1.2.3 From 7386199c471f70526bbcc629f072a5a8effe218f Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:40:36 +1100 Subject: DM RAID: Fix comparison of index and quantity for "rebuild" parameter DM RAID: Fix comparison of index and quantity for "rebuild" parameter The "rebuild" parameter takes an index argument that starts counting from zero. The conditional used to validate the index was using '>' rather than '>=', leaving the door open for an index value that would be 1 too large. Reported-by: Neil Brown Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 4e79ebaab3c1..89a06a361332 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -539,7 +539,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, /* Parameters that take a numeric value are checked here */ if (!strcasecmp(key, "rebuild")) { - if (value > rs->md.raid_disks) { + if (value >= rs->md.raid_disks) { rs->ti->error = "Invalid rebuild index given"; return -EINVAL; } -- cgit v1.2.3 From 761becff016b82a6a7a1b2ef224248da5f46bae9 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:42:19 +1100 Subject: DM RAID: Fix for "sync" directive ineffectiveness There are two table arguments that can be given to a DM RAID target that control whether the array is forced to (re)synchronize or skip initialization: "sync" and "nosync". When "sync" is given, we set mddev->recovery_cp to 0 in order to cause the device to resynchronize. This is insufficient if there is a bitmap in use, because the array will simply look at the bitmap and see that there is no recovery necessary. The fix is to skip over the loading of the superblocks when "sync" is given, causing new superblocks to be written that will force the array to go through initialization (i.e. synchronization). Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/dm-raid.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 89a06a361332..45d94a7e7f6d 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1017,6 +1017,19 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) freshest = NULL; rdev_for_each_safe(rdev, tmp, mddev) { + /* + * Skipping super_load due to DMPF_SYNC will cause + * the array to undergo initialization again as + * though it were new. This is the intended effect + * of the "sync" directive. + * + * When reshaping capability is added, we must ensure + * that the "sync" directive is disallowed during the + * reshape. + */ + if (rs->print_flags & DMPF_SYNC) + continue; + if (!rdev->meta_bdev) continue; -- cgit v1.2.3 From fd177481b440c3f7b5ee9b821a76b29fdf2a6712 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 11 Oct 2012 13:43:21 +1100 Subject: raid: replace list_for_each_continue_rcu with new interface This patch replaces list_for_each_continue_rcu() with list_for_each_entry_continue_rcu() to save a few lines of code and allow removing list_for_each_continue_rcu(). Reviewed-by: Paul E. McKenney Signed-off-by: Michael Wang Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 94e7f6ba2e11..df73375c160a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -163,20 +163,17 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde * As devices are only added or removed when raid_disk is < 0 and * nr_pending is 0 and In_sync is clear, the entries we return will * still be in the same position on the list when we re-enter - * list_for_each_continue_rcu. + * list_for_each_entry_continue_rcu. */ - struct list_head *pos; rcu_read_lock(); if (rdev == NULL) /* start at the beginning */ - pos = &mddev->disks; + rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set); else { /* release the previous rdev and start from there. */ rdev_dec_pending(rdev, mddev); - pos = &rdev->same_set; } - list_for_each_continue_rcu(pos, &mddev->disks) { - rdev = list_entry(pos, struct md_rdev, same_set); + list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* this is a usable devices */ -- cgit v1.2.3 From 7ad4d4a68a1a19f21c7b39cb3f51bf17fba6e3d0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:44:30 +1100 Subject: md/raid1: Don't release reference to device while handling read error. When we get a read error, we arrange for raid1d to handle it. Currently we release the reference on the device. This can result in conf->mirrors[read_disk].rdev being NULL in fix_read_error, if the device happens to get removed before the read error is handled. So instead keep the reference until the read error has been fully handled. Reported-by: hank Signed-off-by: NeilBrown --- drivers/md/raid1.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 55ccf4730536..e913356b257e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -333,9 +333,10 @@ static void raid1_end_read_request(struct bio *bio, int error) spin_unlock_irqrestore(&conf->device_lock, flags); } - if (uptodate) + if (uptodate) { raid_end_bio_io(r1_bio); - else { + rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + } else { /* * oops, read error: */ @@ -349,9 +350,8 @@ static void raid1_end_read_request(struct bio *bio, int error) (unsigned long long)r1_bio->sector); set_bit(R1BIO_ReadError, &r1_bio->state); reschedule_retry(r1_bio); + /* don't drop the reference on read_disk yet */ } - - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); } static void close_write(struct r1bio *r1_bio) @@ -2229,6 +2229,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) unfreeze_array(conf); } else md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); + rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); bio = r1_bio->bios[r1_bio->read_disk]; bdevname(bio->bi_bdev, b); -- cgit v1.2.3 From 582e2e056a5c3410174c23f5134e6b00e0db9101 Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 11 Oct 2012 13:45:36 +1100 Subject: md/bitmap:Don't use IS_ERR to judge alloc_page(). Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index df73375c160a..7155945f8eb8 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -470,14 +470,10 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) { bitmap_super_t *sb; unsigned long chunksize, daemon_sleep, write_behind; - int err = -EINVAL; bitmap->storage.sb_page = alloc_page(GFP_KERNEL); - if (IS_ERR(bitmap->storage.sb_page)) { - err = PTR_ERR(bitmap->storage.sb_page); - bitmap->storage.sb_page = NULL; - return err; - } + if (bitmap->storage.sb_page == NULL) + return -ENOMEM; bitmap->storage.sb_page->index = 0; sb = kmap_atomic(bitmap->storage.sb_page); -- cgit v1.2.3 From 620125f2bf8ff0c4969b79653b54d7bcc9d40637 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:49:05 +1100 Subject: MD: raid5 trim support Discard for raid4/5/6 has limitation. If discard request size is small, we do discard for one disk, but we need calculate parity and write parity disk. To correctly calculate parity, zero_after_discard must be guaranteed. Even it's true, we need do discard for one disk but write another disks, which makes the parity disks wear out fast. This doesn't make sense. So an efficient discard for raid4/5/6 should discard all data disks and parity disks, which requires the write pattern to be (A, A+chunk_size, A+chunk_size*2...). If A's size is smaller than chunk_size, such pattern is almost impossible in practice. So in this patch, I only handle the case that A's size equals to chunk_size. That is discard request should be aligned to stripe size and its size is multiple of stripe size. Since we can only handle request with specific alignment and size (or part of the request fitting stripes), we can't guarantee zero_after_discard even zero_after_discard is true in low level drives. The block layer doesn't send down correctly aligned requests even correct discard alignment is set, so I must filter out. For raid4/5/6 parity calculation, if data is 0, parity is 0. So if zero_after_discard is true for all disks, data is consistent after discard. Otherwise, data might be lost. Let's consider a scenario: discard a stripe, write data to one disk and write parity disk. The stripe could be still inconsistent till then depending on using data from other data disks or parity disks to calculate new parity. If the disk is broken, we can't restore it. So in this patch, we only enable discard support if all disks have zero_after_discard. If discard fails in one disk, we face the similar inconsistent issue above. The patch will make discard follow the same path as normal write request. If discard fails, a resync will be scheduled to make the data consistent. This isn't good to have extra writes, but data consistency is important. If a subsequent read/write request hits raid5 cache of a discarded stripe, the discarded dev page should have zero filled, so the data is consistent. This patch will always zero dev page for discarded request stripe. This isn't optimal because discard request doesn't need such payload. Next patch will avoid it. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++- drivers/md/raid5.h | 1 + 2 files changed, 166 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 81c02d63440b..74dcf19cfe68 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE_FUA; else rw = WRITE; + if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags)) + rw |= REQ_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; else if (test_and_clear_bit(R5_WantReplace, @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_rw & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + if (wbi->bi_rw & REQ_DISCARD) { + memset(page_address(dev->page), 0, + STRIPE_SECTORS << 9); + set_bit(R5_Discard, &dev->flags); + } else + tx = async_copy_data(1, wbi, dev->page, + dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); } } @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (pd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + memset(page_address(sh->dev[pd_idx].page), 0, + STRIPE_SECTORS << 9); + set_bit(R5_Discard, &sh->dev[pd_idx].flags); + ops_complete_reconstruct(sh); + return; + } /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, { struct async_submit_ctl submit; struct page **blocks = percpu->scribble; - int count; + int count, i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (sh->pd_idx == i || sh->qd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + memset(page_address(sh->dev[sh->pd_idx].page), 0, + STRIPE_SECTORS << 9); + memset(page_address(sh->dev[sh->qd_idx].page), 0, + STRIPE_SECTORS << 9); + set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + ops_complete_reconstruct(sh); + return; + } + count = set_syndrome_sources(blocks, sh); atomic_inc(&sh->count); @@ -4067,6 +4106,88 @@ static void release_stripe_plug(struct mddev *mddev, release_stripe(sh); } +static void make_discard_request(struct mddev *mddev, struct bio *bi) +{ + struct r5conf *conf = mddev->private; + sector_t logical_sector, last_sector; + struct stripe_head *sh; + int remaining; + int stripe_sectors; + + if (mddev->reshape_position != MaxSector) + /* Skip discard while reshape is happening */ + return; + + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); + + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + + stripe_sectors = conf->chunk_sectors * + (conf->raid_disks - conf->max_degraded); + logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, + stripe_sectors); + sector_div(last_sector, stripe_sectors); + + logical_sector *= conf->chunk_sectors; + last_sector *= conf->chunk_sectors; + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS) { + DEFINE_WAIT(w); + int d; + again: + sh = get_active_stripe(conf, logical_sector, 0, 0, 0); + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); + spin_lock_irq(&sh->stripe_lock); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + if (sh->dev[d].towrite || sh->dev[d].toread) { + set_bit(R5_Overlap, &sh->dev[d].flags); + spin_unlock_irq(&sh->stripe_lock); + release_stripe(sh); + schedule(); + goto again; + } + } + finish_wait(&conf->wait_for_overlap, &w); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + sh->dev[d].towrite = bi; + set_bit(R5_OVERWRITE, &sh->dev[d].flags); + raid5_inc_bi_active_stripes(bi); + } + spin_unlock_irq(&sh->stripe_lock); + if (conf->mddev->bitmap) { + for (d = 0; + d < conf->raid_disks - conf->max_degraded; + d++) + bitmap_startwrite(mddev->bitmap, + sh->sector, + STRIPE_SECTORS, + 0); + sh->bm_seq = conf->seq_flush + 1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe_plug(mddev, sh); + } + + remaining = raid5_dec_bi_active_stripes(bi); + if (remaining == 0) { + md_write_end(mddev); + bio_endio(bi, 0); + } +} + static void make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; @@ -4089,6 +4210,11 @@ static void make_request(struct mddev *mddev, struct bio * bi) chunk_aligned_read(mddev,bi)) return; + if (unlikely(bi->bi_rw & REQ_DISCARD)) { + make_discard_request(mddev, bi); + return; + } + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); bi->bi_next = NULL; @@ -5362,6 +5488,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { int chunk_size; + bool discard_supported = true; /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the * number of raid devices @@ -5381,13 +5508,48 @@ static int run(struct mddev *mddev) blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); + /* + * We can only discard a whole stripe. It doesn't make sense to + * discard data disk but write parity disk + */ + stripe = stripe * PAGE_SIZE; + mddev->queue->limits.discard_alignment = stripe; + mddev->queue->limits.discard_granularity = stripe; + /* + * unaligned part of discard request will be ignored, so can't + * guarantee discard_zerors_data + */ + mddev->queue->limits.discard_zeroes_data = 0; rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->new_data_offset << 9); + /* + * discard_zeroes_data is required, otherwise data + * could be lost. Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + */ + if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || + !bdev_get_queue(rdev->bdev)-> + limits.discard_zeroes_data) + discard_supported = false; } + + if (discard_supported && + mddev->queue->limits.max_discard_sectors >= stripe && + mddev->queue->limits.discard_granularity >= stripe) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } return 0; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a9fc24901eda..18b2c4a8a1fd 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -298,6 +298,7 @@ enum r5dev_flags { R5_WantReplace, /* We need to update the replacement, we have read * data in, and now is a good time to write it out. */ + R5_Discard, /* Discard the stripe */ }; /* -- cgit v1.2.3 From 9e44476851e91c86c98eb92b9bc27fb801f89072 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:49:49 +1100 Subject: MD: raid5 avoid unnecessary zero page for trim We want to avoid zero discarded dev page, because it's useless for discard. But if we don't zero it, another read/write hit such page in the cache and will get inconsistent data. To avoid zero the page, we don't set R5_UPTODATE flag after construction is done. In this way, discard write request is still issued and finished, but read will not hit the page. If the stripe gets accessed soon, we need reread the stripe, but since the chance is low, the reread isn't a big deal. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 74dcf19cfe68..758b77296404 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -547,7 +547,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE_FUA; else rw = WRITE; - if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags)) + if (test_bit(R5_Discard, &sh->dev[i].flags)) rw |= REQ_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; @@ -1172,11 +1172,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_rw & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); - if (wbi->bi_rw & REQ_DISCARD) { - memset(page_address(dev->page), 0, - STRIPE_SECTORS << 9); + if (wbi->bi_rw & REQ_DISCARD) set_bit(R5_Discard, &dev->flags); - } else + else tx = async_copy_data(1, wbi, dev->page, dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); @@ -1194,7 +1192,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; - bool fua = false, sync = false; + bool fua = false, sync = false, discard = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1202,13 +1200,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) for (i = disks; i--; ) { fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); + discard |= test_bit(R5_Discard, &sh->dev[i].flags); } for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - set_bit(R5_UPTODATE, &dev->flags); + if (!discard) + set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) @@ -1252,8 +1252,6 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, } if (i >= sh->disks) { atomic_inc(&sh->count); - memset(page_address(sh->dev[pd_idx].page), 0, - STRIPE_SECTORS << 9); set_bit(R5_Discard, &sh->dev[pd_idx].flags); ops_complete_reconstruct(sh); return; @@ -1314,10 +1312,6 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, } if (i >= sh->disks) { atomic_inc(&sh->count); - memset(page_address(sh->dev[sh->pd_idx].page), 0, - STRIPE_SECTORS << 9); - memset(page_address(sh->dev[sh->qd_idx].page), 0, - STRIPE_SECTORS << 9); set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); ops_complete_reconstruct(sh); @@ -2775,7 +2769,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_and_clear_bit(R5_Discard, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); @@ -3493,10 +3488,12 @@ static void handle_stripe(struct stripe_head *sh) if (s.written && (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) && !test_bit(R5_LOCKED, &pdev->flags) - && test_bit(R5_UPTODATE, &pdev->flags)))) && + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) - && test_bit(R5_UPTODATE, &qdev->flags))))) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) handle_stripe_clean_event(conf, sh, disks, &s.return_bi); /* Now we might consider reading some blocks, either to check/generate @@ -3523,9 +3520,11 @@ static void handle_stripe(struct stripe_head *sh) /* All the 'written' buffers and the parity block are ready to * be written back to disk */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); BUG_ON(sh->qd_idx >= 0 && - !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && -- cgit v1.2.3 From 143c4d0573caebe0ae017097614349697e2280eb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:50:12 +1100 Subject: md/raid5: add some missing locking in handle_failed_stripe. We really should hold the stripe_lock while accessing 'toread' else we could race with add_stripe_bio and corrupt a list. Reported-by: "Jianpeng Ma" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 758b77296404..36c0a158730b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2552,8 +2552,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { + spin_lock_irq(&sh->stripe_lock); bi = sh->dev[i].toread; sh->dev[i].toread = NULL; + spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); if (bi) s->to_read--; -- cgit v1.2.3 From b97390aec4756373168ad2976e1f117b610513ea Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:50:12 +1100 Subject: md/raid5: protect debug message against NULL derefernce. The pr_debug in add_stripe_bio could race with something changing *bip, so it is best to hold the lock until after the pr_debug. Reported-by: "Jianpeng Ma" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 36c0a158730b..d11012604e28 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2436,11 +2436,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } - spin_unlock_irq(&sh->stripe_lock); pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)(*bip)->bi_sector, (unsigned long long)sh->sector, dd_idx); + spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap && firstwrite) { bitmap_startwrite(conf->mddev->bitmap, sh->sector, -- cgit v1.2.3 From a7854487cd7128a30a7f4f5259de9f67d5efb95f Mon Sep 17 00:00:00 2001 From: Alexander Lyakas Date: Thu, 11 Oct 2012 13:50:12 +1100 Subject: md: When RAID5 is dirty, force reconstruct-write instead of read-modify-write. Signed-off-by: Alex Lyakas Suggested-by: Yair Hershko Signed-off-by: NeilBrown --- drivers/md/raid5.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d11012604e28..9de8221f64ec 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2806,12 +2806,25 @@ static void handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - if (conf->max_degraded == 2) { - /* RAID6 requires 'rcw' in current implementation - * Calculate the real rcw later - for now fake it + sector_t recovery_cp = conf->mddev->recovery_cp; + + /* RAID6 requires 'rcw' in current implementation. + * Otherwise, check whether resync is now happening or should start. + * If yes, then the array is dirty (after unclean shutdown or + * initial creation), so parity in some stripes might be inconsistent. + * In this case, we need to always do reconstruct-write, to ensure + * that in case of drive failure or read-error correction, we + * generate correct data from the parity. + */ + if (conf->max_degraded == 2 || + (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { + /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; + pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->max_degraded, (unsigned long long)recovery_cp, + (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; -- cgit v1.2.3 From 1ed850f356a0a422013846b5291acff08815008b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:50:13 +1100 Subject: md/raid5: make sure to_read and to_write never go negative. to_read and to_write are part of the result of analysing a stripe before handling it. Their use is to avoid some loops and tests if the values are known to be zero. Thus it is not a problem if they are a little bit larger than they should be. So decrementing them in handle_failed_stripe serves little value, and due to races it could cause some loops to be skipped incorrectly. So remove those decrements. Reported-by: "Jianpeng Ma" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9de8221f64ec..ab613efbbead 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @