summaryrefslogtreecommitdiffstats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r--drivers/md/raid5.c498
1 files changed, 291 insertions, 207 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 643d217bfa13..704ef7fcfbf8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
return slot;
}
-static void return_io(struct bio *return_bi)
+static void return_io(struct bio_list *return_bi)
{
- struct bio *bi = return_bi;
- while (bi) {
-
- return_bi = bi->bi_next;
- bi->bi_next = NULL;
+ struct bio *bi;
+ while ((bi = bio_list_pop(return_bi)) != NULL) {
bi->bi_iter.bi_size = 0;
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
- bio_endio(bi, 0);
- bi = return_bi;
+ bio_endio(bi);
}
}
@@ -357,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf,
struct list_head *list = &temp_inactive_list[size - 1];
/*
- * We don't hold any lock here yet, get_active_stripe() might
+ * We don't hold any lock here yet, raid5_get_active_stripe() might
* remove stripes from the list
*/
if (!list_empty_careful(list)) {
@@ -417,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf,
return count;
}
-static void release_stripe(struct stripe_head *sh)
+void raid5_release_stripe(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
unsigned long flags;
@@ -662,9 +658,9 @@ static int has_failed(struct r5conf *conf)
return 0;
}
-static struct stripe_head *
-get_active_stripe(struct r5conf *conf, sector_t sector,
- int previous, int noblock, int noquiesce)
+struct stripe_head *
+raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
+ int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
int hash = stripe_hash_locks_hash(sector);
@@ -759,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
/* Only freshly new full stripe normal write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{
+ struct r5conf *conf = sh->raid_conf;
+
+ if (conf->log)
+ return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
is_full_stripe_write(sh);
@@ -862,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
unlock_out:
unlock_two_stripes(head, sh);
out:
- release_stripe(head);
+ raid5_release_stripe(head);
}
/* Determine if 'data_offset' or 'new_data_offset' should be used
@@ -887,9 +887,9 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
}
static void
-raid5_end_read_request(struct bio *bi, int error);
+raid5_end_read_request(struct bio *bi);
static void
-raid5_end_write_request(struct bio *bi, int error);
+raid5_end_write_request(struct bio *bi);
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
@@ -899,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
+ if (r5l_write_stripe(conf->log, sh) == 0)
+ return;
for (i = disks; i--; ) {
int rw;
int replace_only = 0;
@@ -1177,7 +1179,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
static void ops_complete_biofill(void *stripe_head_ref)
{
struct stripe_head *sh = stripe_head_ref;
- struct bio *return_bi = NULL;
+ struct bio_list return_bi = BIO_EMPTY_LIST;
int i;
pr_debug("%s: stripe %llu\n", __func__,
@@ -1201,20 +1203,18 @@ static void ops_complete_biofill(void *stripe_head_ref)
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_active_stripes(rbi)) {
- rbi->bi_next = return_bi;
- return_bi = rbi;
- }
+ if (!raid5_dec_bi_active_stripes(rbi))
+ bio_list_add(&return_bi, rbi);
rbi = rbi2;
}
}
}
clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
- return_io(return_bi);
+ return_io(&return_bi);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void ops_run_biofill(struct stripe_head *sh)
@@ -1277,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref)
if (sh->check_state == check_state_compute_run)
sh->check_state = check_state_compute_result;
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
/* return a pointer to the address conversion region of the scribble buffer */
@@ -1703,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
}
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void
@@ -1861,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref)
sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
@@ -2023,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
/* we just created an active stripe so... */
atomic_inc(&conf->active_stripes);
- release_stripe(sh);
+ raid5_release_stripe(sh);
conf->max_nr_stripes++;
return 1;
}
@@ -2242,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (!p)
err = -ENOMEM;
}
- release_stripe(nsh);
+ raid5_release_stripe(nsh);
}
/* critical section pass, GFP_NOIO no longer needed */
@@ -2256,7 +2256,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
static int drop_one_stripe(struct r5conf *conf)
{
struct stripe_head *sh;
- int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
+ int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
spin_lock_irq(conf->hash_locks + hash);
sh = get_free_stripe(conf, hash);
@@ -2277,17 +2277,15 @@ static void shrink_stripes(struct r5conf *conf)
drop_one_stripe(conf))
;
- if (conf->slab_cache)
- kmem_cache_destroy(conf->slab_cache);
+ kmem_cache_destroy(conf->slab_cache);
conf->slab_cache = NULL;
}
-static void raid5_end_read_request(struct bio * bi, int error)
+static void raid5_end_read_request(struct bio * bi)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE];
struct md_rdev *rdev = NULL;
sector_t s;
@@ -2296,9 +2294,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
if (bi == &sh->dev[i].req)
break;
- pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
+ pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
+ bi->bi_error);
if (i == disks) {
BUG();
return;
@@ -2317,7 +2315,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
s = sh->sector + rdev->new_data_offset;
else
s = sh->sector + rdev->data_offset;
- if (uptodate) {
+ if (!bi->bi_error) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
/* Note that this cannot happen on a
@@ -2402,16 +2400,15 @@ static void raid5_end_read_request(struct bio * bi, int error)
rdev_dec_pending(rdev, conf->mddev);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
-static void raid5_end_write_request(struct bio *bi, int error)
+static void raid5_end_write_request(struct bio *bi)
{
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
struct md_rdev *uninitialized_var(rdev);
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
sector_t first_bad;
int bad_sectors;
int replacement = 0;
@@ -2434,23 +2431,23 @@ static void raid5_end_write_request(struct bio *bi, int error)
break;
}
}
- pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+ pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
(unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
+ bi->bi_error);
if (i == disks) {
BUG();
return;
}
if (replacement) {
- if (!uptodate)
+ if (bi->bi_error)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
STRIPE_SECTORS,
&first_bad, &bad_sectors))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
- if (!uptodate) {
+ if (bi->bi_error) {
set_bit(STRIPE_DEGRADED, &sh->state);
set_bit(WriteErrorSeen, &rdev->flags);
set_bit(R5_WriteError, &sh->dev[i].flags);
@@ -2471,20 +2468,18 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
rdev_dec_pending(rdev, conf->mddev);
- if (sh->batch_head && !uptodate && !replacement)
+ if (sh->batch_head && bi->bi_error && !replacement)
set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
if (sh->batch_head && sh != sh->batch_head)
- release_stripe(sh->batch_head);
+ raid5_release_stripe(sh->batch_head);
}
-static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
@@ -2500,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
dev->rreq.bi_private = sh;
dev->flags = 0;
- dev->sector = compute_blocknr(sh, i, previous);
+ dev->sector = raid5_compute_blocknr(sh, i, previous);
}
static void error(struct mddev *mddev, struct md_rdev *rdev)
@@ -2519,6 +2514,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ set_bit(MD_CHANGE_PENDING, &mddev->flags);
printk(KERN_ALERT
"md/raid:%s: Disk failure on %s, disabling device.\n"
"md/raid:%s: Operation continuing on %d devices.\n",
@@ -2532,9 +2528,9 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
* Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them.
*/
-static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
- int previous, int *dd_idx,
- struct stripe_head *sh)
+sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+ int previous, int *dd_idx,
+ struct stripe_head *sh)
{
sector_t stripe, stripe2;
sector_t chunk_number;
@@ -2734,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
return new_sector;
}
-static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
+sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
{
struct r5conf *conf = sh->raid_conf;
int raid_disks = sh->disks;
@@ -3071,7 +3067,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
static void
handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
struct stripe_head_state *s, int disks,
- struct bio **return_bi)
+ struct bio_list *return_bi)
{
int i;
BUG_ON(sh->batch_head);
@@ -3106,17 +3102,19 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
+ r5l_stripe_write_finished(sh);
+
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
- bi->bi_next = *return_bi;
- *return_bi = bi;
+ bio_list_add(return_bi, bi);
}
bi = nextbi;
}
@@ -3136,11 +3134,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ bi->bi_error = -EIO;
if (!raid5_dec_bi_active_stripes(bi)) {
md_write_end(conf->mddev);
- bi->bi_next = *return_bi;
- *return_bi = bi;
+ bio_list_add(return_bi, bi);
}
bi = bi2;
}
@@ -3149,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
* the data has not reached the cache yet.
*/
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
+ s->failed > conf->max_degraded &&
(!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags))) {
spin_lock_irq(&sh->stripe_lock);
@@ -3157,15 +3156,16 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
+ if (bi)
+ s->to_read--;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
- if (!raid5_dec_bi_active_stripes(bi)) {
- bi->bi_next = *return_bi;
- *return_bi = bi;
- }
+
+ bi->bi_error = -EIO;
+ if (!raid5_dec_bi_active_stripes(bi))
+ bio_list_add(return_bi, bi);
bi = nextbi;
}
}
@@ -3177,6 +3177,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
*/
clear_bit(R5_LOCKED, &sh->dev[i].flags);
}
+ s->to_write = 0;
+ s->written = 0;
if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
if (atomic_dec_and_test(&conf->pending_full_writes))
@@ -3308,7 +3310,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
*/
return 0;
- for (i = 0; i < s->failed; i++) {
+ for (i = 0; i < s->failed && i < 2; i++) {
if (fdev[i]->towrite &&
!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
!test_bit(R5_OVERWRITE, &fdev[i]->flags))
@@ -3332,7 +3334,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
sh->sector < sh->raid_conf->mddev->recovery_cp)
/* reconstruct-write isn't being forced */
return 0;
- for (i = 0; i < s->failed; i++) {
+ for (i = 0; i < s->failed && i < 2; i++) {
if (s->failed_num[i] != sh->pd_idx &&
s->failed_num[i] != sh->qd_idx &&
!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
@@ -3444,7 +3446,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
* never LOCKED, so we don't need to test 'failed' directly.
*/
static void handle_stripe_clean_event(struct r5conf *conf,
- struct stripe_head *sh, int disks, struct bio **return_bi)
+ struct stripe_head *sh, int disks, struct bio_list *return_bi)
{
int i;
struct r5dev *dev;
@@ -3478,8 +3480,7 @@ returnbi:
wbi2 = r5_next_bio(wbi, dev->sector);
if (!raid5_dec_bi_active_stripes(wbi)) {
md_write_end(conf->mddev);
- wbi->bi_next = *return_bi;
- *return_bi = wbi;
+ bio_list_add(return_bi, wbi);
}
wbi = wbi2;
}
@@ -3503,8 +3504,12 @@ returnbi:
WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
WARN_ON(dev->page != dev->orig_page);
}
+
+ r5l_stripe_write_finished(sh);
+
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
+ int hash;
clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
if (sh->qd_idx >= 0) {
@@ -3518,16 +3523,17 @@ returnbi:
* no updated data, so remove it from hash list and the stripe
* will be reinitialized
*/
- spin_lock_irq(&conf->device_lock);
unhash:
+ hash = sh->hash_lock_index;
+ spin_lock_irq(conf->hash_locks + hash);
remove_hash(sh);
+ spin_unlock_irq(conf->hash_locks + hash);
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head, batch_list);
if (sh != head_sh)
goto unhash;
}
- spin_unlock_irq(&conf->device_lock);
sh = head_sh;
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
@@ -3943,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
struct stripe_head *sh2;
struct async_submit_ctl submit;
- sector_t bn = compute_blocknr(sh, i, 1);
+ sector_t bn = raid5_compute_blocknr(sh, i, 1);
sector_t s = raid5_compute_sector(conf, bn, 0,
&dd_idx, NULL);
- sh2 = get_active_stripe(conf, s, 0, 1, 1);
+ sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
if (sh2 == NULL)
/* so far only the early blocks of this stripe
* have been requested. When later blocks
@@ -3956,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
/* must have already done this block */
- release_stripe(sh2);
+ raid5_release_stripe(sh2);
continue;
}
@@ -3977,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
set_bit(STRIPE_EXPAND_READY, &sh2->state);
set_bit(STRIPE_HANDLE, &sh2->state);
}
- release_stripe(sh2);
+ raid5_release_stripe(sh2);
}
/* done submitting copies, wait for them to complete */
@@ -4012,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
s->failed_num[0] = -1;
s->failed_num[1] = -1;
+ s->log_failed = r5l_log_disk_error(conf);
/* Now to look around and see what can be done */
rcu_read_lock();
@@ -4263,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
if (handle_flags == 0 ||
sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
spin_lock_irq(&head_sh->stripe_lock);
head_sh->batch_head = NULL;
@@ -4324,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh)
analyse_stripe(sh, &s);
+ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
+ goto finish;
+
if (s.handle_bad_blocks) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
@@ -4352,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh)
/* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
*/
- if (s.failed > conf->max_degraded) {
+ if (s.failed > conf->max_degraded || s.log_failed) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
@@ -4510,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh)
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
struct stripe_head *sh_src
- = get_active_stripe(conf, sh->sector, 1, 1, 1);
+ = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
/* sh cannot be written until sh_src has been read.
* so arrange for sh to be delayed a little
@@ -4520,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
&sh_src->state))
atomic_inc(&conf->preread_active_stripes);
- release_stripe(sh_src);
+ raid5_release_stripe(sh_src);
goto finish;
}
if (sh_src)
- release_stripe(sh_src);
+ raid5_release_stripe(sh_src);
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
@@ -4612,7 +4622,15 @@ finish:
md_wakeup_thread(conf->mddev->thread);
}
- return_io(s.return_bi);
+ if (!bio_list_empty(&s.return_bi)) {
+ if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
+ spin_lock_irq(&conf->device_lock);
+ bio_list_merge(&conf->return_bi, &s.return_bi);
+ spin_unlock_irq(&conf->device_lock);
+ md_wakeup_thread(conf->mddev->thread);
+ } else
+ return_io(&s.return_bi);
+ }
clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
}
@@ -4669,43 +4687,14 @@ static int raid5_congested(struct mddev *mddev, int bits)
return 0;
}
-/* We want read requests to align with chunks where possible,
- * but write requests don't need to.
- */
-static int raid5_mergeable_bvec(struct mddev *mddev,
- struct bvec_merge_data *bvm,
- struct bio_vec *biovec)
-{
- sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
- int max;
- unsigned int chunk_sectors = mddev->chunk_sectors;
- unsigned int bio_sectors = bvm->bi_size >> 9;
-
- /*
- * always allow writes to be mergeable, read as well if array
- * is degraded as we'll go through stripe cache anyway.
- */
- if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
- return biovec->bv_len;
-
- if (mddev->new_chunk_sectors < mddev->chunk_sectors)
- chunk_sectors = mddev->new_chunk_sectors;
- max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
- if (max < 0) max = 0;
- if (max <= biovec->bv_len && bio_sectors == 0)
- return biovec->bv_len;
- else
- return max;
-}
-
static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
{
+ struct r5conf *conf = mddev->private;
sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
- unsigned int chunk_sectors = mddev->chunk_sectors;
+ unsigned int chunk_sectors;
unsigned int bio_sectors = bio_sectors(bio);
- if (mddev->new_chunk_sectors < mddev->chunk_sectors)
- chunk_sectors = mddev->new_chunk_sectors;
+ chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
return chunk_sectors >=
((sector & (chunk_sectors - 1)) + bio_sectors);
}
@@ -4756,13 +4745,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
* first).
* If the read failed..
*/
-static void raid5_align_endio(struct bio *bi, int error)
+static void raid5_align_endio(struct bio *bi)
{
struct bio* raid_bi = bi->bi_private;
struct mddev *mddev;
struct r5conf *conf;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
struct md_rdev *rdev;
+ int error = bi->bi_error;
bio_put(bi);
@@ -4773,10 +4762,10 @@ static void raid5_align_endio(struct bio *bi, int error)
rdev_dec_pending(rdev, conf->mddev);
- if (!error && uptodate) {
+ if (!error) {
trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
raid_bi, 0);
- bio_endio(raid_bi, 0);
+ bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
return;
@@ -4787,26 +4776,7 @@ static void raid5_align_endio(struct bio *bi, int error)
add_bio_to_retry(raid_bi, conf);
}
-static int bio_fits_rdev(struct bio *bi)
-{
- struct request_queue *q = bdev_get_queue(bi->bi_bdev);
-
- if (bio_sectors(bi) > queue_max_sectors(q))
- return 0;
- blk_recount_segments(q, bi);
- if (bi->bi_phys_segments > queue_max_segments(q))
- return 0;
-
- if (q->merge_bvec_fn)
- /* it's too hard to apply the merge_bvec_fn at this stage,
- * just just give up
- */
- return 0;
-
- return 1;
-}
-
-static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
+static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
{
struct r5conf *conf = mddev->private;
int dd_idx;
@@ -4815,7 +4785,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
sector_t end_sector;
if (!in_chunk_boundary(mddev, raid_bio)) {
- pr_debug("chunk_aligned_read : non aligned\n");
+ pr_debug("%s: non aligned\n", __func__);
return 0;
}
/*
@@ -4857,13 +4827,11 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
rcu_read_unlock();
raid_bio->bi_next = (void*)rdev;
align_bi->bi_bdev = rdev->bdev;
- __clear_bit(BIO_SEG_VALID, &align_bi->bi_flags);
+ bio_clear_flag(align_bi, BIO_SEG_VALID);
- if (!bio_fits_rdev(align_bi) ||
- is_badblock(rdev, align_bi->bi_iter.bi_sector,
+ if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
bio_sectors(align_bi),
&first_bad, &bad_sectors)) {
- /* too big in some way, or has a known bad block */
bio_put(align_bi);
rdev_dec_pending(rdev, mddev);
return 0;
@@ -4892,6 +4860,31 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
}
}
+static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
+{
+ struct bio *split;
+
+ do {
+ sector_t sector = raid_bio->bi_iter.bi_sector;
+ unsigned chunk_sects = mddev->chunk_sectors;
+ unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
+
+ if (sectors < bio_sectors(raid_bio)) {
+ split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
+ bio_chain(split, raid_bio);
+ } else
+ split = raid_bio;
+
+ if (!raid5_read_one_chunk(mddev, split)) {
+ if (split != raid_bio)
+ generic_make_request(raid_bio);
+ return split;
+ }
+ } while (split != raid_bio);
+
+ return NULL;
+}
+
/* __get_priority_stripe - get the next stripe to process
*
* Full stripe writes are allowed to pass preread active stripes up until
@@ -5033,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev,
struct raid5_plug_cb *cb;
if (!blk_cb) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
return;
}
@@ -5049,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev,
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
list_add_tail(&sh->lru, &cb->list);
else
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
static void make_discard_request(struct mddev *mddev, struct bio *bi)
@@ -5084,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
DEFINE_WAIT(w);
int d;
again:
- sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+ sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
goto again;
}
@@ -5101,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
if (sh->dev[d].towrite || sh->dev[d].toread) {
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
goto again;
}
@@ -5140,7 +5133,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
remaining = raid5_dec_bi_active_stripes(bi);
if (remaining == 0) {
md_write_end(mddev);
- bio_endio(bi, 0);
+ bio_endio(bi);
}
}
@@ -5157,8 +5150,15 @@ static void make_request(struct mddev *mddev, struct bio * bi)
bool do_prepare;
if (unlikely(bi->bi_rw & REQ_FLUSH)) {
- md_flush_request(mddev, bi);
- return;
+ int ret = r5l_handle_flush_request(conf->log, bi);
+
+ if (ret == 0)
+ return;
+ if (ret == -ENODEV) {
+ md_flush_request(mddev, bi);
+ return;
+ }
+ /* ret == -EAGAIN, fallback */
}
md_write_start(mddev, bi);
@@ -5169,9 +5169,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* data on failed drives.
*/
if (rw == READ && mddev->degraded == 0 &&
- mddev->reshape_position == MaxSector &&
- chunk_aligned_read(mddev,bi))
- return;
+ mddev->reshape_position == MaxSector) {
+ bi = chunk_aligned_read(mddev, bi);
+ if (!bi)
+ return;
+ }
if (unlikely(bi->bi_rw & REQ_DISCARD)) {
make_discard_request(mddev, bi);
@@ -5229,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
(unsigned long long)new_sector,
(unsigned long long)logical_sector);
- sh = get_active_stripe(conf, new_sector, previous,
+ sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_rw&RWA_MASK), 0);
if (sh) {
if (unlikely(previous)) {
@@ -5250,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
must_retry = 1;
spin_unlock_irq(&conf->device_lock);
if (must_retry) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
@@ -5260,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi)
/* Might have got the wrong stripe_head
* by accident
*/
- release_stripe(sh);
+ raid5_release_stripe(sh);
goto retry;
}
if (rw == WRITE &&
logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
- release_stripe(sh);
+ raid5_release_stripe(sh);
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
@@ -5290,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* and wait a while
*/
md_wakeup_thread(mddev->thread);
- release_stripe(sh);
+ raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
@@ -5304,7 +5306,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
release_stripe_plug(mddev, sh);
} else {
/* cannot get stripe for read-ahead, just give-up */
- clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ bi->bi_error = -EIO;
break;
}
}
@@ -5318,7 +5320,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
bi, 0);
- bio_endio(bi, 0);
+ bio_endio(bi);
}
}
@@ -5347,6 +5349,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sector_t stripe_addr;
int reshape_sectors;
struct list_head stripes;
+ sector_t retn;
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
@@ -5354,6 +5357,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
+ } else if (mddev->reshape_backwards &&
+ conf->reshape_progress == MaxSector) {
+ /* shouldn't happen, but just in case, finish up.*/
+ sector_nr = MaxSector;
} else if (!mddev->reshape_backwards &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
@@ -5362,7 +5369,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
mddev->curr_resync_completed = sector_nr;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
*skipped = 1;
- return sector_nr;
+ retn = sector_nr;
+ goto finish;
}
}
@@ -5370,10 +5378,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* If old and new chunk sizes differ, we need to process the
* largest of these
*/
- if (mddev->new_chunk_sectors > mddev->chunk_sectors)
- reshape_sectors = mddev->new_chunk_sectors;
- else
- reshape_sectors = mddev->chunk_sectors;
+
+ reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
/* We update the metadata at least every 10 seconds, or when
* the data about to be copied would over-write the source of
@@ -5388,11 +5394,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
if (mddev->reshape_backwards) {
- writepos -= min_t(sector_t, reshape_sectors, writepos);
+ BUG_ON(writepos < reshape_sectors);
+ writepos -= reshape_sectors;
readpos += reshape_sectors;
safepos += reshape_sectors;
} else {
writepos += reshape_sectors;
+ /* readpos and safepos are worst-case calculations.
+ * A negative number is overly pessimistic, and causes
+ * obvious problems for unsigned storage. So clip to 0.
+ */
readpos -= min_t(sector_t, reshape_sectors, readpos);
safepos -= min_t(sector_t, reshape_sectors, safepos);
}
@@ -5468,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j;
int skipped_disk = 0;
- sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
+ sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old
@@ -5481,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (conf->level == 6 &&
j == sh->qd_idx)
continue;
- s = compute_blocknr(sh, j, 0);
+ s = raid5_compute_blocknr(sh, j, 0);
if (s < raid5_size(mddev, 0, 0)) {
skipped_disk = 1;
continue;
@@ -5517,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1;
while (first_sector <= last_sector) {
- sh = get_active_stripe(conf, first_sector, 1, 0, 1);
+ sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
+ raid5_release_stripe(sh);
first_sector += STRIPE_SECTORS;
}
/* Now that the sources are clearly marked, we can release
@@ -5529,13 +5540,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
while (!list_empty(&stripes)) {
sh = list_entry(stripes.next, struct stripe_head, lru);
list_del_init(&sh->lru);
- release_stripe(sh);
+ raid5_release_stripe(sh);
}
/* If this takes us to the resync_max point where we have to pause,
* then we need to write out the superblock.
*/
sector_nr += reshape_sectors;
- if ((sector_nr - mddev->curr_resync_completed) * 2
+ retn = reshape_sectors;
+finish:
+ if (mddev->curr_resync_completed > mddev->resync_max ||
+ (sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
wait_event(conf->wait_for_overlap,
@@ -5560,7 +5574,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
ret:
- return reshape_sectors;
+ return retn;
}
static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
@@ -5622,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}
- bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
- sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
+ sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
if (sh == NULL) {
- sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
+ sh