summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorShaohua Li <shli@fb.com>2017-05-01 14:09:21 -0700
committerShaohua Li <shli@fb.com>2017-05-01 14:09:21 -0700
commite265eb3a30543a237b2ebc4e0422ac82e55b07e4 (patch)
tree5485bce4a0645e5e9b6ef4686bd390b7b2599ffb
parent85724edecbdc19f53ed4b902fc3a32e4d1b61c9b (diff)
parentb506335e5d2b4ec687dde392a3bdbf7601778f1d (diff)
Merge branch 'md-next' into md-linus
-rw-r--r--Documentation/admin-guide/md.rst32
-rw-r--r--Documentation/md/md-cluster.txt2
-rw-r--r--Documentation/md/raid5-ppl.txt44
-rw-r--r--block/bio.c61
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/bitmap.c59
-rw-r--r--drivers/md/bitmap.h3
-rw-r--r--drivers/md/linear.c75
-rw-r--r--drivers/md/md-cluster.c223
-rw-r--r--drivers/md/md-cluster.h1
-rw-r--r--drivers/md/md.c414
-rw-r--r--drivers/md/md.h71
-rw-r--r--drivers/md/raid0.c78
-rw-r--r--drivers/md/raid1.c679
-rw-r--r--drivers/md/raid1.h13
-rw-r--r--drivers/md/raid10.c736
-rw-r--r--drivers/md/raid10.h1
-rw-r--r--drivers/md/raid5-cache.c362
-rw-r--r--drivers/md/raid5-log.h115
-rw-r--r--drivers/md/raid5-ppl.c1271
-rw-r--r--drivers/md/raid5.c643
-rw-r--r--drivers/md/raid5.h106
-rw-r--r--include/linux/bio.h11
-rw-r--r--include/linux/percpu-refcount.h1
-rw-r--r--include/uapi/linux/raid/md_p.h45
-rw-r--r--lib/percpu-refcount.c17
26 files changed, 3582 insertions, 1483 deletions
diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index 1e61bf50595c..84de718f24a4 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -276,14 +276,14 @@ All md devices contain:
array creation it will default to 0, though starting the array as
``clean`` will set it much larger.
- new_dev
+ new_dev
This file can be written but not read. The value written should
be a block device number as major:minor. e.g. 8:0
This will cause that device to be attached to the array, if it is
available. It will then appear at md/dev-XXX (depending on the
name of the device) and further configuration is then possible.
- safe_mode_delay
+ safe_mode_delay
When an md array has seen no write requests for a certain period
of time, it will be marked as ``clean``. When another write
request arrives, the array is marked as ``dirty`` before the write
@@ -292,7 +292,7 @@ All md devices contain:
period as a number of seconds. The default is 200msec (0.200).
Writing a value of 0 disables safemode.
- array_state
+ array_state
This file contains a single word which describes the current
state of the array. In many cases, the state can be set by
writing the word for the desired state, however some states
@@ -401,7 +401,30 @@ All md devices contain:
once the array becomes non-degraded, and this fact has been
recorded in the metadata.
+ consistency_policy
+ This indicates how the array maintains consistency in case of unexpected
+ shutdown. It can be:
+ none
+ Array has no redundancy information, e.g. raid0, linear.
+
+ resync
+ Full resync is performed and all redundancy is regenerated when the
+ array is started after unclean shutdown.
+
+ bitmap
+ Resync assisted by a write-intent bitmap.
+
+ journal
+ For raid4/5/6, journal device is used to log transactions and replay
+ after unclean shutdown.
+
+ ppl
+ For raid5 only, Partial Parity Log is used to close the write hole and
+ eliminate resync.
+
+ The accepted values when writing to this file are ``ppl`` and ``resync``,
+ used to enable and disable PPL.
As component devices are added to an md array, they appear in the ``md``
@@ -563,6 +586,9 @@ Each directory contains:
adds bad blocks without acknowledging them. This is largely
for testing.
+ ppl_sector, ppl_size
+ Location and size (in sectors) of the space used for Partial Parity Log
+ on this device.
An active md device will also contain an entry for each active device
diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
index 38883276d31c..2663d49dd8a0 100644
--- a/Documentation/md/md-cluster.txt
+++ b/Documentation/md/md-cluster.txt
@@ -321,4 +321,4 @@ The algorithm is:
There are somethings which are not supported by cluster MD yet.
-- update size and change array_sectors.
+- change array_sectors.
diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt
new file mode 100644
index 000000000000..127072b09363
--- /dev/null
+++ b/Documentation/md/raid5-ppl.txt
@@ -0,0 +1,44 @@
+Partial Parity Log
+
+Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
+addressed by PPL is that after a dirty shutdown, parity of a particular stripe
+may become inconsistent with data on other member disks. If the array is also
+in degraded state, there is no way to recalculate parity, because one of the
+disks is missing. This can lead to silent data corruption when rebuilding the
+array or using it is as degraded - data calculated from parity for array blocks
+that have not been touched by a write request during the unclean shutdown can
+be incorrect. Such condition is known as the RAID5 Write Hole. Because of
+this, md by default does not allow starting a dirty degraded array.
+
+Partial parity for a write operation is the XOR of stripe data chunks not
+modified by this write. It is just enough data needed for recovering from the
+write hole. XORing partial parity with the modified chunks produces parity for
+the stripe, consistent with its state before the write operation, regardless of
+which chunk writes have completed. If one of the not modified data disks of
+this stripe is missing, this updated parity can be used to recover its
+contents. PPL recovery is also performed when starting an array after an
+unclean shutdown and all disks are available, eliminating the need to resync
+the array. Because of this, using write-intent bitmap and PPL together is not
+supported.
+
+When handling a write request PPL writes partial parity before new data and
+parity are dispatched to disks. PPL is a distributed log - it is stored on
+array member drives in the metadata area, on the parity drive of a particular
+stripe. It does not require a dedicated journaling drive. Write performance is
+reduced by up to 30%-40% but it scales with the number of drives in the array
+and the journaling drive does not become a bottleneck or a single point of
+failure.
+
+Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
+not a true journal. It does not protect from losing in-flight data, only from
+silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
+performed for this stripe (parity is not updated). So it is possible to have
+arbitrary data in the written part of a stripe if that disk is lost. In such
+case the behavior is the same as in plain raid5.
+
+PPL is available for md version-1 metadata and external (specifically IMSM)
+metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
+
+Currently, volatile write-back cache should be disabled on all member drives
+when using PPL. Otherwise it cannot guarantee consistency in case of power
+failure.
diff --git a/block/bio.c b/block/bio.c
index f4d207180266..888e7801c638 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -633,20 +633,21 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
}
EXPORT_SYMBOL(bio_clone_fast);
-static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs, int offset,
- int size)
+/**
+ * bio_clone_bioset - clone a bio
+ * @bio_src: bio to clone
+ * @gfp_mask: allocation priority
+ * @bs: bio_set to allocate from
+ *
+ * Clone bio. Caller will own the returned bio, but not the actual data it
+ * points to. Reference count of returned bio will be one.
+ */
+struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
+ struct bio_set *bs)
{
struct bvec_iter iter;
struct bio_vec bv;
struct bio *bio;
- struct bvec_iter iter_src = bio_src->bi_iter;
-
- /* for supporting partial clone */
- if (offset || size != bio_src->bi_iter.bi_size) {
- bio_advance_iter(bio_src, &iter_src, offset);
- iter_src.bi_size = size;
- }
/*
* Pre immutable biovecs, __bio_clone() used to just do a memcpy from
@@ -670,8 +671,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
* __bio_clone_fast() anyways.
*/
- bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src,
- &iter_src), bs);
+ bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
if (!bio)
return NULL;
bio->bi_bdev = bio_src->bi_bdev;
@@ -688,7 +688,7 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
break;
default:
- __bio_for_each_segment(bv, bio_src, iter, iter_src)
+ bio_for_each_segment(bv, bio_src, iter)
bio->bi_io_vec[bio->bi_vcnt++] = bv;
break;
}
@@ -707,44 +707,9 @@ static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
return bio;
}
-
-/**
- * bio_clone_bioset - clone a bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs)
-{
- return __bio_clone_bioset(bio_src, gfp_mask, bs, 0,
- bio_src->bi_iter.bi_size);
-}
EXPORT_SYMBOL(bio_clone_bioset);
/**
- * bio_clone_bioset_partial - clone a partial bio
- * @bio_src: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- * @offset: cloned starting from the offset
- * @size: size for the cloned bio
- *
- * Clone bio. Caller will own the returned bio, but not the actual data it
- * points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs, int offset,
- int size)
-{
- return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size);
-}
-EXPORT_SYMBOL(bio_clone_bioset_partial);
-
-/**
* bio_add_pc_page - attempt to add page to bio
* @q: the target queue
* @bio: destination bio
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..4d48714ccc6b 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
-raid456-y += raid5.o raid5-cache.o
+raid456-y += raid5.o raid5-cache.o raid5-ppl.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9fb2ccac958a..bf7419a56454 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -471,6 +471,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
kunmap_atomic(sb);
write_page(bitmap, bitmap->storage.sb_page, 1);
}
+EXPORT_SYMBOL(bitmap_update_sb);
/* print out the bitmap file superblock */
void bitmap_print_sb(struct bitmap *bitmap)
@@ -696,7 +697,7 @@ re_read:
out:
kunmap_atomic(sb);
- /* Assiging chunksize is required for "re_read" */
+ /* Assigning chunksize is required for "re_read" */
bitmap->mddev->bitmap_info.chunksize = chunksize;
if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
err = md_setup_cluster(bitmap->mddev, nodes);
@@ -1727,7 +1728,7 @@ void bitmap_flush(struct mddev *mddev)
/*
* free memory that was allocated
*/
-static void bitmap_free(struct bitmap *bitmap)
+void bitmap_free(struct bitmap *bitmap)
{
unsigned long k, pages;
struct bitmap_page *bp;
@@ -1761,6 +1762,21 @@ static void bitmap_free(struct bitmap *bitmap)
kfree(bp);
kfree(bitmap);
}
+EXPORT_SYMBOL(bitmap_free);
+
+void bitmap_wait_behind_writes(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ /* wait for behind writes to complete */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
+ /* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+}
void bitmap_destroy(struct mddev *mddev)
{
@@ -1769,6 +1785,8 @@ void bitmap_destroy(struct mddev *mddev)
if (!bitmap) /* there was no bitmap */
return;
+ bitmap_wait_behind_writes(mddev);
+
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
mddev->bitmap = NULL; /* disconnect from the md device */
@@ -1920,6 +1938,27 @@ out:
}
EXPORT_SYMBOL_GPL(bitmap_load);
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
+{
+ int rv = 0;
+ struct bitmap *bitmap;
+
+ bitmap = bitmap_create(mddev, slot);
+ if (IS_ERR(bitmap)) {
+ rv = PTR_ERR(bitmap);
+ return ERR_PTR(rv);
+ }
+
+ rv = bitmap_init_from_disk(bitmap, 0);
+ if (rv) {
+ bitmap_free(bitmap);
+ return ERR_PTR(rv);
+ }
+
+ return bitmap;
+}
+EXPORT_SYMBOL(get_bitmap_from_slot);
+
/* Loads the bitmap associated with slot and copies the resync information
* to our bitmap
*/
@@ -1929,14 +1968,13 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
int rv = 0, i, j;
sector_t block, lo = 0, hi = 0;
struct bitmap_counts *counts;
- struct bitmap *bitmap = bitmap_create(mddev, slot);
-
- if (IS_ERR(bitmap))
- return PTR_ERR(bitmap);
+ struct bitmap *bitmap;
- rv = bitmap_init_from_disk(bitmap, 0);
- if (rv)
- goto err;
+ bitmap = get_bitmap_from_slot(mddev, slot);
+ if (IS_ERR(bitmap)) {
+ pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
+ return -1;
+ }
counts = &bitmap->counts;
for (j = 0; j < counts->chunks; j++) {
@@ -1963,8 +2001,7 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
bitmap_unplug(mddev->bitmap);
*low = lo;
*high = hi;
-err:
- bitmap_free(bitmap);
+
return rv;
}
EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 5b6dd63dda91..d15721ac07a6 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -267,8 +267,11 @@ void bitmap_daemon_work(struct mddev *mddev);
int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int chunksize, int init);
+struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
int bitmap_copy_from_slot(struct mddev *mddev, int slot,
sector_t *lo, sector_t *hi, bool clear_bits);
+void bitmap_free(struct bitmap *bitmap);
+void bitmap_wait_behind_writes(struct mddev *mddev);
#endif
#endif
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 377a8a3672e3..df6f2c98eca7 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -249,54 +249,49 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
{
char b[BDEVNAME_SIZE];
struct dev_info *tmp_dev;
- struct bio *split;
sector_t start_sector, end_sector, data_offset;
+ sector_t bio_sector = bio->bi_iter.bi_sector;
if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
md_flush_request(mddev, bio);
return;
}
- do {
- sector_t bio_sector = bio->bi_iter.bi_sector;
- tmp_dev = which_dev(mddev, bio_sector);
- start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
- end_sector = tmp_dev->end_sector;
- data_offset = tmp_dev->rdev->data_offset;
- bio->bi_bdev = tmp_dev->rdev->bdev;
-
- if (unlikely(bio_sector >= end_sector ||
- bio_sector < start_sector))
- goto out_of_bounds;
-
- if (unlikely(bio_end_sector(bio) > end_sector)) {
- /* This bio crosses a device boundary, so we have to
- * split it.
- */
- split = bio_split(bio, end_sector - bio_sector,
- GFP_NOIO, fs_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
+ tmp_dev = which_dev(mddev, bio_sector);
+ start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+ end_sector = tmp_dev->end_sector;
+ data_offset = tmp_dev->rdev->data_offset;
+
+ if (unlikely(bio_sector >= end_sector ||
+ bio_sector < start_sector))
+ goto out_of_bounds;
+
+ if (unlikely(bio_end_sector(bio) > end_sector)) {
+ /* This bio crosses a device boundary, so we have to split it */
+ struct bio *split = bio_split(bio, end_sector - bio_sector,
+ GFP_NOIO, mddev->bio_set);
+ bio_chain(split, bio);
+ generic_make_request(bio);
+ bio = split;
+ }
- split->bi_iter.bi_sector = split->bi_iter.bi_sector -
- start_sector + data_offset;
-
- if (unlikely((bio_op(split) == REQ_OP_DISCARD) &&
- !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
- /* Just ignore it */
- bio_endio(split);
- } else {
- if (mddev->gendisk)
- trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
- split, disk_devt(mddev->gendisk),
- bio_sector);
- mddev_check_writesame(mddev, split);
- mddev_check_write_zeroes(mddev, split);
- generic_make_request(split);
- }
- } while (split != bio);
+ bio->bi_bdev = tmp_dev->rdev->bdev;
+ bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
+ start_sector + data_offset;
+
+ if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+ !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+ /* Just ignore it */
+ bio_endio(bio);
+ } else {
+ if (mddev->gendisk)
+ trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+ bio, disk_devt(mddev->gendisk),
+ bio_sector);
+ mddev_check_writesame(mddev, bio);
+ mddev_check_write_zeroes(mddev, bio);
+ generic_make_request(bio);
+ }
return;
out_of_bounds:
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 321ecac23027..7299ce2f08a8 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -67,9 +67,10 @@ struct resync_info {
* set up all the related infos such as bitmap and personality */
#define MD_CLUSTER_ALREADY_IN_CLUSTER 6
#define MD_CLUSTER_PENDING_RECV_EVENT 7
-
+#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8
struct md_cluster_info {
+ struct mddev *mddev; /* the md device which md_cluster_info belongs to */
/* dlm lock space and resources for clustered raid. */
dlm_lockspace_t *lockspace;
int slot_number;
@@ -103,6 +104,7 @@ enum msg_type {
REMOVE,
RE_ADD,
BITMAP_NEEDS_SYNC,
+ CHANGE_CAPACITY,
};
struct cluster_msg {
@@ -523,11 +525,17 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
+ int got_lock = 0;
struct md_cluster_info *cinfo = mddev->cluster_info;
mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
- set_bit(MD_RELOAD_SB, &mddev->flags);
+
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
- md_wakeup_thread(mddev->thread);
+ wait_event(mddev->thread->wqueue,
+ (got_lock = mddev_trylock(mddev)) ||
+ test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
+ md_reload_sb(mddev, mddev->good_device_nr);
+ if (got_lock)
+ mddev_unlock(mddev);
}
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
@@ -572,6 +580,10 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
case METADATA_UPDATED:
process_metadata_update(mddev, msg);
break;
+ case CHANGE_CAPACITY:
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ break;
case RESYNCING:
process_suspend_info(mddev, le32_to_cpu(msg->slot),
le64_to_cpu(msg->low),
@@ -646,11 +658,29 @@ out:
* Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway.
*/
-static int lock_token(struct md_cluster_info *cinfo)
+static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
{
- int error;
+ int error, set_bit = 0;
+ struct mddev *mddev = cinfo->mddev;
+ /*
+ * If resync thread run after raid1d thread, then process_metadata_update
+ * could not continue if raid1d held reconfig_mutex (and raid1d is blocked
+ * since another node already got EX on Token and waitting the EX of Ack),
+ * so let resync wake up thread in case flag is set.
+ */
+ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ &cinfo->state)) {
+ error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ &cinfo->state);
+ WARN_ON_ONCE(error);
+ md_wakeup_thread(mddev->thread);
+ set_bit = 1;
+ }
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+ if (set_bit)
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+
if (error)
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
__func__, __LINE__, error);
@@ -663,12 +693,12 @@ static int lock_token(struct md_cluster_info *cinfo)
/* lock_comm()
* Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
*/
-static int lock_comm(struct md_cluster_info *cinfo)
+static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
{
wait_event(cinfo->wait,
!test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
- return lock_token(cinfo);
+ return lock_token(cinfo, mddev_locked);
}
static void unlock_comm(struct md_cluster_info *cinfo)
@@ -743,11 +773,12 @@ failed_message:
return error;
}
-static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
+static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
+ bool mddev_locked)
{
int ret;
- lock_comm(cinfo);
+ lock_comm(cinfo, mddev_locked);
ret = __sendmsg(cinfo, cmsg);
unlock_comm(cinfo);
return ret;
@@ -834,6 +865,7 @@ static int join(struct mddev *mddev, int nodes)
mutex_init(&cinfo->recv_mutex);
mddev->cluster_info = cinfo;
+ cinfo->mddev = mddev;
memset(str, 0, 64);
sprintf(str, "%pU", mddev->uuid);
@@ -908,6 +940,7 @@ static int join(struct mddev *mddev, int nodes)
return 0;
err:
+ set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
@@ -943,7 +976,7 @@ static void resync_bitmap(struct mddev *mddev)
int err;
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
- err = sendmsg(cinfo, &cmsg);
+ err = sendmsg(cinfo, &cmsg, 1);
if (err)
pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
__func__, __LINE__, err);
@@ -963,6 +996,7 @@ static int leave(struct mddev *mddev)
if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
resync_bitmap(mddev);
+ set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
@@ -997,16 +1031,30 @@ static int slot_number(struct mddev *mddev)
static int metadata_update_start(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
+ int ret;
+
+ /*
+ * metadata_update_start is always called with the protection of
+ * reconfig_mutex, so set WAITING_FOR_TOKEN here.
+ */
+ ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
+ &cinfo->state);
+ WARN_ON_ONCE(ret);
+ md_wakeup_thread(mddev->thread);
wait_event(cinfo->wait,
!test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));
/* If token is already locked, return 0 */
- if (cinfo->token_lockres->mode == DLM_LOCK_EX)
+ if (cinfo->token_lockres->mode == DLM_LOCK_EX) {
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
return 0;
+ }
- return lock_token(cinfo);
+ ret = lock_token(cinfo, 1);
+ clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
+ return ret;
}
static int metadata_update_finish(struct mddev *mddev)
@@ -1043,6 +1091,141 @@ static void metadata_update_cancel(struct mddev *mddev)
unlock_comm(cinfo);
}
+/*
+ * return 0 if all the bitmaps have the same sync_size
+ */
+int cluster_check_sync_size(struct mddev *mddev)
+{
+ int i, rv;
+ bitmap_super_t *sb;
+ unsigned long my_sync_size, sync_size = 0;
+ int node_num = mddev->bitmap_info.nodes;
+ int current_slot = md_cluster_ops->slot_number(mddev);
+ struct bitmap *bitmap = mddev->bitmap;
+ char str[64];
+ struct dlm_lock_resource *bm_lockres;
+
+ sb = kmap_atomic(bitmap->storage.sb_page);
+ my_sync_size = sb->sync_size;
+ kunmap_atomic(sb);
+
+ for (i = 0; i < node_num; i++) {
+ if (i == current_slot)
+ continue;
+
+ bitmap = get_bitmap_from_slot(mddev, i);
+ if (IS_ERR(bitmap)) {
+ pr_err("can't get bitmap from slot %d\n", i);
+ return -1;
+ }
+
+ /*
+ * If we can hold the bitmap lock of one node then
+ * the slot is not occupied, update the sb.
+ */
+ snprintf(str, 64, "bitmap%04d", i);
+ bm_lockres = lockres_init(mddev, str, NULL, 1);
+ if (!bm_lockres) {
+ pr_err("md-cluster: Cannot initialize %s\n", str);
+ bitmap_free(bitmap);
+ return -1;
+ }
+ bm_lockres->flags |= DLM_LKF_NOQUEUE;
+ rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+ if (!rv)
+ bitmap_update_sb(bitmap);
+ lockres_free(bm_lockres);
+
+ sb = kmap_atomic(bitmap->storage.sb_page);
+ if (sync_size == 0)
+ sync_size = sb->sync_size;
+ else if (sync_size != sb->sync_size) {
+ kunmap_atomic(sb);
+ bitmap_free(bitmap);
+ return -1;
+ }
+ kunmap_atomic(sb);
+ bitmap_free(bitmap);
+ }
+
+ return (my_sync_size == sync_size) ? 0 : -1;
+}
+
+/*
+ * Update the size for cluster raid is a little more complex, we perform it
+ * by the steps:
+ * 1. hold token lock and update superblock in initiator node.
+ * 2. send METADATA_UPDATED msg to other nodes.
+ * 3. The initiator node continues to check each bitmap's sync_size, if all
+ * bitmaps have the same value of sync_size, then we can set capacity and
+ * let other nodes to perform it. If one node can't update sync_size
+ * accordingly, we need to revert to previous value.
+ */
+static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
+{
+ struct md_cluster_info *cinfo = mddev->cluster_info;
+ struct cluster_msg cmsg;
+ struct md_rdev *rdev;
+ int ret = 0;
+ int raid_slot = -1;
+
+ md_update_sb(mddev, 1);
+ lock_comm(cinfo, 1);
+
+ memset(&cmsg, 0, sizeof(cmsg));
+ cmsg.type = cpu_to_le32(METADATA_UPDATED);
+ rdev_for_each(rdev, mddev)
+ if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
+ raid_slot = rdev->desc_nr;
+ break;
+ }
+ if (raid_slot >= 0) {
+ cmsg.raid_slot = cpu_to_le32(raid_slot);
+ /*
+ * We can only change capiticy after all the nodes can do it,
+ * so need to wait after other nodes already received the msg
+ * and handled the change
+ */
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret) {
+ pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+ __func__, __LINE__);
+ unlock_comm(cinfo);
+ return;
+ }
+ } else {
+ pr_err("md-cluster: No good device id found to send\n");
+ unlock_comm(cinfo);
+ return;
+ }
+
+ /*
+ * check the sync_size from other node's bitmap, if sync_size
+ * have already updated in other nodes as expected, send an
+ * empty metadata msg to permit the change of capacity
+ */
+ if (cluster_check_sync_size(mddev) == 0) {
+ memset(&cmsg, 0, sizeof(cmsg));
+ cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret)
+ pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
+ __func__, __LINE__);
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ } else {
+ /* revert to previous sectors */
+ ret = mddev->pers->resize(mddev, old_dev_sectors);
+ if (!ret)
+ revalidate_disk(mddev->gendisk);
+ ret = __sendmsg(cinfo, &cmsg);
+ if (ret)
+ pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
+ __func__, __LINE__);
+ }
+ unlock_comm(cinfo);
+}
+
static int resync_start(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -1069,7 +1252,14 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
cmsg.low = cpu_to_le64(lo);
cmsg.high = cpu_to_le64(hi);
- return sendmsg(cinfo, &cmsg);
+ /*
+ * mddev_lock is held if resync_info_update is called from
+ * resync_finish (md_reap_sync_thread -> resync_finish)
+ */
+ if (lo == 0 && hi == 0)
+ return sendmsg(cinfo, &cmsg, 1);
+ else
+ return sendmsg(cinfo, &cmsg, 0);
}
static int resync_finish(struct mddev *mddev)
@@ -1119,7 +1309,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- lock_comm(cinfo);
+ lock_comm(cinfo, 1);
ret = __sendmsg(cinfo, &cmsg);
if (ret)
return ret;
@@ -1179,7 +1369,7 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct md_cluster_info *cinfo = mddev->cluster_info;
cmsg.type = cpu_to_le32(REMOVE);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- return sendmsg(cinfo, &cmsg);
+ return sendmsg(cinfo, &cmsg, 1);
}
static int lock_all_bitmaps(struct mddev *mddev)
@@ -1243,7 +1433,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
cmsg.type = cpu_to_le32(RE_ADD);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
- err = sendmsg(cinfo, &cmsg);
+ err = sendmsg(cinfo, &cmsg, 1);
if (err)
goto out;
@@ -1281,6 +1471,7 @@ static struct md_cluster_operations cluster_ops = {
.gather_bitmaps = gather_bitmaps,
.lock_all_bitmaps = lock_all_bitmaps,
.unlock_all_bitmaps = unlock_all_bitmaps,
+ .update_size = update_size,
};
static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index e765499ba591..274016177983 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -27,6 +27,7 @@ struct md_cluster_operations {
int (*gather_bitmaps)(struct md_rdev *rdev);
int (*lock_all_bitmaps)(struct mdde